import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Create your df here:
df = pd.read_csv("capstone_starter/profiles.csv")
df.head()
| | age | body_type | diet | drinks | drugs | education | essay0 | essay1 | essay2 | essay3 | ... | location | offspring | orientation | pets | religion | sex | sign | smokes | speaks | status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22 | a little extra | strictly anything | socially | never | working on college/university | about me:<br />\n<br />\ni would love to think... | currently working as an international agent fo... | making people laugh.<br />\nranting about a go... | the way i look. i am a six foot half asian, ha... | ... | south san francisco, california | doesn’t have kids, but might want them | straight | likes dogs and likes cats | agnosticism and very serious about it | m | gemini | sometimes | english | single |
| 1 | 35 | average | mostly other | often | sometimes | working on space camp | i am a chef: this is what that means.<br />\n1... | dedicating everyday to being an unbelievable b... | being silly. having ridiculous amonts of fun w... | NaN | ... | oakland, california | doesn’t have kids, but might want them | straight | likes dogs and likes cats | agnosticism but not too serious about it | m | cancer | no | english (fluently), spanish (poorly), french (... | single |
| 2 | 38 | thin | anything | socially | NaN | graduated from masters program | i'm not ashamed of much, but writing public te... | i make nerdy software for musicians, artists, ... | improvising in different contexts. alternating... | my large jaw and large glasses are the physica... | ... | san francisco, california | NaN | straight | has cats | NaN | m | pisces but it doesn’t matter | no | english, french, c++ | available |
| 3 | 23 | thin | vegetarian | socially | NaN | working on college/university | i work in a library and go to school. . . | reading things written by old dead people | playing synthesizers and organizing books acco... | socially awkward but i do my best | ... | berkeley, california | doesn’t want kids | straight | likes cats | NaN | m | pisces | no | english, german (poorly) | single |
| 4 | 29 | athletic | NaN | socially | never | graduated from college/university | hey how's it going? currently vague on the pro... | work work work work + play | creating imagery to look at:<br />\nhttp://bag... | i smile a lot and my inquisitive nature | ... | san francisco, california | NaN | straight | likes dogs and likes cats | NaN | m | aquarius | no | english | single |
5 rows × 31 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   age          59946 non-null  int64
 1   body_type    54650 non-null  object
 2   diet         35551 non-null  object
 3   drinks       56961 non-null  object
 4   drugs        45866 non-null  object
 5   education    53318 non-null  object
 6   essay0       54458 non-null  object
 7   essay1       52374 non-null  object
 8   essay2       50308 non-null  object
 9   essay3       48470 non-null  object
 10  essay4       49409 non-null  object
 11  essay5       49096 non-null  object
 12  essay6       46175 non-null  object
 13  essay7       47495 non-null  object
 14  essay8       40721 non-null  object
 15  essay9       47343 non-null  object
 16  ethnicity    54266 non-null  object
 17  height       59943 non-null  float64
 18  income       59946 non-null  int64
 19  job          51748 non-null  object
 20  last_online  59946 non-null  object
 21  location     59946 non-null  object
 22  offspring    24385 non-null  object
 23  orientation  59946 non-null  object
 24  pets         40025 non-null  object
 25  religion     39720 non-null  object
 26  sex          59946 non-null  object
 27  sign         48890 non-null  object
 28  smokes       54434 non-null  object
 29  speaks       59896 non-null  object
 30  status       59946 non-null  object
dtypes: float64(1), int64(2), object(28)
memory usage: 14.2+ MB
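# A compact way to quantify the gaps before deciding what to drop: the share
# of missing values per column, sparsest first (a minimal sketch; nothing
# here mutates df).
print(df.isna().mean().sort_values(ascending=False).head(10))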
# Let's look at the "age" column ...
plt.figure(figsize=(12, 6))
plt.hist(df.age, bins=20)
plt.xlabel('age')
plt.ylabel('frequency')
plt.xlim(16, 80)
plt.show()
# Let's dig a little bit into the 'job' column ...
df.job.head()
0                 transportation
1           hospitality / travel
2                            NaN
3                        student
4    artistic / musical / writer
Name: job, dtype: object
# We see that at least one of the rows has a NaN, so let's check how many missing values the "job" column has in the entire dataframe ...
nan_count = df['job'].isna().sum()
nan_count
8198
# let's drop those rows and check the dataframe again after handling the missing data ...
df = df.dropna(subset=['job'])
nan_count = df['job'].isna().sum()
nan_count
0
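# An alternative sketch (not applied here): keep those ~8,200 rows by filling
# the missing jobs with a sentinel category instead of dropping them; the
# label 'not specified' is invented for illustration. We demo it on a fresh
# copy so df itself is untouched.
df_alt = pd.read_csv("capstone_starter/profiles.csv")
df_alt['job'] = df_alt['job'].fillna('not specified')
print(df_alt['job'].isna().sum())  # 0 -- all rows retained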
import seaborn as sns
#sns.set(rc={'figure.figsize':(30,25)})
#sns.set(font_scale=1)
# Plot pairplot
sns.pairplot(df)
# Show the plot
plt.show()
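# A note on the pairplot above: seaborn only picks up the numeric columns
# (age, height, income in this frame), and rendering all ~52,000 rows is
# slow. A hedged, faster variant plots a random sample of just those columns.
sns.pairplot(df[['age', 'height', 'income']].dropna().sample(1000, random_state=42))
plt.show()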
df.job.value_counts()
other                                7589
student                              4882
science / tech / engineering         4848
computer / hardware / software       4709
artistic / musical / writer          4439
sales / marketing / biz dev          4391
medicine / health                    3680
education / academia                 3513
executive / management               2373
banking / financial / real estate    2266
entertainment / media                2250
law / legal services                 1381
hospitality / travel                 1364
construction / craftsmanship         1021
clerical / administrative             805
political / government                708
rather not say                        436
transportation                        366
unemployed                            273
retired                               250
military                              204
Name: job, dtype: int64
plt.style.use('dark_background')
df['job'].value_counts().plot(kind='pie', autopct="%1.1f%%")
plt.tight_layout()
plt.show()
# Let's also have a look at the "orientation" column ...
df.orientation.value_counts()
straight    44657
gay          4752
bisexual     2339
Name: orientation, dtype: int64
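# Worth flagging before any modeling: these classes are heavily imbalanced.
# A model that always answers 'straight' would already be right about 86% of
# the time (44657 / 51748), so that is the trivial baseline every classifier
# below has to beat.
print(df['orientation'].value_counts(normalize=True))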
sns.countplot(x='job', hue='orientation', data=df)
plt.xticks(rotation=90)
plt.show()
# Formulating a question: can one use the data provided to predict a person's
# orientation, and what might their "job" have to do with it, if anything?
# To start, let's map the strings in the "job" column to numbers and store
# the result in a new column ...
unique_jobs = df['job'].unique()
category_to_number = {job: i for i, job in enumerate(unique_jobs)}
df['job_code'] = df['job'].replace(category_to_number)
print(df['job_code'].head())
print(df['job_code'].value_counts())
0    0
1    1
3    2
4    3
5    4
Name: job_code, dtype: int64
8     7589
2     4882
10    4848
4     4709
3     4439
7     4391
9     3680
12    3513
11    2373
5     2266
6     2250
17    1381
1     1364
14    1021
13     805
16     708
15     436
0      366
18     273
20     250
19     204
Name: job_code, dtype: int64
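# A more idiomatic sketch of the same encoding: pd.factorize builds the codes
# and the code-to-label mapping in one call. Either way the integer order is
# arbitrary, so for distance-based models like the KNN below a one-hot
# encoding (pd.get_dummies) would arguably be the safer choice for a nominal
# category like job.
codes, labels = pd.factorize(df['job'])
print(labels[:5])  # labels[i] is the job string behind code i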
# And let's have a look at the values for the "sex" column too ...
df['sex'].value_counts()
m    31287
f    20461
Name: sex, dtype: int64
# Now let's convert the "m" and "f" values in that column to numbers as well,
# and do the same for the "orientation" column, which is the one we'll ask
# our models to make predictions about ...
sex_category_to_number = {
'm': 0,
'f': 1
}
orientation_category_to_number = {
'straight': 0,
'gay': 1,
'bisexual': 2
}
df['sex_code'] = df['sex'].replace(sex_category_to_number)
df['orientation_code'] = df['orientation'].replace(orientation_category_to_number)
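# A quick sanity check that each string landed on the intended integer:
print(df[['sex', 'sex_code']].drop_duplicates())
print(df[['orientation', 'orientation_code']].drop_duplicates())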
# Next let's pull in scikit-learn and normalize the data (docs for sklearn's MinMaxScaler are available at https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)
from sklearn.preprocessing import MinMaxScaler
# We'll start by seeing if one can predict the value of the "orientation" column using the data from the converted "job" and "sex" columns:
feature_data = df[['job_code', 'sex_code']]
x = feature_data.values
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# keep the original (non-contiguous) index so the features stay aligned with df
feature_data = pd.DataFrame(x_scaled, columns=feature_data.columns, index=feature_data.index)
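# A hedged note on methodology: fitting the scaler on the full feature set
# before splitting lets the held-out rows influence the learned min/max. A
# leak-free version fits on the training split only and reuses that fit, e.g.:
from sklearn.model_selection import train_test_split
X_tr, X_te = train_test_split(df[['job_code', 'sex_code']], test_size=0.2, random_state=42)
leakfree_scaler = MinMaxScaler().fit(X_tr)      # learn min/max from training rows only
X_tr_scaled = leakfree_scaler.transform(X_tr)   # apply the same transform to both splits
X_te_scaled = leakfree_scaler.transform(X_te)
# With two bounded integer columns the numbers barely move here, but it's the habit to build.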
# After we split the dataset we'll try a K-Nearest Neighbors (KNN) classifier (docs available at https://scikit-learn.org/stable/modules/neighbors.html)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(feature_data, df['orientation_code'], test_size=0.2, random_state=42)
n_neighbors_list = range(1, 201)
best_n_neighbors = None
best_score = 0
accuracy_scores = []  # renamed so we don't shadow sklearn's accuracy_score function
# loop over the values of n_neighbors and train a KNN classifier for each one
for n_neighbors in n_neighbors_list:
    # create a KNN classifier object with the current value of n_neighbors
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    # train the classifier on the training data
    knn.fit(X_train, y_train)
    # evaluate the performance of the classifier on the test data
    score = knn.score(X_test, y_test)
    accuracy_scores.append(score)
    # keep track of the best k seen so far
    if score > best_score:
        best_score = score
        best_n_neighbors = n_neighbors
print(f"best k: {best_n_neighbors}, accuracy: {best_score:.3f}")
plt.figure(figsize=(12, 6))
plt.plot(n_neighbors_list, accuracy_scores)
plt.xlabel("number of neighbors (k)")
plt.ylabel("accuracy score")
plt.show()
This model tops out at about 86% accuracy, with the best number of neighbors around 15. Note, though, that 86% is also the share of "straight" profiles in the data, so the classifier is doing little better than always predicting the majority class.
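A cleaner way to choose k than the hand-rolled loop above (which picks k on the test set, a subtly optimistic procedure) is cross-validation on the training data. A minimal sketch with GridSearchCV; the grid of 1-50 is an assumption, not a tuned choice:
from sklearn.model_selection import GridSearchCV
# 5-fold cross-validated search over k, scored on accuracy like the loop above
grid = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': list(range(1, 51))}, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)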
# What about a different algorithm, say Gaussian Naive Bayes (docs at https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes)?
from sklearn.naive_bayes import GaussianNB
# create a Gaussian Naive Bayes classifier object
nb = GaussianNB()
# train the classifier on the training data
nb.fit(X_train, y_train)
# evaluate the performance of the classifier on the test data
y_pred = nb.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

    bisexual       0.00      0.00      0.00       470
         gay       0.00      0.00      0.00       957
    straight       0.86      1.00      0.93      8923

    accuracy                           0.86     10350
   macro avg       0.29      0.33      0.31     10350
weighted avg       0.74      0.86      0.80     10350
UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
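# The rows of zeros (and the warning) are the giveaway: the model never
# predicts the minority classes at all. A confusion matrix makes that
# degenerate behavior explicit (rows are true classes, columns predictions,
# in sorted label order). Passing zero_division=0 to classification_report
# would silence the warning without changing the numbers.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))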
# What about a "Support Vectors Machine" algorithm (docs at https://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm)
from sklearn.svm import SVC
# create an SVM classifier object with a linear kernel
svm = SVC(kernel='linear')
# train the classifier on the training data
svm.fit(X_train, y_train)
# evaluate the performance of the classifier on the test data
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

    bisexual       0.00      0.00      0.00       470
         gay       0.00      0.00      0.00       957
    straight       0.86      1.00      0.93      8923

    accuracy                           0.86     10350
   macro avg       0.29      0.33      0.31     10350
weighted avg       0.74      0.86      0.80     10350
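# A practical aside: SVC with a linear kernel trains in roughly quadratic
# time in the number of rows, which gets slow on ~41,000 training samples.
# LinearSVC is the usual faster substitute for a linear kernel (max_iter=5000
# here is a guess, not a tuned value).
from sklearn.svm import LinearSVC
fast_svm = LinearSVC(max_iter=5000)
fast_svm.fit(X_train, y_train)
print(fast_svm.score(X_test, y_test))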
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# create a decision tree classifier object
dtc = DecisionTreeClassifier()
# train the model on the training data
dtc.fit(X_train, y_train)
# evaluate the performance of the model on the test data
y_pred = dtc.predict(X_test)
print(classification_report(y_test, y_pred))
# create a random forest classifier object
rfc = RandomForestClassifier()
# train the model on the training data
rfc.fit(X_train, y_train)
# evaluate the performance of the model on the test data
y_pred = rfc.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

    bisexual       0.00      0.00      0.00       470
         gay       0.00      0.00      0.00       957
    straight       0.86      1.00      0.93      8923

    accuracy                           0.86     10350
   macro avg       0.29      0.33      0.31     10350
weighted avg       0.74      0.86      0.80     10350

              precision    recall  f1-score   support

    bisexual       0.00      0.00      0.00       470
         gay       0.00      0.00      0.00       957
    straight       0.86      1.00      0.93      8923

    accuracy                           0.86     10350
   macro avg       0.29      0.33      0.31     10350
weighted avg       0.74      0.86      0.80     10350
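# Every model so far collapses to the majority class. One standard
# counter-move, sketched here rather than fully explored, is to reweight the
# classes: both tree models accept class_weight='balanced', which weights
# each class inversely to its frequency.
dtc_bal = DecisionTreeClassifier(class_weight='balanced')
dtc_bal.fit(X_train, y_train)
print(classification_report(y_test, dtc_bal.predict(X_test), zero_division=0))
# Overall accuracy will drop below 86%, but recall on the minority classes
# should finally move off zero; whether that trade is worth it depends on the question.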
from sklearn.model_selection import learning_curve
# Define the DTC model
dtc = DecisionTreeClassifier(max_depth=3)
# Generate the learning curve
train_sizes, train_scores, test_scores = learning_curve(dtc, X_train, y_train, cv=5)
# Calculate the mean and standard deviation of the training and testing scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
# Plot the learning curve
plt.plot(train_sizes, train_mean, label='Training score')
plt.plot(train_sizes, test_mean, label='Cross-validation score')
# Add error bars
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1)
# Add labels and legend
plt.xlabel('Training set size')
plt.ylabel('Accuracy score')
plt.title('Decision Tree Classifier Learning Curve')
plt.legend()
# Show the plot
plt.show()
# What if we added some features to our feature_data from the original dataset? Would that give us appreciably different results, still using the same KNN Classifier?
feature_data = df[['age', 'income', 'job_code', 'sex_code']]
# note: in this dataset income is reportedly -1 for users who didn't report
# it, which min-max scaling will simply keep as the column minimum
x = feature_data.values
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
feature_data = pd.DataFrame(x_scaled, columns=feature_data.columns, index=feature_data.index)
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(feature_data, df['orientation_code'], test_size=0.2, random_state=42)
n_neighbors_list = range(1, 201)
best_n_neighbors = None
best_score = 0
accuracy_scores = []
# loop over the values of n_neighbors and train a KNN classifier for each one
for n_neighbors in n_neighbors_list:
    # create a KNN classifier object with the current value of n_neighbors
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    # train the classifier on the training data
    knn.fit(X_train, y_train)
    # evaluate the performance of the classifier on the test data
    score = knn.score(X_test, y_test)
    accuracy_scores.append(score)
    # keep track of the best k seen so far
    if score > best_score:
        best_score = score
        best_n_neighbors = n_neighbors
print(f"best k: {best_n_neighbors}, accuracy: {best_score:.3f}")
plt.figure(figsize=(12, 6))
plt.plot(n_neighbors_list, accuracy_scores)
plt.xlabel("number of neighbors (k)")
plt.ylabel("accuracy score")
plt.show()
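# Accuracy alone can't tell us whether the extra features helped, since the
# ~86% majority baseline dominates it. A hedged follow-up: refit at a single
# k (15 matches the earlier eyeballed optimum; it is not a tuned value) and
# look at class-aware metrics instead.
from sklearn.metrics import balanced_accuracy_score
knn_best = KNeighborsClassifier(n_neighbors=15)
knn_best.fit(X_train, y_train)
y_pred = knn_best.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))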