import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%pylab inline
import sklearn
sns.set_style("whitegrid")
%run helper_functions.py
import pydotplus
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
# Load the survey data and run a quick exploratory pass over the raw columns.
nudge = pd.read_csv("master.csv")
nudge.head(10)

# Age distribution of respondents.
nudge.age.hist(bins=80)
plt.show()

# Mean approval of nudge item N1 broken down by political affiliation.
nudge.groupby('politics').N1.mean().plot(kind='bar')
plt.show()

nudge.politics.describe()
nudge.gender.describe()

# Gender split per N1 response.
# BUG FIX: the original called `sns.plt.show()` — seaborn no longer exposes a
# `plt` attribute, which raises AttributeError; call matplotlib directly.
# (The unused `ax =` assignment was dropped as well.)
sns.countplot(x="gender", hue="N1", data=nudge)
plt.show()

sns.boxplot(x='gender', y='age', data=nudge)
plt.show()

sns.kdeplot(nudge.age.dropna())
plt.show()

nudge.describe()
# Map the numeric country code to a label, then one-hot encode it.
nudge["origin"] = nudge.country.map({1: "USA", 2: "GER", 3: "DEN", 4: "SK", 5: "BE"})
nudge = pd.concat([nudge, pd.get_dummies(nudge.origin)], axis=1)
nudge = nudge.drop(["country", "origin"], axis=1)

# Aggregate the 15 individual nudge items into conceptual clusters NC1..NC5
# plus an overall score N (simple per-row means of the listed columns).
# skipna=False keeps NaN propagation identical to chaining the columns
# with `+`, as the original code did.
_clusters = {
    "NC1": ["N6", "N7", "N12"],
    "NC2": ["N1", "N10", "N2"],
    "NC3": ["N3", "N13", "N9", "N11", "N5", "N4"],
    "NC4": ["N8"],
    "NC5": ["N14", "N15"],
    "N": ["N%d" % i for i in range(1, 16)],
}
for _name, _cols in _clusters.items():
    nudge[_name] = nudge[_cols].sum(axis=1, skipna=False) / len(_cols)

# Respondents without a survey weight count with weight 1.
nudge['WEIGHT'] = nudge.WEIGHT.fillna(value=1)
nudge.describe()
# Shuffling the data set: assign a random key column 'rn' and sort on it.
# ('rn' is intentionally kept — it is dropped again further down.)
row = nudge.shape[0]
nudge['rn'] = np.random.randint(row + 100, size=row)
col = nudge.shape[1]
nudge = nudge.sort_values('rn')
nudge.head(10)
# BUG FIX: the original called `X.head()` here, but X is only defined further
# down in the file — that call raised a NameError and has been removed.
# Construction of relevant data set(s)
# Building of clusters
#X=nudge.dropna(axis=1, how='any')
# BMI from weight and height — height/100 suggests height is in cm; TODO confirm units.
nudge['bmi']=(nudge.weight)/((nudge.height/100)*(nudge.height/100))
# Feature matrix: drop the raw item responses, the shuffle key, and the raw
# weight/height (bmi replaces them).
X=nudge.drop(["N1","N2","N3","N4","N5","N6","N7","N8","N9","N10","N11","N12","N13","N14","N15","rn","weight", "height"], axis=1)
# Remove implausible BMI outliers BEFORE extracting the sample weights, so
# WEIGHT stays row-aligned with the remaining rows of X.
X=X.drop(X[X.bmi >=50].index)
X=X.drop(X[X.bmi <12].index)
WEIGHT=X.WEIGHT.values
X=X.drop(["WEIGHT"], axis=1)
# Nudge each score by +0.01 before rounding — presumably to push exact .5
# values upward instead of letting banker's rounding send them to the nearest
# even integer; TODO confirm intent.
X['N']=X.N+0.01
X['NC1']=X.NC1+0.01
X['NC2']=X.NC2+0.01
X['NC3']=X.NC3+0.01
X['NC4']=X.NC4+0.01
X['NC5']=X.NC5+0.01
# Classification targets: each (shifted) score rounded to the nearest integer.
y=X.round({"N":0}).N
y1=X.round({"NC1":0}).NC1
y2=X.round({"NC2":0}).NC2
y3=X.round({"NC3":0}).NC3
y4=X.round({"NC4":0}).NC4
y5=X.round({"NC5":0}).NC5
# The targets are not features — drop them from X.
X=X.drop(["NC1","NC2","NC3","NC4","NC5","N"], axis=1)
feature_names=X.columns
# NOTE(review): only two class labels are declared although the rounded
# scores may take more than two values — verify against the tree plots.
class_labels=["Disapprove","Approve"]
X.describe()
# Pairwise correlations of all features (excluding the shuffle key 'rn'),
# rendered as a heatmap and saved to disk.
correlation_matrix = nudge.drop(["rn"], axis=1).corr()  # Needs some adjustment
plt.figure(figsize=(10, 10))
sns.heatmap(
    correlation_matrix,
    vmax=.5,
    linewidths=0.01,
    square=True,
    annot=False,
    cmap='seismic',
    linecolor="white",
)
plt.savefig('corr.png')
plt.show()
# Alternative colormap: YlGnBu
from IPython.display import Image
def plot_tree(clf, feature_names, target_names):
    """Render a fitted decision tree classifier as PNG bytes via graphviz."""
    dot_source = sklearn.tree.export_graphviz(
        clf,
        out_file=None,
        feature_names=feature_names,
        class_names=target_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    graph = pydotplus.graph_from_dot_data(dot_source)
    return graph.create_png()
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

# Grid-search a decision tree predicting the overall approval score y (N).
params = {
    'min_samples_leaf': range(2, 100),
    'max_leaf_nodes': range(2, 10),
}
# FIX: renamed from `tree` — that name shadowed the sklearn.tree module
# imported at the top of the file.
base_estimator = DecisionTreeClassifier()
cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=base_estimator, cv=cv, param_grid=params)
# Survey weights are forwarded to the underlying estimator's fit().
grid.fit(X, y, sample_weight=WEIGHT)
# summarize the results of the grid search
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)
Image(plot_tree(grid.best_estimator_, feature_names, class_labels))
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

# Grid-search a decision tree predicting cluster target y1 (NC1).
search_space = {
    'min_samples_leaf': range(1, 100),
    'max_leaf_nodes': range(2, 100),
}
tree = DecisionTreeClassifier()
cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=tree, cv=cv, param_grid=search_space)
grid.fit(X, y1, sample_weight=WEIGHT)
# Report the winning configuration and its cross-validated score.
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)
Image(plot_tree(grid.best_estimator_, feature_names, class_labels))
# Grid-search a decision tree predicting cluster target y2 (NC2).
search_space = {
    'min_samples_leaf': range(2, 100),
    'max_leaf_nodes': range(2, 100),
}
tree = DecisionTreeClassifier()
cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=tree, cv=cv, param_grid=search_space)
grid.fit(X, y2, sample_weight=WEIGHT)
# Report the winning configuration and its cross-validated score.
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)
Image(plot_tree(grid.best_estimator_, feature_names, class_labels))
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

# Grid-search a decision tree predicting cluster target y3 (NC3).
params = {
    'min_samples_leaf': range(2, 100),
    'max_leaf_nodes': range(2, 100),
}
# BUG FIX: the original passed `presort=True`. That parameter was deprecated
# in scikit-learn 0.22 and removed in 0.24, so it raises a TypeError on
# current versions; it only affected training speed, never the fitted model.
tree = DecisionTreeClassifier()
cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=tree, cv=cv, param_grid=params)
grid.fit(X, y3, sample_weight=WEIGHT)
# summarize the results of the grid search
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)
Image(plot_tree(grid.best_estimator_, feature_names, class_labels))
# Grid-search a decision tree predicting cluster target y4 (NC4).
search_space = {
    'min_samples_leaf': range(2, 100),
    'max_leaf_nodes': range(2, 100),
}
tree = DecisionTreeClassifier()
cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=tree, cv=cv, param_grid=search_space)
grid.fit(X, y4, sample_weight=WEIGHT)
# Report the winning configuration and its cross-validated score.
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)
Image(plot_tree(grid.best_estimator_, feature_names, class_labels))
# Grid-search a decision tree predicting cluster target y5 (NC5).
search_space = {
    'min_samples_leaf': range(2, 100),
    'max_leaf_nodes': range(2, 100),
}
tree = DecisionTreeClassifier()
cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=tree, cv=cv, param_grid=search_space)
grid.fit(X, y5, sample_weight=WEIGHT)
# Report the winning configuration and its cross-validated score.
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)
Image(plot_tree(grid.best_estimator_, feature_names, class_labels))