In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
sns.set_style("whitegrid")
%run helper_functions.py

import pydotplus
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
In [2]:
nudge = pd.read_csv("master.csv")
In [3]:
nudge.head(10)
Out[3]:
country gender age yos N1 N2 N3 N4 N5 N6 ... health_concern health_concernf markets risk freedom WEIGHT height trustscore_inst trustscore_priv infoscore
0 2 1 57 10 1 1 1 0 1 1 ... 7 7 1 1 1 0.379489 185 16 25 29
1 2 1 47 17 1 0 1 1 0 1 ... 6 6 4 6 6 0.634562 171 36 24 25
2 1 1 34 6 0 1 1 1 1 1 ... 7 7 7 6 4 0.993351 175 26 24 36
3 3 1 45 0 1 1 1 1 1 1 ... 3 3 4 5 5 0.819534 180 48 38 29
4 2 1 58 9 1 0 1 1 1 1 ... 5 5 4 4 6 0.423669 198 54 33 20
5 3 1 62 4 1 1 1 0 1 1 ... 5 6 1 3 6 0.840970 176 37 28 27
6 1 1 63 16 0 0 1 1 0 1 ... 4 5 7 4 6 0.702601 185 35 33 19
7 4 1 21 13 1 1 0 1 1 1 ... 5 3 2 3 3 0.618010 175 40 29 23
8 3 1 64 5 0 0 1 1 0 1 ... 4 3 5 6 6 0.821471 182 55 34 28
9 2 1 47 14 1 1 1 1 1 1 ... 3 4 2 2 6 3.793416 171 20 25 20

10 rows × 48 columns

In [4]:
nudge.age.hist(bins=80)
plt.show()
In [5]:
nudge.groupby('politics').N1.mean().plot(kind='bar')
plt.show()
In [6]:
nudge.politics.describe()
Out[6]:
count    5526.000000
mean        3.941549
std         1.355940
min         1.000000
25%         3.000000
50%         4.000000
75%         5.000000
max         7.000000
Name: politics, dtype: float64
In [7]:
nudge.gender.describe()
Out[7]:
count    5526.000000
mean        0.510677
std         0.499931
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: gender, dtype: float64
In [8]:
ax = sns.countplot(x="gender", hue="N1", data=nudge)
plt.show()
In [9]:
sns.boxplot(x='gender', y='age', data=nudge)
plt.show()
In [10]:
sns.kdeplot(nudge.age.dropna())
plt.show()
In [11]:
nudge.describe()
Out[11]:
country gender age yos N1 N2 N3 N4 N5 N6 ... health_concern health_concernf markets risk freedom WEIGHT height trustscore_inst trustscore_priv infoscore
count 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 ... 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 4530.000000 5526.000000 5526.000000 5526.000000 5526.000000
mean 2.900471 0.510677 46.548860 12.340391 0.790626 0.671010 0.683858 0.536374 0.615816 0.843286 ... 4.472675 4.586319 3.963626 3.790445 5.006877 1.000000 171.247738 37.408614 25.894861 23.846182
std 1.380488 0.499931 16.372001 4.998149 0.406899 0.469889 0.465011 0.498720 0.486446 0.363564 ... 1.575915 1.507287 1.414961 1.441174 1.379864 0.563066 10.512065 12.211139 6.516528 7.305180
min 1.000000 0.000000 18.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 0.182926 64.000000 10.000000 6.000000 7.000000
25% 2.000000 0.000000 32.000000 10.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 ... 4.000000 4.000000 3.000000 3.000000 4.000000 0.720752 165.000000 29.000000 22.000000 19.000000
50% 3.000000 1.000000 47.000000 12.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 5.000000 5.000000 4.000000 4.000000 5.000000 0.879054 171.000000 38.000000 26.000000 24.000000
75% 4.000000 1.000000 60.000000 16.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 6.000000 6.000000 5.000000 5.000000 6.000000 1.080126 178.000000 46.000000 30.000000 29.000000
max 5.000000 1.000000 99.000000 30.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 7.000000 7.000000 7.000000 7.000000 7.000000 8.425268 213.000000 70.000000 42.000000 49.000000

8 rows × 48 columns

In [12]:
nudge["origin"]=nudge.country.map({1:"USA",2:"GER",3:"DEN",4:"SK",5:"BE"}) # One hot encoding of country
nudge=pd.concat([nudge, pd.get_dummies(nudge.origin)], axis=1)
nudge=nudge.drop(["country","origin"], axis=1)
In [13]:
nudge["NC1"]=(nudge.N6+nudge.N7+nudge.N12)*1/3
nudge['NC2']=(nudge.N1+nudge.N10+nudge.N2)*1/3
nudge['NC3']=(nudge.N3+nudge.N13+nudge.N9+nudge.N11+nudge.N5+nudge.N4)*1/6
nudge['NC4']=(nudge.N8)
nudge['NC5']=(nudge.N14+nudge.N15)*1/2
nudge['N']=(nudge.N1+nudge.N2+nudge.N3+nudge.N4+nudge.N5+nudge.N6+nudge.N7+nudge.N8+nudge.N9+nudge.N10+nudge.N11+nudge.N12+nudge.N13+nudge.N14+nudge.N15)*1/15
nudge['WEIGHT']=nudge.WEIGHT.fillna(value=1)
nudge.describe()
Out[13]:
gender age yos N1 N2 N3 N4 N5 N6 N7 ... DEN GER SK USA NC1 NC2 NC3 NC4 NC5 N
count 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 ... 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000 5526.000000
mean 0.510677 46.548860 12.340391 0.790626 0.671010 0.683858 0.536374 0.615816 0.843286 0.865364 ... 0.174810 0.277778 0.184039 0.183134 0.756062 0.746652 0.537821 0.432863 0.500995 0.611328
std 0.499931 16.372001 4.998149 0.406899 0.469889 0.465011 0.498720 0.486446 0.363564 0.341366 ... 0.379839 0.447944 0.387551 0.386812 0.281294 0.324369 0.282944 0.495517 0.399489 0.234921
min 0.000000 18.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 32.000000 10.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.666667 0.666667 0.333333 0.000000 0.000000 0.466667
50% 1.000000 47.000000 12.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.666667 1.000000 0.500000 0.000000 0.500000 0.666667
75% 1.000000 60.000000 16.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000 0.666667 1.000000 1.000000 0.800000
max 1.000000 99.000000 30.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 58 columns
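The same composite scores can also be built more compactly with row-wise means; a minimal sketch, equivalent to the cell above and assuming the N1–N15 columns:

nc_items = {
    'NC1': ['N6', 'N7', 'N12'],
    'NC2': ['N1', 'N10', 'N2'],
    'NC3': ['N3', 'N13', 'N9', 'N11', 'N5', 'N4'],
    'NC4': ['N8'],
    'NC5': ['N14', 'N15'],
}
for name, items in nc_items.items():
    nudge[name] = nudge[items].mean(axis=1)  # row-wise mean of the item columns
nudge['N'] = nudge[['N%d' % i for i in range(1, 16)]].mean(axis=1)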

In [14]:
# Shuffle the data set by sorting on a random key column
row = nudge.shape[0]
nudge['rn'] = np.random.randint(row + 100, size=row)
col = nudge.shape[1]
nudge = nudge.sort_values('rn')
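As an aside, pandas offers a one-step, reproducible shuffle that would make the helper column unnecessary (a sketch, not used below):

# equivalent shuffle; random_state makes the row order reproducible
nudge = nudge.sample(frac=1, random_state=0)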
In [15]:
nudge.head(10)
Out[15]:
gender age yos N1 N2 N3 N4 N5 N6 N7 ... GER SK USA NC1 NC2 NC3 NC4 NC5 N rn
177 0 69 1 1 0 0 1 1 1 1 ... 0 0 1 1.000000 0.666667 0.500000 1 0.5 0.666667 0
312 0 55 7 1 1 1 1 0 1 1 ... 0 0 1 1.000000 1.000000 0.666667 0 1.0 0.800000 3
2168 1 33 12 0 0 1 1 1 1 1 ... 0 0 1 1.000000 0.000000 0.666667 1 1.0 0.666667 5
1125 0 66 4 1 1 1 1 1 0 1 ... 1 0 0 0.666667 1.000000 0.666667 1 1.0 0.800000 7
3640 1 35 15 0 0 0 1 0 0 0 ... 1 0 0 0.000000 0.000000 0.166667 1 0.0 0.133333 7
3388 1 34 12 1 1 0 1 1 1 0 ... 0 0 1 0.666667 0.666667 0.666667 1 0.5 0.666667 7
2268 1 61 3 1 1 1 1 1 1 1 ... 0 0 1 0.666667 1.000000 0.666667 0 0.5 0.666667 11
3918 1 32 12 1 1 1 0 1 0 1 ... 0 0 1 0.333333 1.000000 0.666667 0 0.0 0.533333 11
1368 0 23 14 1 1 0 0 0 1 0 ... 0 1 0 0.333333 0.666667 0.166667 1 0.0 0.333333 11
3686 0 40 12 1 0 1 0 1 1 1 ... 1 0 0 0.666667 0.333333 0.500000 0 1.0 0.533333 12

10 rows × 59 columns

In [17]:
# Construction of the modelling data set
nudge['bmi'] = nudge.weight / ((nudge.height / 100) ** 2)  # BMI = weight (kg) / height (m)^2

# Drop the raw nudge items, the shuffle key, and the raw body measures
X = nudge.drop(["N1","N2","N3","N4","N5","N6","N7","N8","N9","N10","N11","N12","N13","N14","N15",
                "rn", "weight", "height"], axis=1)

# Remove implausible BMI values
X = X.drop(X[X.bmi >= 50].index)
X = X.drop(X[X.bmi < 12].index)

# Keep the survey weights as a separate array for use as sample_weight
WEIGHT = X.WEIGHT.values
X = X.drop(["WEIGHT"], axis=1)

# Small offset so that composite scores of exactly 0.5 round up to 1
# (pandas' round() rounds ties to the nearest even integer)
for c in ["N", "NC1", "NC2", "NC3", "NC4", "NC5"]:
    X[c] = X[c] + 0.01

# Binary targets: 1 = approve, 0 = disapprove of each nudge category
y  = X.round({"N": 0}).N
y1 = X.round({"NC1": 0}).NC1
y2 = X.round({"NC2": 0}).NC2
y3 = X.round({"NC3": 0}).NC3
y4 = X.round({"NC4": 0}).NC4
y5 = X.round({"NC5": 0}).NC5
X = X.drop(["NC1","NC2","NC3","NC4","NC5","N"], axis=1)
feature_names = X.columns
class_labels = ["Disapprove", "Approve"]
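The +0.01 offset deserves a short demonstration: numpy/pandas rounding is "round half to even", so a composite score of exactly 0.5 would otherwise round down to 0. A quick illustrative check:

s = pd.Series([0.5, 1.5, 2.5])
print(s.round())           # 0.0, 2.0, 2.0 -- ties go to the nearest even integer
print((s + 0.01).round())  # 1.0, 2.0, 3.0 -- the offset breaks ties upward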
In [18]:
X.describe()
Out[18]:
gender age yos city married noc income money_left car politics ... freedom trustscore_inst trustscore_priv infoscore BE DEN GER SK USA bmi
count 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000 ... 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000 5385.000000
mean 0.504921 46.676323 12.296565 3.124420 0.480409 1.179201 5.574373 0.583658 0.753018 3.945218 ... 5.011885 37.461653 25.925720 23.866110 0.184401 0.176787 0.281151 0.172331 0.185330 26.306973
std 0.500022 16.391199 4.988944 1.575292 0.499662 1.249522 3.240899 0.492997 0.431296 1.360040 ... 1.374197 12.161771 6.492885 7.278075 0.387847 0.381524 0.449603 0.377703 0.388601 6.003093
min 0.000000 18.000000 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 ... 1.000000 10.000000 6.000000 7.000000 0.000000 0.000000 0.000000 0.000000 0.000000 12.017305
25% 0.000000 32.000000 10.000000 2.000000 0.000000 0.000000 3.000000 0.000000 1.000000 3.000000 ... 4.000000 29.000000 22.000000 19.000000 0.000000 0.000000 0.000000 0.000000 0.000000 22.282592
50% 1.000000 47.000000 12.000000 3.000000 0.000000 1.000000 6.000000 1.000000 1.000000 4.000000 ... 5.000000 38.000000 26.000000 24.000000 0.000000 0.000000 0.000000 0.000000 0.000000 25.593737
75% 1.000000 60.000000 16.000000 5.000000 1.000000 2.000000 8.000000 1.000000 1.000000 5.000000 ... 6.000000 46.000000 30.000000 29.000000 0.000000 0.000000 1.000000 0.000000 0.000000 29.585799
max 1.000000 99.000000 30.000000 6.000000 1.000000 10.000000 11.000000 1.000000 1.000000 7.000000 ... 7.000000 70.000000 42.000000 49.000000 1.000000 1.000000 1.000000 1.000000 1.000000 49.593335

8 rows × 35 columns

Start of analysis

In [44]:
k = nudge.drop(["rn"], axis=1)
corr = k.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr, vmax=.5, linewidths=0.01,
            square=True, annot=False, cmap='seismic', linecolor="white")
plt.savefig('corr.png')
plt.show()
# alternative colormap: 'YlGnBu'
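A common refinement of such a heatmap (a sketch, not part of the original analysis) is to mask the redundant upper triangle so each pairwise correlation is drawn only once:

mask = np.triu(np.ones_like(corr, dtype=bool))  # True above the diagonal
plt.figure(figsize=(10, 10))
sns.heatmap(corr, mask=mask, vmax=.5, linewidths=0.01,
            square=True, cmap='seismic', linecolor="white")
plt.show()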

from IPython.display import Image

def plot_tree(clf, feature_names, target_names):
    # Render a fitted decision tree as a PNG via graphviz
    dot_data = sklearn.tree.export_graphviz(clf, out_file=None,
                                            feature_names=feature_names,
                                            class_names=target_names,
                                            filled=True, rounded=True,
                                            special_characters=True)
    return pydotplus.graph_from_dot_data(dot_data).create_png()

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

params = {
    'min_samples_leaf': range(2, 100),
    'max_leaf_nodes':   range(2, 10)
}

clf = DecisionTreeClassifier()  # named clf so it does not shadow the sklearn.tree module

cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=clf, cv=cv, param_grid=params)
grid.fit(X, y, sample_weight=WEIGHT)
# summarize the results of the grid search
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

Image(plot_tree(grid.best_estimator_, feature_names, class_labels))
Best parameters:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=9, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=56,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
Best score:  0.7217996289424861
Out[21]:
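The Out cell above renders the fitted tree. Beyond the picture, GridSearchCV keeps the full search history in cv_results_, and the fitted tree exposes feature_importances_; a small sketch for inspecting both, using the names defined above:

results = pd.DataFrame(grid.cv_results_)
print(results[['param_min_samples_leaf', 'param_max_leaf_nodes',
               'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False).head())

importances = pd.Series(grid.best_estimator_.feature_importances_,
                        index=feature_names).sort_values(ascending=False)
print(importances.head(10))  # which features the tree actually splits on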
In [22]:
params = {
    'min_samples_leaf': range(1, 100),
    'max_leaf_nodes':   range(2, 100)
}

clf = DecisionTreeClassifier()

cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=clf, cv=cv, param_grid=params)
grid.fit(X, y1, sample_weight=WEIGHT)
# summarize the results of the grid search
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

Image(plot_tree(grid.best_estimator_, feature_names, class_labels))
Best parameters:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=5, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
Best score:  0.8350649350649351
Out[22]:
In [23]:
params = {
    'min_samples_leaf': range(2, 100),
    'max_leaf_nodes':   range(2, 100)
}

clf = DecisionTreeClassifier()

cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=clf, cv=cv, param_grid=params)
grid.fit(X, y2, sample_weight=WEIGHT)
# summarize the results of the grid search
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

Image(plot_tree(grid.best_estimator_, feature_names, class_labels))
Best parameters:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=14,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=22, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Best score:  0.7777365491651206
Out[23]:
In [24]:
params = {
    'min_samples_leaf': range(2, 100),
    'max_leaf_nodes':   range(2, 100)
}

clf = DecisionTreeClassifier(presort=True)  # presort can speed up fitting on smaller data sets

cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=clf, cv=cv, param_grid=params)
grid.fit(X, y3, sample_weight=WEIGHT)
# summarize the results of the grid search
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

Image(plot_tree(grid.best_estimator_, feature_names, class_labels))
Best parameters:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=17,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=96, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=True, random_state=None,
            splitter='best')
Best score:  0.6744897959183673
Out[24]:
In [25]:
params = {
    'min_samples_leaf': range(2, 100),
    'max_leaf_nodes':   range(2, 100)
}

clf = DecisionTreeClassifier()

cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=clf, cv=cv, param_grid=params)
grid.fit(X, y4, sample_weight=WEIGHT)
# summarize the results of the grid search
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

Image(plot_tree(grid.best_estimator_, feature_names, class_labels))
Best parameters:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=22,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=98, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Best score:  0.6337662337662338
Out[25]:
In [26]:
params = {
    'min_samples_leaf': range(2, 100),
    'max_leaf_nodes':   range(2, 100)
}

clf = DecisionTreeClassifier()

cv = ShuffleSplit(n_splits=20, random_state=0)
grid = GridSearchCV(estimator=clf, cv=cv, param_grid=params)
grid.fit(X, y5, sample_weight=WEIGHT)
# summarize the results of the grid search
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

Image(plot_tree(grid.best_estimator_, feature_names, class_labels))
Best parameters:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=4, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=2,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
Best score:  0.6894248608534322
Out[26]:
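cross_val_score was imported at the top but never used; as a final sanity check (a sketch, not part of the original run), it can re-score a tuned tree under plain k-fold cross-validation. The survey weights are omitted here for simplicity:

# re-evaluate the last tuned tree under 10-fold CV
scores = cross_val_score(grid.best_estimator_, X, y5, cv=10)
print("10-fold accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))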
In [ ]: