Alessandro Cucci Energee3 srl
import pandas as pd
# Load the loan history ("storico prestiti") CSV; the relative path is
# resolved against the current working directory.
df = pd.read_csv("storico_prestiti.csv")
# Preview the first rows to check that the file was parsed as expected.
df.head()
default | amount | grade | years | ownership | income | age | |
---|---|---|---|---|---|---|---|
0 | 0 | 1000 | B | 2.0 | RENT | 19200.0 | 24 |
1 | 1 | 6500 | A | 2.0 | MORTGAGE | 66000.0 | 28 |
2 | 0 | 2400 | A | 2.0 | RENT | 60000.0 | 36 |
3 | 0 | 10000 | C | 3.0 | RENT | 62000.0 | 24 |
4 | 1 | 4000 | C | 2.0 | RENT | 20000.0 | 28 |
# Show the dtype pandas inferred for each column.
df.dtypes
default int64 amount int64 grade object years float64 ownership object income float64 age int64 dtype: object
# Report the dataset size; df.shape is unpacked into (rows, columns).
n_rows, n_cols = df.shape
print("Righe: {0}, Colonne: {1}".format(n_rows, n_cols))
Righe: 7727, Colonne: 7
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns.
df.describe()
default | amount | years | income | age | |
---|---|---|---|---|---|
count | 7727.000000 | 7727.000000 | 7448.000000 | 7.727000e+03 | 7727.000000 |
mean | 0.478452 | 9453.345412 | 6.086332 | 6.337197e+04 | 27.542125 |
std | 0.499568 | 6298.595800 | 6.700758 | 4.687195e+04 | 6.132121 |
min | 0.000000 | 500.000000 | 0.000000 | 4.000000e+03 | 20.000000 |
25% | 0.000000 | 5000.000000 | 2.000000 | 3.700000e+04 | 23.000000 |
50% | 0.000000 | 8000.000000 | 4.000000 | 5.400000e+04 | 26.000000 |
75% | 1.000000 | 12000.000000 | 8.000000 | 7.679650e+04 | 30.000000 |
max | 1.000000 | 35000.000000 | 62.000000 | 1.200000e+06 | 94.000000 |
# Count the missing values in each column.
df.isnull().sum()
default 0 amount 0 grade 0 years 279 ownership 0 income 0 age 0 dtype: int64
import numpy as np
# `years` is the only column reporting missing values (279 in the null count);
# replace them with the mean of the observed values.
df["years"] = df["years"].fillna(df["years"].mean())
# Histogram of the 0/1 `default` column.
df["default"].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f965a9afb00>
# Horizontal bar chart of the number of rows per `grade` value.
df.grade.value_counts().plot.barh()
<matplotlib.axes._subplots.AxesSubplot at 0x7f965a9e9ac8>
# Histogram of the `age` column using 30 bins.
df.age.plot.hist(bins=30)
<matplotlib.axes._subplots.AxesSubplot at 0x7f96575768d0>
# Scatter plot of income against age; alpha=0.7 makes overlapping points
# partly transparent.
df.plot.scatter(x='age', y='income', alpha=0.7)
<matplotlib.axes._subplots.AxesSubplot at 0x7f965739e780>
# Same age/income scatter, with both axes on a log scale and the points
# colored by the `default` column through the viridis colormap.
df.plot.scatter(x='age', y='income', c='default', logx=True, logy=True, cmap='viridis')
<matplotlib.axes._subplots.AxesSubplot at 0x7f965739eba8>
# Summary of every column, including the non-numeric ones (which report
# unique / top / freq instead of mean and quantiles).
df.describe(include='all')
default | amount | grade | years | ownership | income | age | |
---|---|---|---|---|---|---|---|
count | 7727.00 | 7727.00 | 7727 | 7727.00 | 7727 | 7727.00 | 7727.00 |
unique | nan | nan | 7 | nan | 4 | nan | nan |
top | nan | nan | B | nan | RENT | nan | nan |
freq | nan | nan | 2456 | nan | 4054 | nan | nan |
mean | 0.48 | 9453.35 | NaN | 6.09 | NaN | 63371.97 | 27.54 |
std | 0.50 | 6298.60 | NaN | 6.58 | NaN | 46871.95 | 6.13 |
min | 0.00 | 500.00 | NaN | 0.00 | NaN | 4000.00 | 20.00 |
25% | 0.00 | 5000.00 | NaN | 2.00 | NaN | 37000.00 | 23.00 |
50% | 0.00 | 8000.00 | NaN | 4.00 | NaN | 54000.00 | 26.00 |
75% | 1.00 | 12000.00 | NaN | 8.00 | NaN | 76796.50 | 30.00 |
max | 1.00 | 35000.00 | NaN | 62.00 | NaN | 1200000.00 | 94.00 |
from sklearn.preprocessing import LabelEncoder
# Encode on a copy so the original string categories stay available in `df`.
df_encoded = df.copy()

# Each categorical column gets its own LabelEncoder, fitted on the very column
# it then transforms (both encoders now read from `df_encoded`; previously the
# ownership encoder was fitted on `df` but applied to `df_encoded`).
# The fitted encoders are kept in `le_grade` / `le_ownership`.
le_grade = LabelEncoder().fit(df_encoded["grade"])
df_encoded["grade"] = le_grade.transform(df_encoded["grade"])

le_ownership = LabelEncoder().fit(df_encoded["ownership"])
df_encoded["ownership"] = le_ownership.transform(df_encoded["ownership"])

# Preview the encoded frame: grade and ownership are now integer codes.
df_encoded.head()
default | amount | grade | years | ownership | income | age | |
---|---|---|---|---|---|---|---|
0 | 0 | 1000 | 1 | 2.00 | 3 | 19200.00 | 24 |
1 | 1 | 6500 | 0 | 2.00 | 0 | 66000.00 | 28 |
2 | 0 | 2400 | 0 | 2.00 | 3 | 60000.00 | 36 |
3 | 0 | 10000 | 2 | 3.00 | 3 | 62000.00 | 24 |
4 | 1 | 4000 | 2 | 2.00 | 3 | 20000.00 | 28 |
from IPython.display import Image
# Show the illustration stored in img/decisionanimals.png.
Image("img/decisionanimals.png")
L’algoritmo è applicato ad ogni nodo fino a che la costruzione dell’albero non si arresta.
L’indice di eterogeneità di Gini si definisce in generale come:
$H = 1 - \sum\limits_{j = 1}^J {f_j^2 }$
Dove $f_j$ rappresenta la frequenza relativa di osservazioni la cui modalità della variabile è pari a j.
In un classificatore binario, l’impurità in un nodo sarà quindi pari a:
$i_Y (t) = 1 - \sum\limits_{j = 1}^J {p^2 \left( {t\left| {Y = j} \right.} \right)}$
Dove $i_Y (t)$ è la misura di impurità in un generico nodo t e $p \left( {t\left| {Y = j} \right.} \right)$ è la proporzione di unità nel nodo t che appartengono alla j-esima classe della variabile di risposta Y.
Per maggiori info: https://it.wikipedia.org/wiki/Indice_di_eterogeneità_di_Gini
# Two-feature subset (age, amount) used to fit the small trees below.
# The columns are selected with a list of labels: a tuple is the key form for
# MultiIndex columns, so a list states the "several columns" intent explicitly.
X_2 = df_encoded.loc[:, ['age', 'amount']]
# Target: the 0/1 `default` column.
y = df_encoded.loc[:, 'default']
from sklearn import tree
# Shallow tree (max_depth=2) fitted on the two-feature subset.
clf_dt = tree.DecisionTreeClassifier(max_depth=2).fit(X_2, y)
# NOTE(review): `graph` is not defined in the visible source - presumably a
# Graphviz export of `clf_dt` built in a cell that is missing here; confirm.
Image(graph.create_png())
# NOTE(review): `plot_boundaries` is not defined in the visible source either;
# verify where it comes from before running this.
plot_boundaries(X_2, clf_dt)

# Deeper tree (max_depth=5) on the same two features, for comparison.
clf_dt_5 = tree.DecisionTreeClassifier(max_depth=5)
clf_dt_5 = clf_dt_5.fit(X_2, y)
plot_boundaries(X_2, clf_dt_5)

# Histogram of the classes predicted on the training data itself.
# NOTE(review): `plt` is not imported in the visible source; confirm that
# matplotlib.pyplot is imported elsewhere.
pred_class = clf_dt_5.predict(X_2)
plt.hist(pred_class);
# Show the illustrations stored in img/roc.png and img/cv.jpg.
Image("img/roc.png")
Image("img/cv.jpg")
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
# Features: every column after the first one (amount, grade, years, ownership,
# income, age in the df_encoded preview).
X = df_encoded.iloc[:,1:]
# Target: the first column, which is `default` in the df_encoded preview.
y = df_encoded.iloc[:,0]
def cross_val(clf, k):
    """Run stratified k-fold cross-validation and print the ROC AUC scores.

    The folds are drawn from the module-level ``X`` (features) and ``y``
    (target). For each fold the classifier is fitted on the training rows and
    scored on the held-out rows; the fold's ROC AUC is printed immediately,
    and the mean over all folds is printed once at the end.

    Args:
        clf: Estimator exposing ``fit`` and ``predict_proba``.
        k: Number of folds, passed to ``StratifiedKFold`` as ``n_splits``.

    Returns:
        None. The scores are only printed.
    """
    splitter = StratifiedKFold(n_splits=k)
    fold_scores = []
    for train_rows, test_rows in splitter.split(X, y):
        X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
        X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]

        clf = clf.fit(X_train, y_train)
        # Keep only the probability column at index 1 as the ranking score
        # (presumably the probability of default == 1 - confirm against
        # clf.classes_).
        positive_proba = clf.predict_proba(X_test)[:, 1]

        fold_auc = roc_auc_score(y_test, positive_proba)
        print(fold_auc)
        fold_scores.append(fold_auc)
    print("Media roc della cross validation:", np.mean(fold_scores))
# Evaluate a depth-2 tree with 3-fold and then 5-fold cross-validation.
# The same estimator object is passed to both calls; cross_val fits it again
# on every fold.
clf_dt_2 = tree.DecisionTreeClassifier(max_depth=2)
print("Cross validation: profondità = 2, k = 3")
cross_val(clf_dt_2, 3)
# Blank line to separate the two runs in the printed output.
print("")
print("Cross validation: profondità = 2, k = 5")
cross_val(clf_dt_2, 5)
Cross validation: profondità = 2, k = 3 0.626445552369 0.634841191943 0.712316327567 Media roc della cross validation: 0.657867690626 Cross validation: profondità = 2, k = 5 0.642202903896 0.606714841392 0.63267291659 0.696277411968 0.706100558397 Media roc della cross validation: 0.656793726449
# Repeat the 5-fold evaluation with a deeper tree (max_depth=5).
# This rebinds `clf_dt_5` to a fresh, unfitted estimator, replacing the one
# fitted earlier on the two-feature subset.
clf_dt_5 = tree.DecisionTreeClassifier(max_depth=5)
cross_val(clf_dt_5, 5)
0.653413587285 0.634923043391 0.654500918349 0.713544727131 0.730137131191 Media roc della cross validation: 0.677303881469
# Show the illustration stored in img/tradeoff.png.
Image("img/tradeoff.png")