Python: Tree and Random Forest

Utilizzo l’environment conda py3

1
~$ conda activate py3

Versione modulo installato

1
2
3
4
5
6
7
8
9
10
11
12
~$ pip show pydot
Name: pydot
Version: 1.4.1
Summary: Python interface to Graphviz's Dot
Home-page: https://github.com/pydot/pydot
Author: Ero Carrera
Author-email: ero@dkbza.org
License: MIT
Location: /home/user/miniconda3/envs/py3/lib/python3.7/site-packages
Requires: pyparsing
Required-by: 

Tree Methods

Decision Tree

Se si usa l’algoritmo ID3, lo split deve massimizzare l’Information Gain
Entropy
\(H\left (S\right )=-\sum_{i}p_{i}\left (S\right)\log_{2}p_{i}\left (S\right )\)
Information Gain
\(IG\left (S,A\right )=H\left (S\right )-\sum_{\nu \in Values\left (A\right )}\frac{\left | S_{\nu}\right |}{\left | S\right |}H\left (S_{\nu}\right )=H\left (S\right )-H\left (S\mid A\right )\)

Random Forest

Each tree is generated by bootstrapping the rows and sampling the columns.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# lib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix

from IPython.display import Image  
from six import StringIO  
from sklearn.tree import export_graphviz
import pydot

from sklearn.ensemble import RandomForestClassifier
1
2
3
# data: load the kyphosis dataset from a CSV file given by a relative path
df = pd.read_csv(r'kyphosis.csv')
# preview the first rows
df.head()
Kyphosis Age Number Start
0 absent 71 3 5
1 absent 158 3 14
2 present 128 4 5
3 absent 2 5 1
4 absent 1 4 15
1
2
3
4
# target distribution
print(df['Kyphosis'].value_counts())
sns.countplot(x='Kyphosis',data=df,palette='Set1') # alternative: palette='RdBu_r'
# the target is fairly imbalanced
1
2
3
4
5
absent     64
present    17
Name: Kyphosis, dtype: int64

<matplotlib.axes._subplots.AxesSubplot at 0x7f03c04281d0>

png

1
# summary of the columns: non-null counts and dtypes
df.info()
1
2
3
4
5
6
7
8
9
10
11
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Kyphosis  81 non-null     object
 1   Age       81 non-null     int64 
 2   Number    81 non-null     int64 
 3   Start     81 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 2.7+ KB
1
2
# pairplot: pairwise plots of the columns, coloured by the Kyphosis class
sns.pairplot(df,hue='Kyphosis',palette='Set1')
1
<seaborn.axisgrid.PairGrid at 0x7f03cc1a2290>

png

1
2
3
4
# train test split: features are every column except the 'Kyphosis' target
X = df.drop('Kyphosis',axis=1)
y = df['Kyphosis']
# hold out 30% of the rows as the test set
# NOTE(review): no random_state is passed, so the split is presumably not
# reproducible across runs -- consider fixing a seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

Decision Tree

1
2
3
# tree: decision tree with default hyperparameters, fitted on the training split
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
1
DecisionTreeClassifier()
1
2
# predictions of the decision tree on the test split
predictions = dtree.predict(X_test)
1
2
3
4
5
# confusion matrix and classification metrics for the decision tree
print('\nConfusion Matrix:')
print(confusion_matrix(y_test,predictions))
print('\nClassification metrics:')
print(classification_report(y_test,predictions))
1
2
3
4
5
6
7
8
9
10
11
12
13
Confusion Matrix:
[[17  2]
 [ 2  4]]

Classification metrics:
              precision    recall  f1-score   support

      absent       0.89      0.89      0.89        19
     present       0.67      0.67      0.67         6

    accuracy                           0.84        25
   macro avg       0.78      0.78      0.78        25
weighted avg       0.84      0.84      0.84        25

Tree Visualization

1
2
3
# cols: feature names are every column after the first one (the first is 'Kyphosis')
features = list(df.columns[1:])
features
1
['Age', 'Number', 'Start']
1
2
3
4
5
6
# plot tree: export the fitted tree in Graphviz DOT format into an in-memory text buffer
dot_data = StringIO()  
export_graphviz(dtree, out_file=dot_data,feature_names=features,filled=True,rounded=True)

# parse the DOT text with pydot and render the first returned graph as a PNG image
graph = pydot.graph_from_dot_data(dot_data.getvalue())  
Image(graph[0].create_png())

png

Random Forest

1
2
3
# RF: random forest with 300 trees, fitted on the same training split
rfc = RandomForestClassifier(n_estimators=300)
rfc.fit(X_train, y_train)
1
RandomForestClassifier(n_estimators=300)
1
2
# predictions of the random forest on the test split
rfc_pred = rfc.predict(X_test)
1
2
3
4
5
# confusion matrix and classification metrics for the random forest
print('\nConfusion Matrix:')
print(confusion_matrix(y_test,rfc_pred))
print('\nClassification metrics:')
print(classification_report(y_test,rfc_pred))
1
2
3
4
5
6
7
8
9
10
11
12
13
Confusion Matrix:
[[18  1]
 [ 3  3]]

Classification metrics:
              precision    recall  f1-score   support

      absent       0.86      0.95      0.90        19
     present       0.75      0.50      0.60         6

    accuracy                           0.84        25
   macro avg       0.80      0.72      0.75        25
weighted avg       0.83      0.84      0.83        25

Altro

Better confusion matrix

1
2
3
4
5
# confusion matrix where 'absent' is the "Yes" class (rows: true labels, columns: predictions)
pd.DataFrame(confusion_matrix(y_test,rfc_pred, labels=['absent', 'present']),
             index=['TRUE | Yes', 'TRUE | No'],
             columns=['PRED | Yes', 'PRED | No'])
# example: precision for absent is 18/(18+3)=0.86
PRED | Yes PRED | No
TRUE | Yes 18 1
TRUE | No 3 3
1
2
3
4
5
# confusion matrix where 'present' is the "Yes" class (rows: true labels, columns: predictions)
pd.DataFrame(confusion_matrix(y_test,rfc_pred, labels=['present','absent']),
             index=['TRUE | Yes', 'TRUE | No'],
             columns=['PRED | Yes', 'PRED | No'])
# example: recall for present is 3/(3+3)=0.5
PRED | Yes PRED | No
TRUE | Yes 3 3
TRUE | No 1 18
1
2
3
4
5
6
7
8
9
10
# wrap the labelled confusion matrix in a reusable function
def confusion_matrix_bella(y_true, y_pred, labels='default'):
    """Return the confusion matrix as a DataFrame with labelled axes.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    labels : sequence, 'default' or None, optional
        Class labels, in the order they should appear on both axes.
        With 'default' (or None) the sorted unique values found in
        ``y_true`` and ``y_pred`` are used.

    Returns
    -------
    pandas.DataFrame
        Rows are indexed 'TRUE | <label>', columns 'PRED | <label>'.
    """
    # Compare by value, not identity: `labels is 'default'` only works when
    # the interpreter happens to intern the string and raises a SyntaxWarning
    # on Python 3.8+. The isinstance guard keeps `==` from being applied
    # element-wise when `labels` is an array of class names.
    if labels is None or (isinstance(labels, str) and labels == 'default'):
        labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
    cmtx = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=labels),
        index=['TRUE | {:}'.format(x) for x in labels],
        columns=['PRED | {:}'.format(x) for x in labels]
    )
    return cmtx
1
# labelled confusion matrix for the random forest predictions, with 'absent' listed first
confusion_matrix_bella(y_test,rfc_pred,labels=['absent','present'])
PRED | absent PRED | present
TRUE | absent 18 1
TRUE | present 3 3

Better distplot

1
2
3
4
5
6
7
8
9
10
11
12
# better distplot function
def distplot_fig(data, x, hue=None, row=None, col=None, height=3, aspect=1, legend=True, hist=False, **kwargs):
    """A figure-level distribution plot with support for hue, col, row arguments.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the plotted column and any grouping columns.
    x : str
        Name of the column whose distribution is drawn.
    hue, row, col : str, optional
        Grouping columns forwarded to ``sns.FacetGrid``.
    height, aspect : number, optional
        Facet size, forwarded to ``sns.FacetGrid``.
    legend : bool, optional
        Add a legend when ``hue`` is set and is not already used as
        ``x``, ``row`` or ``col``.
    hist : bool, optional
        Forwarded to ``sns.distplot``; when true, the bins are resolved
        to shared edges (see below).
    **kwargs
        Forwarded to ``sns.distplot``.  ``bins`` may be omitted, or be
        anything ``np.histogram_bin_edges`` accepts (int, str, sequence).

    Returns
    -------
    seaborn.FacetGrid
        The grid the plots were drawn on.
    """
    bins = kwargs.pop('bins', None)
    if hist:
        # Make sure that the groups have equal-sized bins: always turn `bins`
        # into explicit edges computed on the whole column.  Previously this
        # only happened when `bins` was omitted, so an integer such as
        # bins=30 was re-binned separately for every hue/row/col subset.
        # 10 is numpy's own default, i.e. what the omitted case used before.
        bins = np.histogram_bin_edges(data[x].dropna(), bins=10 if bins is None else bins)
    g = sns.FacetGrid(data, hue=hue, row=row, col=col, height=height, aspect=aspect)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here because switching plot functions would change the output.
    g.map(sns.distplot, x, bins=bins, hist=hist, **kwargs)
    if legend and (hue is not None) and (hue not in [x, row, col]):
        g.add_legend(title=hue)
    return g
1
2
# better distplot: histogram of Age (no KDE) with one colour per Kyphosis class
distplot_fig(data=df,x='Age',hue='Kyphosis',kde=False,hist=True,hist_kws=dict(edgecolor='grey'),bins=30, height=4, aspect=2)
1
<seaborn.axisgrid.FacetGrid at 0x7f03c05ee910>

png

1
2
3
4
5
# multiple distplot (seaborn): overlay one Age histogram per Kyphosis class
plt.figure(figsize=(10,6))
for klass, colour, legend_label in (
    ('absent', 'b', 'Kyphosis = absent'),
    ('present', 'y', 'Kyphosis = present'),
):
    # Age values of the rows belonging to the current class
    ages = df.loc[df['Kyphosis'] == klass, 'Age']
    sns.distplot(ages, kde=False, color=colour, bins=20,
                 hist_kws=dict(edgecolor='grey'), label=legend_label)
plt.legend()
1
<matplotlib.legend.Legend at 0x7f03c9a55550>

png

1
2
3
4
5
6
7
# multiple distplot (matplotlib): overlay one semi-transparent Age histogram per Kyphosis class
plt.figure(figsize=(10,6))
for klass, colour, legend_label in (
    ('absent', 'b', 'Kyphosis = absent'),
    ('present', 'y', 'Kyphosis = present'),
):
    # Age values of the rows belonging to the current class
    ages = df.loc[df['Kyphosis'] == klass, 'Age']
    ages.hist(alpha=0.5, color=colour, bins=20, label=legend_label, edgecolor="grey")
plt.legend()
plt.xlabel('Age')
plt.grid(None)

png