Python: Principal Component Analysis

Utilizzo l’environment conda py3

~$ conda activate py3

PCA

# lib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# cancer data
cancer = load_breast_cancer()
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

# df
df = pd.DataFrame(cancer['data'],columns=cancer['feature_names'])
df.head()

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst radius	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension
0	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	0.2419	0.07871	...	25.38	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890
1	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	0.1812	0.05667	...	24.99	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902
2	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	0.2069	0.05999	...	23.57	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758
3	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	0.2597	0.09744	...	14.91	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300
4	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	0.1809	0.05883	...	22.54	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678

5 rows × 30 columns

# scaling data
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)

# PCA
pca = PCA(n_components=2)
pca.fit(scaled_data)

PCA(n_components=2)

# trasform data to two components (da 30 colonne a 2)
x_pca = pca.transform(scaled_data)

print(scaled_data.shape)
print(x_pca.shape)

(569, 30)
(569, 2)

# plot PCA
plt.figure(figsize=(8,6))
plt.scatter(x_pca[:,0],x_pca[:,1],c=cancer['target'],cmap='plasma',edgecolors='black',alpha=0.5)
plt.xlabel('First principal component')
plt.ylabel('Second Principal Component')

Text(0, 0.5, 'Second Principal Component')

png

Coefficients

# interpretazioni coefficienti
# correlazione tra le variabili e le componenti
df_comp = pd.DataFrame(pca.components_,columns=cancer['feature_names'])
df_comp.head()

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst radius	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension
0	0.218902	0.103725	0.227537	0.220995	0.142590	0.239285	0.258400	0.260854	0.138167	0.064363	...	0.227997	0.104469	0.236640	0.224871	0.127953	0.210096	0.228768	0.250886	0.122905	0.131784
1	-0.233857	-0.059706	-0.215181	-0.231077	0.186113	0.151892	0.060165	-0.034768	0.190349	0.366575	...	-0.219866	-0.045467	-0.199878	-0.219352	0.172304	0.143593	0.097964	-0.008257	0.141883	0.275339

2 rows × 30 columns

# heatmap
plt.figure(figsize=(12,6))
sns.heatmap(df_comp,cmap='plasma')

<matplotlib.axes._subplots.AxesSubplot at 0x7fee706d6d50>

png

Biplot

# il biplot è la rappresentazione grafica delle due prime componenti
# utile anche per identificare gli outliers

# funzione per il biplot
def biplot(score,coeff,y=None,labels=None):
    xs = score[:,0] # projection of PC1
    ys = score[:,1] # projection of PC2
    n = coeff.shape[0]
    scalex = 1.0/(xs.max() - xs.min())
    scaley = 1.0/(ys.max() - ys.min())
    plt.scatter(xs * scalex,ys * scaley, c = y)
    
    # aggiungo frecce
    for i in range(n):
        plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = 'r',alpha = 0.5)
        if labels is None:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, "Var"+str(i+1), color = 'g', ha = 'center', va = 'center')
        else:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = 'g', ha = 'center', va = 'center')

    # aggiungo flag outliers
    for i in range(len(xs)):
        if ((xs[i]>np.quantile(xs,0.99)) | (ys[i]>np.quantile(xs,0.99))):
            plt.text(xs[i] * scalex,ys[i] * scaley, str(i))

    # etichette
    plt.xlabel("PC{}".format(1), size=14)
    plt.ylabel("PC{}".format(2), size=14)
    plt.grid()
    plt.tick_params(axis='both', which='both', labelsize=14)
    plt.tight_layout()

# plot
plt.figure(figsize=(10,8))
biplot(score=x_pca,coeff=np.transpose(pca.components_),labels=df.columns)

# se ci fossero più componenti plotto solo le prime due (nb. sono array non dataframes)
# score=x_pca[:,0:2],coef=np.transpose(pca.components_[0:2, :])

# le variabili nella stessa direzione sono correlate

png

# esempio stessa direzione
np.corrcoef(df['worst perimeter'],df['mean perimeter'])[1,0]

0.9703868870426392

Outliers

# l'osservazione 461 è outliers per la prima componente, che ha coefficienti positivi verso le variabili 
# infatti i valori delle sue variabili sono vicini ai massimi (vedi il describe)
x_pca[461,]

array([16.31923323, -7.7758528 ])

# l'osservazione 152 è outliers per la seconda componente
# la PC2 ha coefficienti negativi verso le variabili tranne per 'mean fractal dimension'
# infatti i valori delle sue variabili sono vicini ai minimi, tranne per 'mean fractal dimension' che è elevata
x_pca[152,]

array([ 7.09330671, 12.57319423])

# describe stats assieme a due outliers
pd.concat([df.describe(),df.iloc[[461,152]]],axis=0)

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst radius	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension
count	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	...	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000
mean	14.127292	19.289649	91.969033	654.889104	0.096360	0.104341	0.088799	0.048919	0.181162	0.062798	...	16.269190	25.677223	107.261213	880.583128	0.132369	0.254265	0.272188	0.114606	0.290076	0.083946
std	3.524049	4.301036	24.298981	351.914129	0.014064	0.052813	0.079720	0.038803	0.027414	0.007060	...	4.833242	6.146258	33.602542	569.356993	0.022832	0.157336	0.208624	0.065732	0.061867	0.018061
min	6.981000	9.710000	43.790000	143.500000	0.052630	0.019380	0.000000	0.000000	0.106000	0.049960	...	7.930000	12.020000	50.410000	185.200000	0.071170	0.027290	0.000000	0.000000	0.156500	0.055040
25%	11.700000	16.170000	75.170000	420.300000	0.086370	0.064920	0.029560	0.020310	0.161900	0.057700	...	13.010000	21.080000	84.110000	515.300000	0.116600	0.147200	0.114500	0.064930	0.250400	0.071460
50%	13.370000	18.840000	86.240000	551.100000	0.095870	0.092630	0.061540	0.033500	0.179200	0.061540	...	14.970000	25.410000	97.660000	686.500000	0.131300	0.211900	0.226700	0.099930	0.282200	0.080040
75%	15.780000	21.800000	104.100000	782.700000	0.105300	0.130400	0.130700	0.074000	0.195700	0.066120	...	18.790000	29.720000	125.400000	1084.000000	0.146000	0.339100	0.382900	0.161400	0.317900	0.092080
max	28.110000	39.280000	188.500000	2501.000000	0.163400	0.345400	0.426800	0.201200	0.304000	0.097440	...	36.040000	49.540000	251.200000	4254.000000	0.222600	1.058000	1.252000	0.291000	0.663800	0.207500
461	27.420000	26.270000	186.900000	2501.000000	0.108400	0.198800	0.363500	0.168900	0.206100	0.056230	...	36.040000	31.370000	251.200000	4254.000000	0.135700	0.425600	0.683300	0.262500	0.264100	0.074270
152	9.731000	15.340000	63.780000	300.200000	0.107200	0.159900	0.410800	0.078570	0.254800	0.092960	...	11.020000	19.490000	71.040000	380.500000	0.129200	0.277200	0.821600	0.157100	0.310800	0.125900

10 rows × 30 columns

# pulisco il workspace
%reset -f

Alberto

Python: Principal Component Analysis

PCA

Coefficients

Biplot

Outliers

You may also enjoy

to do

Ricerca e acquisto casa

Statistics: Inference

Statistics: Network Science