Python: Pandas

I use the py3 conda environment

~$ conda activate py3

Version of the installed module

~$ pip show pandas
Name: pandas
Version: 1.0.5
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: /home/user/miniconda3/envs/py3/lib/python3.7/site-packages
Requires: pytz, python-dateutil, numpy
Required-by: seaborn

Pandas

Series

import numpy as np
import pandas as pd
labels = ['a', 'b', 'c']   # list
my_data = [10,20,30]       # list
arr = np.array(my_data)    # array
d = {'a':10,'b':20,'c':30} # dictionary
pd.Series(data = my_data)
0    10
1    20
2    30
dtype: int64
pd.Series(data = my_data, index=labels)
a    10
b    20
c    30
dtype: int64
pd.Series(my_data, labels)
a    10
b    20
c    30
dtype: int64
pd.Series(arr, labels)  # a NumPy array works just like a list here
a    10
b    20
c    30
dtype: int64
pd.Series(d)  # dict keys become the index, values the elements
a    10
b    20
c    30
dtype: int64
pd.Series(data=[sum,print,len]) # a pandas Series can even hold references to functions
0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object
# filtering
ser1 = pd.Series([1,2,3,4],['USA','Germany','USSR','Japan'])
ser1
USA        1
Germany    2
USSR       3
Japan      4
dtype: int64
ser2 = pd.Series([1,2,5,4],['USA','Germany','Italy','Japan'])
ser2
USA        1
Germany    2
Italy      5
Japan      4
dtype: int64
ser1['USA']
1
ser3 = pd.Series(data=labels)
ser3
0    a
1    b
2    c
dtype: object
ser3[0]
'a'
ser1 + ser2
Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64
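
The NaN rows come from index alignment: labels present on only one side get NaN. A quick sketch, not in the original notes: the add method with fill_value treats the missing side as 0.

# add() aligns on the index like +, but fill_value=0 substitutes 0
# for labels missing on one side instead of producing NaN
ser1.add(ser2, fill_value=0)

Germany    4.0
Italy      5.0
Japan      8.0
USA        2.0
USSR       3.0
dtype: float64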

DataFrames

import numpy as np
import pandas as pd
from numpy.random import randn
# set seed
np.random.seed(101)
df = pd.DataFrame(randn(5,4),['A','B','C','D','E'],['W','X','Y','Z'])
df
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509
# DataFrame details
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   W       5 non-null      float64
 1   X       5 non-null      float64
 2   Y       5 non-null      float64
 3   Z       5 non-null      float64
dtypes: float64(4)
memory usage: 200.0+ bytes

Selection

# column selection
df['W']  # a Series
# df[df.columns[0]]   # by column position
# df[df.columns[:-1]] # by column position, excluding the last
# df.iloc[:,0]        # by column position
# df.W   # best avoided, since it can be confused with DataFrame methods
A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64
type(df['W'])
pandas.core.series.Series
type(df)
pandas.core.frame.DataFrame
# selecting multiple columns
df[['W','Z']]
          W         Z
A  2.706850  0.503826
B  0.651118  0.605965
C -2.018168 -0.589001
D  0.188695  0.955057
E  0.190794  0.683509
# adding a column
df['new'] = df['W'] + df['Y']
df
          W         X         Y         Z       new
A  2.706850  0.628133  0.907969  0.503826  3.614819
B  0.651118 -0.319318 -0.848077  0.605965 -0.196959
C -2.018168  0.740122  0.528813 -0.589001 -1.489355
D  0.188695 -0.758872 -0.933237  0.955057 -0.744542
E  0.190794  1.978757  2.605967  0.683509  2.796762
# removing columns
df.drop('new', axis=1, inplace=True) # axis=0 (rows) is the default; inplace=True overwrites the original df
df
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509
# removing rows
df.drop('E')
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001
D  0.188695 -0.758872 -0.933237  0.955057
df.shape # a tuple: element 0 is the number of rows, element 1 the number of columns
(5, 4)
# row selection by label
df.loc['A'] # location; rows are pandas Series too
W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64
# row selection by position
df.iloc[0]
W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64
# selecting a subset of rows and columns
df.loc['B','Y']
-0.8480769834036315
# selecting a subset of rows and columns
df.loc[['A','B'],['W','Y']]
          W         Y
A  2.706850  0.907969
B  0.651118 -0.848077
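
Slicing works with both accessors; a short sketch on the same df: loc slices by label and includes the endpoint, while iloc slices by position and excludes it.

df.loc['A':'C', 'W':'Y']  # label slice, 'C' and 'Y' included
df.iloc[0:3, 0:3]         # position slice, endpoints excluded; same subset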

Conditional selection

df
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509
df[df > 0]
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118       NaN       NaN  0.605965
C       NaN  0.740122  0.528813       NaN
D  0.188695       NaN       NaN  0.955057
E  0.190794  1.978757  2.605967  0.683509
# filter df on a single-column condition
df[df['W']>0]
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509
df[df['Z']<0]
          W         X         Y         Z
C -2.018168  0.740122  0.528813 -0.589001
df[df['W']>0][['Y','X']]
          Y         X
A  0.907969  0.628133
B -0.848077 -0.319318
D -0.933237 -0.758872
E  2.605967  1.978757
df
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509
# select the record with the maximum value in column W
df.loc[[df['W'].idxmax()]]
         W         X         Y         Z
A  2.70685  0.628133  0.907969  0.503826
# multiple conditions: and / or
# df[(df['W']>0) and (df['Y']>1)] # 'and' / 'or' only work on scalars, not on arrays or Series
df[(df['W']>0) & (df['Y']>1)] # & for and, | for or
          W         X         Y         Z
E  0.190794  1.978757  2.605967  0.683509
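
For completeness, a sketch of the same kind of filter with | (or), which keeps rows satisfying at least one of the conditions:

df[(df['W']>0) | (df['Y']>1)]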
# resetting the index
df.reset_index() # pass inplace=True to overwrite the original
  index         W         X         Y         Z
0     A  2.706850  0.628133  0.907969  0.503826
1     B  0.651118 -0.319318 -0.848077  0.605965
2     C -2.018168  0.740122  0.528813 -0.589001
3     D  0.188695 -0.758872 -0.933237  0.955057
4     E  0.190794  1.978757  2.605967  0.683509
# replacing the index
newind = 'CA NY WY OR CO'.split()
newind
['CA', 'NY', 'WY', 'OR', 'CO']
df['States'] = newind
df
          W         X         Y         Z States
A  2.706850  0.628133  0.907969  0.503826     CA
B  0.651118 -0.319318 -0.848077  0.605965     NY
C -2.018168  0.740122  0.528813 -0.589001     WY
D  0.188695 -0.758872 -0.933237  0.955057     OR
E  0.190794  1.978757  2.605967  0.683509     CO
df.set_index('States')
               W         X         Y         Z
States
CA      2.706850  0.628133  0.907969  0.503826
NY      0.651118 -0.319318 -0.848077  0.605965
WY     -2.018168  0.740122  0.528813 -0.589001
OR      0.188695 -0.758872 -0.933237  0.955057
CO      0.190794  1.978757  2.605967  0.683509

Multi-Index and Index Hierarchy

import numpy as np
import pandas as pd
from numpy.random import randn
# Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)
list(zip(outside,inside))
[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]
hier_index
MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )
df = pd.DataFrame(randn(6,2),hier_index,['A','B'])
df
             A         B
G1 1 -1.094109  0.224428
   2 -1.927079  0.359976
   3  0.607172  0.892118
G2 1  0.345103  0.023081
   2 -1.820372 -1.061236
   3  0.983283  1.096904
df.loc['G1']
          A         B
1 -1.094109  0.224428
2 -1.927079  0.359976
3  0.607172  0.892118
df.loc['G1'].loc[1]
A   -1.094109
B    0.224428
Name: 1, dtype: float64
df.index.names
FrozenList([None, None])
# set the names of the index levels
df.index.names = ['Groups', 'Num']
df
                    A         B
Groups Num
G1     1    -1.094109  0.224428
       2    -1.927079  0.359976
       3     0.607172  0.892118
G2     1     0.345103  0.023081
       2    -1.820372 -1.061236
       3     0.983283  1.096904
df.loc['G2'].loc[2].iloc[1]
-1.0612359393184747
df.loc['G2'].loc[2]['B']
df.loc['G2'].loc[2][['B']][0]
-1.0612359393184747
# cross section
df.xs('G1')
            A         B
Num
1   -1.094109  0.224428
2   -1.927079  0.359976
3    0.607172  0.892118
# cross section filtering
df.xs(1,level='Num')
               A         B
Groups
G1     -1.094109  0.224428
G2      0.345103  0.023081
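
A tuple can address both index levels in a single .loc call; a minimal sketch on the same df:

df.loc[('G1', 1)]       # the (G1, 1) row as a Series
df.loc[('G2', 2), 'B']  # a single scalar, same value as the chained .loc above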

Missing Data

import numpy as np
import pandas as pd
d = {'A':[1,2,np.nan],'B':[5,np.nan,np.nan],'C':[1,2,3]}
d
{'A': [1, 2, nan], 'B': [5, nan, nan], 'C': [1, 2, 3]}
df = pd.DataFrame(d)
df
     A    B  C
0  1.0  5.0  1
1  2.0  NaN  2
2  NaN  NaN  3
# dropping rows with missing values
df.dropna() # axis=0 (rows) is the default
     A    B  C
0  1.0  5.0  1
df.dropna(thresh=2) # keep only rows with at least 2 non-NaN values
     A    B  C
0  1.0  5.0  1
1  2.0  NaN  2
# replacing all missing values
df.fillna(value='FILL VALUE')
            A           B  C
0           1           5  1
1           2  FILL VALUE  2
2  FILL VALUE  FILL VALUE  3
# filling a column's missing values with its mean
df['A'].fillna(value=df['A'].mean())
0    1.0
1    2.0
2    1.5
Name: A, dtype: float64
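
The same idea extends to the whole frame: fillna accepts a Series of per-column values, so passing df.mean() fills each column with its own mean. A sketch on the same df:

df.fillna(df.mean())  # A's NaN becomes 1.5, B's NaN become 5.0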

Groupby

import pandas as pd
# Create dataframe
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
       'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
       'Sales':[200,120,340,124,243,350]}
data
{'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
 'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
 'Sales': [200, 120, 340, 124, 243, 350]}
df = pd.DataFrame(data)
df
  Company   Person  Sales
0    GOOG      Sam    200
1    GOOG  Charlie    120
2    MSFT      Amy    340
3    MSFT  Vanessa    124
4      FB     Carl    243
5      FB    Sarah    350
# groupby
df.groupby('Company').sum() # sum, count, mean, std, max ...
         Sales
Company
FB         593
GOOG       320
MSFT       464
df.groupby('Company').sum().loc['FB']
Sales    593
Name: FB, dtype: int64
df.groupby('Company').max()
          Person  Sales
Company
FB         Sarah    350
GOOG         Sam    200
MSFT     Vanessa    340
df.groupby('Company').describe()
        Sales
        count   mean         std    min     25%    50%     75%    max
Company
FB        2.0  296.5   75.660426  243.0  269.75  296.5  323.25  350.0
GOOG      2.0  160.0   56.568542  120.0  140.00  160.0  180.00  200.0
MSFT      2.0  232.0  152.735065  124.0  178.00  232.0  286.00  340.0
df.groupby('Company').describe().transpose()['FB']
Sales  count      2.000000
       mean     296.500000
       std       75.660426
       min      243.000000
       25%      269.750000
       50%      296.500000
       75%      323.250000
       max      350.000000
Name: FB, dtype: float64
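
Several aggregations can also be computed in one pass with agg, which accepts a list of function names; a minimal sketch on the same df:

df.groupby('Company')['Sales'].agg(['mean', 'std', 'max'])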

Merging, Joining and Concatenating

import pandas as pd
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])

df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4, 5, 6, 7]) 

df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8, 9, 10, 11])
df1
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3
df2
    A   B   C   D
4  A4  B4  C4  D4
5  A5  B5  C5  D5
6  A6  B6  C6  D6
7  A7  B7  C7  D7
df3
      A    B    C    D
8    A8   B8   C8   D8
9    A9   B9   C9   D9
10  A10  B10  C10  D10
11  A11  B11  C11  D11
# concatenation
pd.concat([df1,df2,df3]) # axis=0 (stack rows) is the default
      A    B    C    D
0    A0   B0   C0   D0
1    A1   B1   C1   D1
2    A2   B2   C2   D2
3    A3   B3   C3   D3
4    A4   B4   C4   D4
5    A5   B5   C5   D5
6    A6   B6   C6   D6
7    A7   B7   C7   D7
8    A8   B8   C8   D8
9    A9   B9   C9   D9
10  A10  B10  C10  D10
11  A11  B11  C11  D11
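
With axis=1 the frames are instead concatenated side by side and aligned on the index; since these three indexes do not overlap, the result is a 12x12 block-diagonal frame padded with NaN. A sketch, not from the original notes:

pd.concat([df1,df2,df3], axis=1)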
# merging
# inner (intersection), outer (union), right, left
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})
left
  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3
right
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3
pd.merge(left,right,how='inner',on='key') # how='inner' is the default
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3
# a more complicated case
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})
    
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                               'key2': ['K0', 'K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})
left
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
right
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3
pd.merge(left, right, on=['key1', 'key2'])
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
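
The default how='inner' keeps only the key pairs present in both frames; how='outer' would keep all of them, padding the missing side with NaN. A sketch on the same left/right:

pd.merge(left, right, how='outer', on=['key1', 'key2'])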
# join is like merge, but the keys you join on are the DataFrame indexes rather than columns
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                      index=['K0', 'K1', 'K2']) 

right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                    'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])
left
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
right
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
left.join(right) # joins on the index by default
     A   B    C    D
K0  A0  B0   C0   D0
K1  A1  B1  NaN  NaN
K2  A2  B2   C2   D2
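
Like merge, join accepts how; a sketch of the union of the two indexes:

left.join(right, how='outer')  # K0..K3, with NaN where an index is missing on one side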

Operations

import pandas as pd
df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})
df.head()
   col1  col2 col3
0     1   444  abc
1     2   555  def
2     3   666  ghi
3     4   444  xyz
df['col2']
0    444
1    555
2    666
3    444
Name: col2, dtype: int64
df[['col2']]
   col2
0   444
1   555
2   666
3   444
df['col2'].unique()
array([444, 555, 666])
len(df['col2'].unique())
3
df['col2'].nunique()
3
# frequency distribution
df['col2'].value_counts()
444    2
555    1
666    1
Name: col2, dtype: int64
df.groupby('col2').count() # counts within each of the other columns
      col1  col3
col2
444      2     2
555      1     1
666      1     1
# select data with conditional selection
df[(df['col1']>2) & (df['col2']==444)]
   col1  col2 col3
3     4   444  xyz
def times2(x):
    return x*2
df['col1'].sum()
10
# apply broadcasts the function to each element of the column
df['col1'].apply(times2)
0    2
1    4
2    6
3    8
Name: col1, dtype: int64
# applying a function to an entire column with a lambda (important!)
df['col1'].apply(lambda x: x*2)
0    2
1    4
2    6
3    8
Name: col1, dtype: int64
list(map(lambda x: x*2, df['col1']))
[2, 4, 6, 8]
df['col3'].apply(len)
0    3
1    3
2    3
3    3
Name: col3, dtype: int64
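
For simple arithmetic, a plain vectorized expression gives the same result as apply and is faster (a sketch):

df['col1'] * 2  # equivalent to df['col1'].apply(times2)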
# removing columns
df.drop('col1',axis=1)  # with inplace=True it overwrites the original
   col2 col3
0   444  abc
1   555  def
2   666  ghi
3   444  xyz
# list of column names
df.columns
Index(['col1', 'col2', 'col3'], dtype='object')
df.index
RangeIndex(start=0, stop=4, step=1)
# sorting and ordering
df.sort_values(by='col2')
   col1  col2 col3
0     1   444  abc
3     4   444  xyz
1     2   555  def
2     3   666  ghi
df.sort_values(by='col2').reset_index().drop('index',axis=1)
   col1  col2 col3
0     1   444  abc
1     4   444  xyz
2     2   555  def
3     3   666  ghi
# check for missing values (NaN)
df.isnull()
# pivot table
data = {'A':['foo','foo','foo','bar','bar','bar'],
        'B':['one','one','two','two','one','one'],
        'C':['x','y','x','y','x','y'],
        'D':[1,3,2,5,4,1]}

df = pd.DataFrame(data)
df
     A    B  C  D
0  foo  one  x  1
1  foo  one  y  3
2  foo  two  x  2
3  bar  two  y  5
4  bar  one  x  4
5  bar  one  y  1
# long to wide
df.pivot_table(values='D',index=['A','B'],columns='C')
C          x    y
A   B
bar one  4.0  1.0
    two  NaN  5.0
foo one  1.0  3.0
    two  2.0  NaN
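
When index/columns combinations repeat, pivot_table aggregates them; the default aggfunc is the mean, but it can be changed. A sketch on the same df:

df.pivot_table(values='D', index='A', columns='B', aggfunc='sum')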

Data Input and Output

!conda install -y xlrd
Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/user/miniconda3/envs/py3

  added / updated specs:
    - xlrd


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    xlrd-1.2.0                 |           py37_0         175 KB
    ------------------------------------------------------------
                                           Total:         175 KB

The following NEW packages will be INSTALLED:

  xlrd               pkgs/main/linux-64::xlrd-1.2.0-py37_0



Downloading and Extracting Packages
xlrd-1.2.0           | 175 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
# conda install xlrd

# conda install lxml
# conda install html5lib
# conda install BeautifulSoup4

# conda install sqlalchemy
import numpy as np
import pandas as pd
pwd
'/media/user/Public/Python/Course 001'

CSV and Excel

# import csv
pd.read_csv('example')

# import excel
pd.read_excel('Excel_Sample.xlsx',sheet_name='Sheet1')
# export csv
df.to_csv('example',index=False)

# export excel
df.to_excel('Excel_Sample.xlsx',sheet_name='Sheet1')

HTML

df = pd.read_html('http://www.fdic.gov/bank/individual/failed/banklist.html')
type(df)
list
df[0].head()
                          Bank Name           City  ST   CERT              Acquiring Institution       Closing Date
0              The First State Bank  Barboursville  WV  14361                     MVB Bank, Inc.      April 3, 2020
1                Ericson State Bank        Ericson  NE  18265         Farmers and Merchants Bank  February 14, 2020
2  City National Bank of New Jersey         Newark  NJ  21111                    Industrial Bank   November 1, 2019
3                     Resolute Bank         Maumee  OH  58317                 Buckeye State Bank   October 25, 2019
4             Louisa Community Bank         Louisa  KY  58112  Kentucky Farmers Bank Corporation   October 25, 2019

SQL

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
# SQLite in-memory engine
engine = create_engine('sqlite:///:memory:')
df = pd.read_csv('example')
df
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
df.to_sql('data', engine)
sql_df = pd.read_sql('data',con=engine)
sql_df
   index   a   b   c   d
0      0   0   1   2   3
1      1   4   5   6   7
2      2   8   9  10  11
3      3  12  13  14  15
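
read_sql also accepts a SQL query instead of a table name; a minimal sketch against the same engine, using the explicit read_sql_query variant:

pd.read_sql_query('SELECT a, b FROM data WHERE a > 4', con=engine)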

Other

df = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
       'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
       'Sales':[200,120,340,124,243,350]}
df = pd.DataFrame(df)
df
  Company   Person  Sales
0    GOOG      Sam    200
1    GOOG  Charlie    120
2    MSFT      Amy    340
3    MSFT  Vanessa    124
4      FB     Carl    243
5      FB    Sarah    350
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Company  6 non-null      object
 1   Person   6 non-null      object
 2   Sales    6 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 272.0+ bytes
# how many names start with 'C' (the following lines all give the same result)
sum(df['Person'].apply(lambda x: x[0]=='C'))
df['Person'][df['Person'].apply(lambda x: x[0]=='C')].count()
len(df[df['Person'].apply(lambda x: x[0]=='C')].index)
# select only the numeric columns
df.select_dtypes(include=np.number)
df.loc[:, df.dtypes == np.int64]  # by exact dtype (here Sales is the only int64 column)
