Pandas.ipynb - Colaboratory
import pandas
pandas.__version__
'1.5.3'
import pandas as pd
import numpy as np
import pandas as pd
0 0.25
1 0.50
2 0.75
3 1.00
dtype: float64
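The Series displayed above is shown without the cell that created it; a minimal reconstruction, assuming the default integer index:
data = pd.Series([0.25, 0.5, 0.75, 1.0])   # values 0.25-1.00 with a default RangeIndex
data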
data.values
data.index
data[1]
0.5
data[1:3]
1 0.50
2 0.75
dtype: float64
a 0.25
b 0.50
c 0.75
d 1.00
dtype: float64
data['b']
0.5
2 0.25
5 0.50
3 0.75
7 1.00
dtype: float64
data[5]
0.5
California 38332521
Texas 26448193
New York 19651127
Florida 19552860
Illinois 12882135
dtype: int64
population['California']
38332521
population['California':'Illinois']
California 38332521
Texas 26448193
New York 19651127
Florida 19552860
Illinois 12882135
dtype: int64
pd.Series([2, 4, 6])
0 2
1 4
2 6
dtype: int64
100 5
200 5
300 5
dtype: int64
2 a
1 b
3 c
dtype: object
3 c
2 a
dtype: object
California 423967
Texas 695662
New York 141297
Florida 170312
Illinois 149995
dtype: int64
population area
states.index
states.columns
states['area']
California 423967
Texas 695662
New York 141297
Florida 170312
Illinois 149995
Name: area, dtype: int64
pd.DataFrame(population, columns=['population'])
population
California 38332521
Texas 26448193
New York 19651127
Florida 19552860
Illinois 12882135
a b
0 0 0
1 1 2
2 2 4
a b c
0 1.0 2 NaN
1 NaN 3 4.0
pd.DataFrame({'population': population,
'area': area})
population area
pd.DataFrame(np.random.rand(3, 2),
columns=['foo', 'bar'],
index=['a', 'b', 'c'])
foo bar
a 0.100920 0.764732
b 0.511807 0.194514
c 0.736985 0.615123
array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])
pd.DataFrame(A)
A B
0 0 0.0
1 0 0.0
2 0 0.0
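The Index object used in the next cells is not shown being built; a sketch consistent with the size/shape/dtype output that follows:
ind = pd.Index([2, 3, 5, 7, 11])                  # an immutable pandas Index
print(ind.size, ind.shape, ind.ndim, ind.dtype)   # -> 5 (5,) 1 int64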
ind[1]
ind[::2]
5 (5,) 1 int64
<ipython-input-343-99513bddffa9>:1: FutureWarning: Index.__and__ operating as a set operation is deprecated, in the future this will
indA & indB
Int64Index([3, 5, 7], dtype='int64')
indA | indB
<ipython-input-344-2c4bfb638f37>:1: FutureWarning: Index.__or__ operating as a set operation is deprecated, in the future this will
indA | indB
Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
indA ^ indB
<ipython-input-345-3946b5999e74>:1: FutureWarning: Index.__xor__ operating as a set operation is deprecated, in the future this will
indA ^ indB
Int64Index([1, 2, 9, 11], dtype='int64')
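Because the operator forms above raise FutureWarnings, the equivalent set methods avoid the deprecation; a sketch, assuming the indices were built to match the outputs above:
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
indA.intersection(indB)           # replaces indA & indB
indA.union(indB)                  # replaces indA | indB
indA.symmetric_difference(indB)   # replaces indA ^ indB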
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0],
index=['a', 'b', 'c', 'd'])
data
a 0.25
b 0.50
c 0.75
d 1.00
dtype: float64
data['b']
0.5
'a' in data
True
data.keys()
list(data.items())
data['e'] = 1.25
data
a 0.25
b 0.50
c 0.75
d 1.00
e 1.25
dtype: float64
data['a':'c']
a 0.25
b 0.50
c 0.75
dtype: float64
data[0:2]
a 0.25
b 0.50
dtype: float64
b 0.50
c 0.75
dtype: float64
data[['a', 'e']]
a 0.25
e 1.25
dtype: float64
1 a
3 b
5 c
dtype: object
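The Series with an explicit, non-contiguous integer index shown above was presumably created with:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])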
data[1]
'a'
data[1:3]
3 b
5 c
dtype: object
data.loc[1]
'a'
data.loc[1:3]
1 a
3 b
dtype: object
data.iloc[1]
'b'
data.iloc[1:3]
3 b
5 c
dtype: object
area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data
area pop
data['area']
California 423967
Texas 695662
New York 141297
Florida 170312
Illinois 149995
Name: area, dtype: int64
data.area
California 423967
Texas 695662
New York 141297
Florida 170312
Illinois 149995
Name: area, dtype: int64
data.area is data['area']
True
data.pop is data['pop']
False
data.values
data.T
data.values[0]
data['area']
California 423967
Texas 695662
New York 141297
Florida 170312
Illinois 149995
Name: area, dtype: int64
data.iloc[:3, :2]
area pop
data.loc[:'Illinois', :'pop']
area pop
pop density
data.iloc[0, 2] = 90
data
data['Florida':'Illinois']
data[1:3]
Florida 170312 19552860 114.806121
import pandas as pd
import numpy as np
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser
0 6
1 3
2 7
3 4
dtype: int64
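The DataFrame shown next is not constructed in the visible cells; with the same RandomState it was presumably created as:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),
                  columns=['A', 'B', 'C', 'D'])
df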
A B C D
0 6 9 2 6
1 7 4 3 7
2 7 2 5 4
np.exp(ser)
0 403.428793
1 20.085537
2 1096.633158
3 54.598150
dtype: float64
np.sin(df * np.pi / 4)
A B C D
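The index-aligned division below relies on two Series that are not shown; a reconstruction consistent with the NaN pattern in the result:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')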
population / area
Alaska NaN
California 90.413926
New York NaN
Texas 38.018740
dtype: float64
area.index | population.index
<ipython-input-387-ff558a211efb>:1: FutureWarning: Index.__or__ operating as a set operation is deprecated, in the future this will
area.index | population.index
Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')
0 NaN
1 5.0
2 9.0
3 NaN
dtype: float64
A.add(B, fill_value=0)
0 2.0
1 5.0
2 9.0
3 5.0
dtype: float64
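The two Series being added were presumably defined as follows, which reproduces both the NaN-filled sum and the fill_value=0 result above:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B
A.add(B, fill_value=0)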
A B
0 1 11
1 5 1
B A C
0 4 0 9
1 5 8 0
2 9 2 6
A + B
A B C
fill = A.stack().mean()
A.add(B, fill_value=fill)
A B C
array([[3, 8, 2, 4],
[2, 6, 4, 8],
[6, 1, 3, 8]])
A - A[0]
array([[ 0, 0, 0, 0],
[-1, -2, 2, 4],
[ 3, -7, 1, 4]])
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]
Q R S T
0 0 0 0 0
1 -1 -2 2 4
2 3 -7 1 4
df.subtract(df['R'], axis=0)
Q R S T
0 -5 0 -6 -4
1 -4 0 -2 2
2 5 0 2 7
halfrow = df.iloc[0, ::2]
halfrow
Q 3
S 2
Name: 0, dtype: int64
df - halfrow
Q R S T
import numpy as np
import pandas as pd
dtype = object
62.5 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
dtype = int
983 µs ± 183 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
dtype('float64')
1 + np.nan
nan
0 * np.nan
nan
0 1.0
1 NaN
2 2.0
3 NaN
dtype: float64
x = pd.Series(range(2), dtype=int)
x
0 0
1 1
dtype: int64
x[0] = None
x
0 NaN
1 1.0
dtype: float64
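If the upcast to float is undesirable, newer pandas offers a nullable integer dtype that keeps missing values as pd.NA without changing the dtype; a brief sketch:
x = pd.Series(range(2), dtype='Int64')   # capital-I nullable integer dtype
x[0] = None                              # stored as <NA>, dtype stays Int64
x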
data.isnull()
0 False
1 True
2 False
3 True
dtype: bool
data[data.notnull()]
0 1
2 hello
dtype: object
data.dropna()
0 1
2 hello
dtype: object
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
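The DataFrame used in the dropna examples matches this construction:
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])
df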
df.dropna()
0 1 2
1 2.0 3.0 5
df.dropna(axis='columns')
0 2
1 5
2 6
df[3] = np.nan
df
0 1 2 3
df.dropna(axis='columns', how='all')
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
df.dropna(axis='rows', thresh=3)
0 1 2 3
a 1.0
b NaN
c 2.0
d NaN
e 3.0
dtype: float64
data.fillna(0)
a 1.0
b 0.0
c 2.0
d 0.0
e 3.0
dtype: float64
data.fillna(method='ffill')
a 1.0
b 1.0
c 2.0
d 2.0
e 3.0
dtype: float64
data.fillna(method='bfill')
a 1.0
b 2.0
c 2.0
d 3.0
e 3.0
dtype: float64
df
0 1 2 3
df.fillna(method='ffill', axis=1)
0 1 2 3
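The multiply indexed population Series sliced below is not constructed in the visible cells; a reconstruction consistent with the later output:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
pop = pd.Series(populations, index=index)   # plain tuple index for now
pop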
pop[('California', 2010):('Texas', 2000)]
index = pd.MultiIndex.from_tuples(index)
index
MultiIndex([('California', 2000),
('California', 2010),
( 'New York', 2000),
( 'New York', 2010),
( 'Texas', 2000),
( 'Texas', 2010)],
)
pop = pop.reindex(index)
pop
pop[:, 2010]
California 37253956
New York 19378102
Texas 25145561
dtype: int64
pop_df = pop.unstack()
pop_df
2000 2010
pop_df.stack()
total under18
df = pd.DataFrame(np.random.rand(4, 2),
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns=['data1', 'data2'])
df
data1 data2
a 1 0.107411 0.623970
2 0.934661 0.889680
b 1 0.753637 0.395928
2 0.460310 0.289032
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
state year
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
names=['subject', 'type'])
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data
year visit
health_data['Guido']
type HR Temp
year visit
2 39.0 38.0
2 35.0 37.4
pop
state year
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
pop['California', 2000]
33871648
pop['California']
year
2000 33871648
2010 37253956
dtype: int64
pop.loc['California':'New York']
state year
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
dtype: int64
pop[:, 2000]
state
California 33871648
New York 18976457
Texas 20851820
dtype: int64
state year
California 2000 33871648
2010 37253956
Texas 2010 25145561
dtype: int64
pop[['California', 'Texas']]
state year
California 2000 33871648
2010 37253956
Texas 2000 20851820
2010 25145561
dtype: int64
health_data
year visit
health_data['Guido', 'HR']
year visit
2013 1 41.0
2 39.0
2014 1 45.0
2 35.0
Name: (Guido, HR), dtype: float64
health_data.iloc[:2, :2]
subject Bob
type HR Temp
year visit
2 37.0 35.6
year visit
2013 1 19.0
2 37.0
2014 1 18.0
2 54.0
Name: (Bob, HR), dtype: float64
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]
type HR HR HR
year visit
char int
a 1 0.053467
2 0.844292
c 1 0.516159
2 0.688637
b 1 0.539836
2 0.151731
dtype: float64
try:
data['a':'b']
except KeyError as e:
print(type(e))
print(e)
<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'
data = data.sort_index()
data
char int
a 1 0.053467
2 0.844292
b 1 0.539836
2 0.151731
c 1 0.516159
2 0.688637
dtype: float64
data['a':'b']
char int
a 1 0.053467
2 0.844292
b 1 0.539836
2 0.151731
dtype: float64
pop.unstack(level=0)
year
pop.unstack(level=1)
state
pop.unstack().stack()
state year
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
pop_flat = pop.reset_index(name='population')
pop_flat
pop_flat.set_index(['state', 'year'])
population
state year
2010 37253956
2010 19378102
2010 25145561
health_data
year visit
data_mean = health_data.mean(level='year')
data_mean
year
data_mean.mean(axis=1, level='type')
year
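The level= keyword of mean() is deprecated in recent pandas; the same aggregations can be written with groupby, e.g. (a sketch, not part of the original cells):
data_mean = health_data.groupby(level='year').mean()    # row-wise mean, grouped by year
data_mean.T.groupby(level='type').mean().T               # column-wise mean, grouped by measurement type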
import pandas as pd
import numpy as np
A B C
0 A0 B0 C0
1 A1 B1 C1
2 A2 B2 C2
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z])
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
x = [[1, 2],
[3, 4]]
np.concatenate([x, x], axis=1)
array([[1, 2, 1, 2],
[3, 4, 3, 4]])
1 A
2 B
3 C
4 D
5 E
6 F
dtype: object
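The concatenated object Series above matches this construction:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])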
A B
1 A1 B1
2 A2 B2
A B
3 A3 B3
4 A4 B4
A B
1 A1 B1
2 A2 B2
3 A3 B3
4 A4 B4
A B
0 A0 B0
1 A1 B1
C D
0 C0 D0
1 C1 D1
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
A B
0 A0 B0
1 A1 B1
A B
0 A2 B2
1 A3 B3
A B
0 A0 B0
1 A1 B1
0 A2 B2
1 A3 B3
try:
pd.concat([x, y], verify_integrity=True)
except ValueError as e:
print("ValueError:", e)
A B
0 A0 B0
1 A1 B1
A B
0 A2 B2
1 A3 B3
A B
0 A0 B0
1 A1 B1
2 A2 B2
3 A3 B3
A B
0 A0 B0
1 A1 B1
A B
0 A2 B2
1 A3 B3
A B
x 0 A0 B0
1 A1 B1
y 0 A2 B2
1 A3 B3
A B C
1 A1 B1 C1
2 A2 B2 C2
B C D
3 B3 C3 D3
4 B4 C4 D4
A B C D
1 A1 B1 C1 NaN
2 A2 B2 C2 NaN
3 NaN B3 C3 D3
4 NaN B4 C4 D4
print(df5); print(df6);
print(pd.concat([df5, df6], join='inner'))
A B C
1 A1 B1 C1
2 A2 B2 C2
B C D
3 B3 C3 D3
4 B4 C4 D4
B C
1 B1 C1
2 B2 C2
3 B3 C3
4 B4 C4
print(df5); print(df6);
print(pd.concat([df5, df6]))
A B C
1 A1 B1 C1
2 A2 B2 C2
B C D
3 B3 C3 D3
4 B4 C4 D4
A B C D
1 A1 B1 C1 NaN
2 A2 B2 C2 NaN
3 NaN B3 C3 D3
4 NaN B4 C4 D4
A B
1 A1 B1
2 A2 B2
A B
3 A3 B3
4 A4 B4
A B
1 A1 B1
2 A2 B2
3 A3 B3
4 A4 B4
<ipython-input-498-f80ce5c761de>:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future
print(df1); print(df2); print(df1.append(df2))
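Since DataFrame.append is deprecated, the equivalent non-deprecated call is pd.concat; a sketch:
pd.concat([df1, df2])   # same result as df1.append(df2), without the FutureWarning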
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
employee hire_date
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014
3 Sue HR 2014
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
group skills
0 Accounting math
1 Accounting spreadsheets
2 Engineering coding
3 Engineering linux
4 HR spreadsheets
5 HR organization
employee group skills
0 Bob Accounting math
1 Bob Accounting spreadsheets
2 Jake Engineering coding
3 Jake Engineering linux
4 Lisa Engineering coding
5 Lisa Engineering linux
6 Sue HR spreadsheets
7 Sue HR organization
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
employee hire_date
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014
employee group hire_date
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
name salary
0 Bob 70000
1 Jake 80000
2 Lisa 120000
3 Sue 90000
employee group name salary
0 Bob Accounting Bob 70000
1 Jake Engineering Jake 80000
2 Lisa Engineering Lisa 120000
3 Sue HR Sue 90000
3 Sue HR 90000
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
print(df1a); print(df2a)
group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
hire_date
employee
Lisa 2004
Bob 2008
Jake 2012
Sue 2014
print(df1a); print(df2a);
print(pd.merge(df1a, df2a, left_index=True, right_index=True))
group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
hire_date
employee
Lisa 2004
Bob 2008
Jake 2012
Sue 2014
group hire_date
employee
Bob Accounting 2008
Jake Engineering 2012
Lisa Engineering 2004
Sue HR 2014
group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
hire_date
employee
Lisa 2004
Bob 2008
Jake 2012
Sue 2014
group hire_date
employee
Bob Accounting 2008
Jake Engineering 2012
Lisa Engineering 2004
Sue HR 2014
print(df1a); print(df3);
print(pd.merge(df1a, df3, left_index=True, right_on='name'))
group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
name salary
0 Bob 70000
1 Jake 80000
2 Lisa 120000
3 Sue 90000
group name salary
0 Accounting Bob 70000
1 Engineering Jake 80000
2 Engineering Lisa 120000
3 HR Sue 90000
name food
0 Peter fish
1 Paul beans
2 Mary bread
name drink
0 Mary wine
1 Joseph beer
name food drink
0 Mary bread wine
name food
0 Peter fish
1 Paul beans
2 Mary bread
name drink
0 Mary wine
1 Joseph beer
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine
3 Joseph NaN beer
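The outer join above is not shown with its producing cell; presumably:
print(df6); print(df7); print(pd.merge(df6, df7, how='outer'))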
print(df6); print(df7); print(pd.merge(df6, df7, how='left'))
name food
0 Peter fish
1 Paul beans
2 Mary bread
name drink
0 Mary wine
1 Joseph beer
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine
name rank
0 Bob 1
1 Jake 2
2 Lisa 3
3 Sue 4
name rank
0 Bob 3
1 Jake 1
2 Lisa 4
3 Sue 2
name rank_x rank_y
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2
print(df8); print(df9);
print(pd.merge(df8, df9, on="name", suffixes=["_L", "_R"]))
name rank
0 Bob 1
1 Jake 2
2 Lisa 3
3 Sue 4
name rank
0 Bob 3
1 Jake 1
2 Lisa 4
3 Sue 2
name rank_L rank_R
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2
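The planets table used below comes from seaborn's bundled example datasets; a likely loading cell:
import seaborn as sns
planets = sns.load_dataset('planets')   # 1035 rows x 6 columns
planets.shape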
(1035, 6)
planets.head()
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser
0 0.374540
1 0.950714
2 0.731994
3 0.598658
4 0.156019
dtype: float64
ser.sum()
2.811925491708157
ser.mean()
0.5623850983416314
df = pd.DataFrame({'A': rng.rand(5),
'B': rng.rand(5)})
df
A B
0 0.155995 0.020584
1 0.058084 0.969910
2 0.866176 0.832443
3 0.601115 0.212339
4 0.708073 0.181825
df.mean()
A 0.477888
B 0.443420
dtype: float64
df.mean(axis='columns')
0 0.088290
1 0.513997
2 0.849309
3 0.406727
4 0.444949
dtype: float64
planets.dropna().describe()
key data
0 A 0
1 B 1
2 C 2
3 A 3
4 B 4
5 C 5
df.groupby('key')
df.groupby('key').sum()
data
key
A 3
B 5
C 7
planets.groupby('method')
planets.groupby('method')['orbital_period']
planets.groupby('method')['orbital_period'].median()
method
Astrometry 631.180000
Eclipse Timing Variations 4343.500000
Imaging 27500.000000
Microlensing 3300.000000
Orbital Brightness Modulation 0.342887
Pulsar Timing 66.541900
Pulsation Timing Variations 1170.000000
Radial Velocity 360.200000
Transit 5.714932
Transit Timing Variations 57.011000
Name: orbital_period, dtype: float64
Astrometry shape=(2, 6)
Eclipse Timing Variations shape=(9, 6)
Imaging shape=(38, 6)
Microlensing shape=(23, 6)
Orbital Brightness Modulation shape=(3, 6)
Pulsar Timing shape=(5, 6)
Pulsation Timing Variations shape=(1, 6)
Radial Velocity shape=(553, 6)
Transit shape=(397, 6)
Transit Timing Variations shape=(4, 6)
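The per-group shapes above were presumably printed by iterating over the GroupBy object, e.g.:
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method, group.shape))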
planets.groupby('method')['year'].describe().unstack()
method
count Astrometry 2.0
Eclipse Timing Variations 9.0
Imaging 38.0
Microlensing 23.0
Orbital Brightness Modulation 3.0
...
max Pulsar Timing 2011.0
Pulsation Timing Variations 2007.0
Radial Velocity 2014.0
Transit 2014.0
Transit Timing Variations 2014.0
Length: 80, dtype: float64
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data1': range(6),
'data2': rng.randint(0, 10, 6)},
columns = ['key', 'data1', 'data2'])
df
  key data1 data2
0   A     0     5
1   B     1     0
2   C     2     3
3   A     3     3
4   B     4     7
5   C     5     9
df.groupby('key').aggregate(['min', np.median, max])
    data1           data2
    min median max  min median max
key
A     0    1.5   3    3    4.0   5
B     1    2.5   4    0    3.5   7
C     2    3.5   5    3    6.0   9
df.groupby('key').aggregate({'data1': 'min',
'data2': 'max'})
data1 data2
key
A 0 5
B 1 7
C 2 9
def filter_func(x):
return x['data2'].std() > 4
print(df); print(df.groupby('key').std());
print(df.groupby('key').filter(filter_func))
df.groupby('key').transform(lambda x: x - x.mean())
def norm_by_data2(x):
x['data1'] /= x['data2'].sum()
return x
print(df); print(df.groupby('key').apply(norm_by_data2))
print(df); print(df.groupby('key').apply(norm_by_data2))
L = [0, 1, 0, 1, 2, 0]
print(df); print(df.groupby(L).sum())
print(df); print(df.groupby(df['key']).sum())
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
print(df2); print(df2.groupby(mapping).sum())
data1 data2
key
A 0 5
B 1 0
C 2 3
A 3 3
B 4 7
C 5 9
data1 data2
key
consonant 12 19
vowel 3 8
print(df2); print(df2.groupby(str.lower).mean())
data1 data2
key
A 0 5
B 1 0
C 2 3
A 3 3
B 4 7
C 5 9
data1 data2
key
a 1.5 4.0
b 2.5 3.5
c 3.5 6.0
df2.groupby([str.lower, mapping]).mean()
data1 data2
key key
decade = 10 * (planets['year'] // 10)
decade = decade.astype(str) + 's'
decade.name = 'decade'
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)
method
import numpy as np
import pandas as pd
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()
survived pclass sex age sibsp parch fare embarked class who adult
titanic.groupby('sex')[['survived']].mean()
survived
sex
female 0.742038
male 0.188908
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()
sex
sex
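The same survival table can be produced more concisely with pivot_table, which the following outputs appear to use; a minimal example:
titanic.pivot_table('survived', index='sex', columns='class')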
sex age
sex age
titanic.pivot_table(index='sex', columns='class',
aggfunc={'survived':sum, 'fare':'mean'})
fare survived
sex
sex
import numpy as np
x = np.array([2, 3, 5, 7, 11, 13])
x * 2
import pandas as pd
names = pd.Series(data)
names
0 peter
1 Paul
2 MARY
3 gUIDO
dtype: object
names.str.capitalize()
0 Peter
1 Paul
2 Mary
3 Guido
dtype: object
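The monte Series used in the following string-method cells can be reconstructed from its outputs:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                   'Eric Idle', 'Terry Jones', 'Michael Palin'])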
monte.str.lower()
0 graham chapman
1 john cleese
2 terry gilliam
3 eric idle
4 terry jones
5 michael palin
dtype: object
monte.str.len()
0 14
1 11
2 13
3 9
4 11
5 13
dtype: int64
monte.str.startswith('T')
0 False
1 False
2 True
3 False
4 True
5 False
dtype: bool
monte.str.split()
0 [Graham, Chapman]
1 [John, Cleese]
2 [Terry, Gilliam]
3 [Eric, Idle]
4 [Terry, Jones]
5 [Michael, Palin]
dtype: object
monte.str.extract('([A-Za-z]+)')
0 Graham
1 John
2 Terry
3 Eric
4 Terry
5 Michael
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')
0 [Graham Chapman]
1 []
2 [Terry Gilliam]
3 []
4 [Terry Jones]
5 [Michael Palin]
dtype: object
monte.str[0:3]
0 Gra
1 Joh
2 Ter
3 Eri
4 Ter
5 Mic
dtype: object
monte.str.split().str.get(-1)
0 Chapman
1 Cleese
2 Gilliam
3 Idle
4 Jones
5 Palin
dtype: object
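The full_monte frame used next is not constructed in the visible cells; a reconstruction consistent with the indicator columns below:
full_monte = pd.DataFrame({'name': monte,
                           'info': ['B|C|D', 'B|D', 'A|C',
                                    'B|D', 'B|C', 'B|C|D']})
full_monte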
name info
full_monte['info'].str.get_dummies('|')
A B C D
0 0 1 1 1
1 0 1 0 1
2 1 0 1 0
3 0 1 0 1
4 0 1 1 0
5 0 1 1 1
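The native Python datetime shown below was presumably built like this:
from datetime import datetime
date = datetime(year=2015, month=7, day=4)
date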
datetime.datetime(2015, 7, 4, 0, 0)
datetime.datetime(2015, 7, 4, 0, 0)
date.strftime('%A')
'Saturday'
import numpy as np
date = np.array('2015-07-04', dtype=np.datetime64)
date
array('2015-07-04', dtype='datetime64[D]')
date + np.arange(12)
np.datetime64('2015-07-04')
numpy.datetime64('2015-07-04')
np.datetime64('2015-07-04 12:00')
numpy.datetime64('2015-07-04T12:00')
numpy.datetime64('2015-07-04T12:59:59.500000000')
import pandas as pd
date = pd.to_datetime("4th of July, 2015")
date
Timestamp('2015-07-04 00:00:00')
date.strftime('%A')
'Saturday'
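The date-indexed Series that follows matches this construction:
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04',
                          '2015-07-04', '2015-08-04'])
data = pd.Series([0, 1, 2, 3], index=index)
data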
2014-07-04 0
2014-08-04 1
2015-07-04 2
2015-08-04 3
dtype: int64
data['2014-07-04':'2015-07-04']
2014-07-04 0
2014-08-04 1
2015-07-04 2
dtype: int64
data['2015']
2015-07-04 2
2015-08-04 3
dtype: int64
dates.to_period('D')
dates - dates[0]
TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)
pd.date_range('2015-07-03', '2015-07-10')
pd.date_range('2015-07-03', periods=8)
dtype='datetime64[ns]', freq='D')
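date_range also accepts a frequency code; for example (not in the original cells), an hourly range:
pd.date_range('2015-07-03', periods=8, freq='H')   # eight hourly timestamps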
import pandas as pd
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols))
for i in range(4))
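The timing shown below was presumably measured with %timeit on the plain expression and its pd.eval equivalent:
%timeit df1 + df2 + df3 + df4
%timeit pd.eval('df1 + df2 + df3 + df4')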
110 ms ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
True
True
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval('df1 < df2 <= df3 != df4')
np.allclose(result1, result2)
True
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1, result2)
True
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1, result3)
True
True
A B C
True
True
df.head()
A B C
A B C D
          A         B         C         D
0  0.375506  0.406939  0.069938 -0.449425
1  0.069087  0.235615  0.154374 -1.078728
2  0.677945  0.433839  0.652324  0.374209
3  0.264038  0.808055  0.347197 -1.566886
4  0.589161  0.252418  0.557789  0.603708
column_mean = df.mean(1)
result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)
True
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)
True
True
Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)
True