Pandas - Ipynb - Colaboratory

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 36

11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

Aim: To practice pandas commands

import pandas
pandas.__version__

'1.5.3'

import pandas as pd

import numpy as np
import pandas as pd

data = pd.Series([0.25, 0.5, 0.75, 1.0])


data

0 0.25
1 0.50
2 0.75
3 1.00
dtype: float64

data.values

array([0.25, 0.5 , 0.75, 1. ])

data.index

RangeIndex(start=0, stop=4, step=1)

data[1]

0.5

data[1:3]

1 0.50
2 0.75
dtype: float64

data = pd.Series([0.25, 0.5, 0.75, 1.0],


index=['a', 'b', 'c', 'd'])
data

a 0.25
b 0.50
c 0.75
d 1.00
dtype: float64

data['b']

0.5

data = pd.Series([0.25, 0.5, 0.75, 1.0],


index=[2, 5, 3, 7])
data

2 0.25
5 0.50
3 0.75
7 1.00
dtype: float64

data[5]

0.5

population_dict = {'California': 38332521,


'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}
population = pd.Series(population_dict)
population

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 1/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

California 38332521
Texas 26448193
New York 19651127
Florida 19552860
Illinois 12882135
dtype: int64

population['California']

38332521

population['California':'Illinois']

California 38332521
Texas 26448193
New York 19651127
Florida 19552860
Illinois 12882135
dtype: int64

pd.Series([2, 4, 6])

0 2
1 4
2 6
dtype: int64

pd.Series(5, index=[100, 200, 300])

100 5
200 5
300 5
dtype: int64

pd.Series({2:'a', 1:'b', 3:'c'})

2 a
1 b
3 c
dtype: object

pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3 c
2 a
dtype: object

area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,


'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

California 423967
Texas 695662
New York 141297
Florida 170312
Illinois 149995
dtype: int64

states = pd.DataFrame({'population': population,


'area': area})
states

population area

California 38332521 423967

Texas 26448193 695662

New York 19651127 141297

Florida 19552860 170312

Illinois 12882135 149995

states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 2/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
states.columns

Index(['population', 'area'], dtype='object')

states['area']

California 423967
Texas 695662
New York 141297
Florida 170312
Illinois 149995
Name: area, dtype: int64

pd.DataFrame(population, columns=['population'])

population

California 38332521

Texas 26448193

New York 19651127

Florida 19552860

Illinois 12882135

data = [{'a': i, 'b': 2 * i}


for i in range(3)]
pd.DataFrame(data)

a b

0 0 0

1 1 2

2 2 4

pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

a b c

0 1.0 2 NaN

1 NaN 3 4.0

pd.DataFrame({'population': population,
'area': area})

population area

California 38332521 423967

Texas 26448193 695662

New York 19651127 141297

Florida 19552860 170312

Illinois 12882135 149995

pd.DataFrame(np.random.rand(3, 2),
columns=['foo', 'bar'],
index=['a', 'b', 'c'])

foo bar

a 0.100920 0.764732

b 0.511807 0.194514

c 0.736985 0.615123

A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])


A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

pd.DataFrame(A)

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 3/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

A B

0 0 0.0

1 0 0.0

2 0 0.0

ind = pd.Index([2, 3, 5, 7, 11])


ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

ind[1]

ind[::2]

Int64Index([2, 5, 11], dtype='int64')

print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64

indA = pd.Index([1, 3, 5, 7, 9])


indB = pd.Index([2, 3, 5, 7, 11])

indA & indB

<ipython-input-343-99513bddffa9>:1: FutureWarning: Index.__and__ operating as a set operation is deprecated, in the future this will
indA & indB
Int64Index([3, 5, 7], dtype='int64')

indA | indB

<ipython-input-344-2c4bfb638f37>:1: FutureWarning: Index.__or__ operating as a set operation is deprecated, in the future this will
indA | indB
Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

indA ^ indB

<ipython-input-345-3946b5999e74>:1: FutureWarning: Index.__xor__ operating as a set operation is deprecated, in the future this will
indA ^ indB
Int64Index([1, 2, 9, 11], dtype='int64')

import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0],
index=['a', 'b', 'c', 'd'])
data

a 0.25
b 0.50
c 0.75
d 1.00
dtype: float64

data['b']

0.5

'a' in data

True

data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 4/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
data['e'] = 1.25
data

a 0.25
b 0.50
c 0.75
d 1.00
e 1.25
dtype: float64

data['a':'c']

a 0.25
b 0.50
c 0.75
dtype: float64

data[0:2]

a 0.25
b 0.50
dtype: float64

data[(data > 0.3) & (data < 0.8)]

b 0.50
c 0.75
dtype: float64

data[['a', 'e']]

a 0.25
e 1.25
dtype: float64

data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])


data

1 a
3 b
5 c
dtype: object

data[1]

'a'

data[1:3]

3 b
5 c
dtype: object

data.loc[1]

'a'

data.loc[1:3]

1 a
3 b
dtype: object

data.iloc[1]

'b'

data.iloc[1:3]

3 b
5 c
dtype: object

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 5/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

area pop

California 423967 38332521

Texas 695662 26448193

New York 141297 19651127

Florida 170312 19552860

Illinois 149995 12882135

data['area']

California 423967
Texas 695662
New York 141297
Florida 170312
Illinois 149995
Name: area, dtype: int64

data.area

California 423967
Texas 695662
New York 141297
Florida 170312
Illinois 149995
Name: area, dtype: int64

data.area is data['area']

True

data.pop is data['pop']

False

data['density'] = data['pop'] / data['area']


data

area pop density

California 423967 38332521 90.413926

Texas 695662 26448193 38.018740

New York 141297 19651127 139.076746

Florida 170312 19552860 114.806121

Illinois 149995 12882135 85.883763

data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],


[6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
[1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
[1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
[1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

data.T

California Texas New York Florida Illinois

area 4.239670e+05 6.956620e+05 1.412970e+05 1.703120e+05 1.499950e+05

pop 3.833252e+07 2.644819e+07 1.965113e+07 1.955286e+07 1.288214e+07

density 9.041393e+01 3.801874e+01 1.390767e+02 1.148061e+02 8.588376e+01

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 6/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

data['area']

California 423967
Texas 695662
New York 141297
Florida 170312
Illinois 149995
Name: area, dtype: int64

data.iloc[:3, :2]

area pop

California 423967 38332521

Texas 695662 26448193

New York 141297 19651127

data.loc[:'Illinois', :'pop']

area pop

California 423967 38332521

Texas 695662 26448193

New York 141297 19651127

Florida 170312 19552860

Illinois 149995 12882135

data.loc[data.density > 100, ['pop', 'density']]

pop density

New York 19651127 139.076746

Florida 19552860 114.806121

data.iloc[0, 2] = 90
data

area pop density

California 423967 38332521 90.000000

Texas 695662 26448193 38.018740

New York 141297 19651127 139.076746

Florida 170312 19552860 114.806121

Illinois 149995 12882135 85.883763

data['Florida':'Illinois']

area pop density

Florida 170312 19552860 114.806121

Illinois 149995 12882135 85.883763

data[1:3]

area pop density

Texas 695662 26448193 38.018740

New York 141297 19651127 139.076746

data[data.density > 100]

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 7/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

area pop density

New York 141297 19651127 139.076746

Florida 170312 19552860 114.806121

import pandas as pd
import numpy as np

rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser

0 6
1 3
2 7
3 4
dtype: int64

df = pd.DataFrame(rng.randint(0, 10, (3, 4)),


columns=['A', 'B', 'C', 'D'])
df

A B C D

0 6 9 2 6

1 7 4 3 7

2 7 2 5 4

np.exp(ser)

0 403.428793
1 20.085537
2 1096.633158
3 54.598150
dtype: float64

np.sin(df * np.pi / 4)

A B C D

0 -1.000000 7.071068e-01 1.000000 -1.000000e+00

1 -0.707107 1.224647e-16 0.707107 -7.071068e-01

2 -0.707107 1.000000e+00 -0.707107 1.224647e-16

area = pd.Series({'Alaska': 1723337, 'Texas': 695662,


'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127}, name='population')

population / area

Alaska NaN
California 90.413926
New York NaN
Texas 38.018740
dtype: float64

area.index | population.index

<ipython-input-387-ff558a211efb>:1: FutureWarning: Index.__or__ operating as a set operation is deprecated, in the future this will
area.index | population.index
Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

A = pd.Series([2, 4, 6], index=[0, 1, 2])


B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B

0 NaN
1 5.0
2 9.0
3 NaN
dtype: float64

A.add(B, fill_value=0)

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 8/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

0 2.0
1 5.0
2 9.0
3 5.0
dtype: float64

A = pd.DataFrame(rng.randint(0, 20, (2, 2)),


columns=list('AB'))
A

A B

0 1 11

1 5 1

B = pd.DataFrame(rng.randint(0, 10, (3, 3)),


columns=list('BAC'))
B

B A C

0 4 0 9

1 5 8 0

2 9 2 6

A + B

A B C

0 1.0 15.0 NaN

1 13.0 6.0 NaN

2 NaN NaN NaN

fill = A.stack().mean()
A.add(B, fill_value=fill)

A B C

0 1.0 15.0 13.5

1 13.0 6.0 4.5

2 6.5 13.5 10.5

A = rng.randint(10, size=(3, 4))


A

array([[3, 8, 2, 4],
[2, 6, 4, 8],
[6, 1, 3, 8]])

A - A[0]

array([[ 0, 0, 0, 0],
[-1, -2, 2, 4],
[ 3, -7, 1, 4]])

df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]

Q R S T

0 0 0 0 0

1 -1 -2 2 4

2 3 -7 1 4

df.subtract(df['R'], axis=0)

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 9/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

Q R S T

0 -5 0 -6 -4

1 -4 0 -2 2

2 5 0 2 7

halfrow = df.iloc[0, ::2]
halfrow

Q 3
S 2
Name: 0, dtype: int64

df - halfrow

Q R S T

0 0.0 NaN 0.0 NaN

1 -1.0 NaN 2.0 NaN

2 3.0 NaN 1.0 NaN

import numpy as np
import pandas as pd

vals1 = np.array([1, None, 3, 4])


vals1

array([1, None, 3, 4], dtype=object)

for dtype in ['object', 'int']:


print("dtype =", dtype)
%timeit np.arange(1E6, dtype=dtype).sum()
print()

dtype = object
62.5 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype = int
983 µs ± 183 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

vals2 = np.array([1, np.nan, 3, 4])


vals2.dtype

dtype('float64')

1 + np.nan

nan

0 * np.nan

nan

vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

pd.Series([1, np.nan, 2, None])

0 1.0
1 NaN
2 2.0
3 NaN
dtype: float64

x = pd.Series(range(2), dtype=int)
x

0 0
1 1
dtype: int64

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 10/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
x[0] = None
x

0 NaN
1 1.0
dtype: float64

data = pd.Series([1, np.nan, 'hello', None])

data.isnull()

0 False
1 True
2 False
3 True
dtype: bool

data[data.notnull()]

0 1
2 hello
dtype: object

data.dropna()

0 1
2 hello
dtype: object

df = pd.DataFrame([[1, np.nan, 2],


[2, 3, 5],
[np.nan, 4, 6]])
df

0 1 2

0 1.0 NaN 2

1 2.0 3.0 5

2 NaN 4.0 6

df.dropna()

0 1 2

1 2.0 3.0 5

df.dropna(axis='columns')

0 2

1 5

2 6

df[3] = np.nan
df

0 1 2 3

0 1.0 NaN 2 NaN

1 2.0 3.0 5 NaN

2 NaN 4.0 6 NaN

df.dropna(axis='columns', how='all')

0 1 2

0 1.0 NaN 2

1 2.0 3.0 5

2 NaN 4.0 6

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 11/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

df.dropna(axis='rows', thresh=3)

0 1 2 3

1 2.0 3.0 5 NaN

data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))


data

a 1.0
b NaN
c 2.0
d NaN
e 3.0
dtype: float64

data.fillna(0)

a 1.0
b 0.0
c 2.0
d 0.0
e 3.0
dtype: float64

data.fillna(method='ffill')

a 1.0
b 1.0
c 2.0
d 2.0
e 3.0
dtype: float64

data.fillna(method='bfill')

a 1.0
b 2.0
c 2.0
d 3.0
e 3.0
dtype: float64

df

0 1 2 3

0 1.0 NaN 2 NaN

1 2.0 3.0 5 NaN

2 NaN 4.0 6 NaN

df.fillna(method='ffill', axis=1)

0 1 2 3

0 1.0 1.0 2.0 2.0

1 2.0 3.0 5.0 5.0

2 NaN 4.0 6.0 6.0

index = [('California', 2000), ('California', 2010),


('New York', 2000), ('New York', 2010),
('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
18976457, 19378102,
20851820, 25145561]
pop = pd.Series(populations, index=index)
pop

(California, 2000) 33871648


(California, 2010) 37253956
(New York, 2000) 18976457
(New York, 2010) 19378102
(Texas, 2000) 20851820
(Texas, 2010) 25145561
dtype: int64

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 12/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
pop[('California', 2010):('Texas', 2000)]

(California, 2010) 37253956


(New York, 2000) 18976457
(New York, 2010) 19378102
(Texas, 2000) 20851820
dtype: int64

pop[[i for i in pop.index if i[1] == 2010]]

(California, 2010) 37253956


(New York, 2010) 19378102
(Texas, 2010) 25145561
dtype: int64

index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
('California', 2010),
( 'New York', 2000),
( 'New York', 2010),
( 'Texas', 2000),
( 'Texas', 2010)],
)

pop = pop.reindex(index)
pop

California 2000 33871648


2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64

pop[:, 2010]

California 37253956
New York 19378102
Texas 25145561
dtype: int64

pop_df = pop.unstack()
pop_df

2000 2010

California 33871648 37253956

New York 18976457 19378102

Texas 20851820 25145561

pop_df.stack()

California 2000 33871648


2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64

pop_df = pd.DataFrame({'total': pop,


'under18': [9267089, 9284094,
4687374, 4318033,
5906301, 6879014]})
pop_df

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 13/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

total under18

California 2000 33871648 9267089

2010 37253956 9284094

New York 2000 18976457 4687374

2010 19378102 4318033

Texas 2000 20851820 5906301

2010 25145561 6879014

f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

2000 2010

California 0.273594 0.249211

New York 0.247010 0.222831

Texas 0.283251 0.273568

df = pd.DataFrame(np.random.rand(4, 2),
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns=['data1', 'data2'])
df

data1 data2

a 1 0.107411 0.623970

2 0.934661 0.889680

b 1 0.753637 0.395928

2 0.460310 0.289032

data = {('California', 2000): 33871648,


('California', 2010): 37253956,
('Texas', 2000): 20851820,
('Texas', 2010): 25145561,
('New York', 2000): 18976457,
('New York', 2010): 19378102}
pd.Series(data)

California 2000 33871648


2010 37253956
Texas 2000 20851820
2010 25145561
New York 2000 18976457
2010 19378102
dtype: int64

pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)

pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)

pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)

pop.index.names = ['state', 'year']


pop

state year
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 14/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
names=['subject', 'type'])
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

subject Bob Guido Sue

type HR Temp HR Temp HR Temp

year visit

2013 1 19.0 37.7 41.0 37.1 34.0 36.5

2 37.0 35.6 39.0 38.0 38.0 37.0

2014 1 18.0 36.8 45.0 38.0 50.0 36.7

2 54.0 35.5 35.0 37.4 31.0 38.1

health_data['Guido']

type HR Temp

year visit

2013 1 41.0 37.1

2 39.0 38.0

2014 1 45.0 38.0

2 35.0 37.4

pop

state year
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64

pop['California', 2000]

33871648

pop['California']

year
2000 33871648
2010 37253956
dtype: int64

pop.loc['California':'New York']

state year
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
dtype: int64

pop[:, 2000]

state
California 33871648
New York 18976457
Texas 20851820
dtype: int64

pop[pop > 22000000]

state year
California 2000 33871648
2010 37253956

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 15/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
Texas 2010 25145561
dtype: int64

pop[['California', 'Texas']]

state year
California 2000 33871648
2010 37253956
Texas 2000 20851820
2010 25145561
dtype: int64

health_data

subject Bob Guido Sue

type HR Temp HR Temp HR Temp

year visit

2013 1 19.0 37.7 41.0 37.1 34.0 36.5

2 37.0 35.6 39.0 38.0 38.0 37.0

2014 1 18.0 36.8 45.0 38.0 50.0 36.7

2 54.0 35.5 35.0 37.4 31.0 38.1

health_data['Guido', 'HR']

year visit
2013 1 41.0
2 39.0
2014 1 45.0
2 35.0
Name: (Guido, HR), dtype: float64

health_data.iloc[:2, :2]

subject Bob

type HR Temp

year visit

2013 1 19.0 37.7

2 37.0 35.6

health_data.loc[:, ('Bob', 'HR')]

year visit
2013 1 19.0
2 37.0
2014 1 18.0
2 54.0
Name: (Bob, HR), dtype: float64

idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]

subject Bob Guido Sue

type HR HR HR

year visit

2013 1 19.0 41.0 34.0

2014 1 18.0 45.0 50.0

index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])


data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data

char int
a 1 0.053467
2 0.844292
c 1 0.516159
2 0.688637
b 1 0.539836

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 16/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
2 0.151731
dtype: float64

# Attempt a partial-key slice on the multiply indexed Series and report the
# exception type and message if pandas rejects it with a KeyError (or a
# subclass of KeyError).
try:
    data['a':'b']
except KeyError as e:
    print(type(e))
    print(e)

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'

data = data.sort_index()
data

char int
a 1 0.053467
2 0.844292
b 1 0.539836
2 0.151731
c 1 0.516159
2 0.688637
dtype: float64

data['a':'b']

char int
a 1 0.053467
2 0.844292
b 1 0.539836
2 0.151731
dtype: float64

pop.unstack(level=0)

state California New York Texas

year

2000 33871648 18976457 20851820

2010 37253956 19378102 25145561

pop.unstack(level=1)

year 2000 2010

state

California 33871648 37253956

New York 18976457 19378102

Texas 20851820 25145561

pop.unstack().stack()

state year
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64

pop_flat = pop.reset_index(name='population')
pop_flat

state year population

0 California 2000 33871648

1 California 2010 37253956

2 New York 2000 18976457

3 New York 2010 19378102

4 Texas 2000 20851820

5 Texas 2010 25145561

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 17/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

pop_flat.set_index(['state', 'year'])

population

state year

California 2000 33871648

2010 37253956

New York 2000 18976457

2010 19378102

Texas 2000 20851820

2010 25145561

health_data

subject Bob Guido Sue

type HR Temp HR Temp HR Temp

year visit

2013 1 19.0 37.7 41.0 37.1 34.0 36.5

2 37.0 35.6 39.0 38.0 38.0 37.0

2014 1 18.0 36.8 45.0 38.0 50.0 36.7

2 54.0 35.5 35.0 37.4 31.0 38.1

data_mean = health_data.mean(level='year')
data_mean

<ipython-input-469-af3ae0440116>:1: FutureWarning: Using the level keyword in DataFra


data_mean = health_data.mean(level='year')
subject Bob Guido Sue

type HR Temp HR Temp HR Temp

year

2013 28.0 36.65 40.0 37.55 36.0 36.75

2014 36.0 36.15 40.0 37.70 40.5 37.40

data_mean.mean(axis=1, level='type')

<ipython-input-470-e9f5c76486ab>:1: FutureWarning: Using the level keyword in DataFra


data_mean.mean(axis=1, level='type')
type HR Temp

year

2013 34.666667 36.983333

2014 38.833333 37.083333

import pandas as pd
import numpy as np

Double-click (or enter) to edit

def make_df(cols, ind):
    """Build a small demonstration DataFrame with self-describing cells.

    Every cell holds the string form of its column label followed by the
    string form of its row label, e.g. column ``'A'`` and row ``0`` give
    ``'A0'``.

    Parameters
    ----------
    cols : iterable
        Column labels. Iterating a string yields one column per character,
        so ``'ABC'`` produces columns ``A``, ``B`` and ``C``.
    ind : iterable
        Row labels; also used as the index of the result. It is iterated
        once per column, so it must be re-iterable (list, range, ...).

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ind`` with one column per entry of ``cols``.
    """
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, index=ind)
make_df('ABC', range(3))

A B C

0 A0 B0 C0

1 A1 B1 C1

2 A2 B2 C2

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 18/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

x = [[1, 2],
[3, 4]]
np.concatenate([x, x], axis=1)

array([[1, 2, 1, 2],
[3, 4, 3, 4]])

ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])


ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])

1 A
2 B
3 C
4 D
5 E
6 F
dtype: object

df1 = make_df('AB', [1, 2])


df2 = make_df('AB', [3, 4])
print(df1); print(df2); print(pd.concat([df1, df2]))

A B
1 A1 B1
2 A2 B2
A B
3 A3 B3
4 A4 B4
A B
1 A1 B1
2 A2 B2
3 A3 B3
4 A4 B4

df3 = make_df('AB', [0, 1])


df4 = make_df('CD', [0, 1])
print(df3); print(df4); print(pd.concat([df3, df4], axis=1))

A B
0 A0 B0
1 A1 B1
C D
0 C0 D0
1 C1 D1
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1

x = make_df('AB', [0, 1])


y = make_df('AB', [2, 3])
y.index = x.index
print(x); print(y); print(pd.concat([x, y]))

A B
0 A0 B0
1 A1 B1
A B
0 A2 B2
1 A3 B3
A B
0 A0 B0
1 A1 B1
0 A2 B2
1 A3 B3

# verify_integrity=True asks pd.concat to raise instead of silently producing
# a result with duplicate index labels; catch and display that error.
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)

ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')

print(x); print(y); print(pd.concat([x, y], ignore_index=True))

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 19/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

A B
0 A0 B0
1 A1 B1
A B
0 A2 B2
1 A3 B3
A B
0 A0 B0
1 A1 B1
2 A2 B2
3 A3 B3

print(x); print(y); print(pd.concat([x, y], keys=['x', 'y']))

A B
0 A0 B0
1 A1 B1
A B
0 A2 B2
1 A3 B3
A B
x 0 A0 B0
1 A1 B1
y 0 A2 B2
1 A3 B3

df5 = make_df('ABC', [1, 2])


df6 = make_df('BCD', [3, 4])
print(df5); print(df6); print(pd.concat([df5, df6]))

A B C
1 A1 B1 C1
2 A2 B2 C2
B C D
3 B3 C3 D3
4 B4 C4 D4
A B C D
1 A1 B1 C1 NaN
2 A2 B2 C2 NaN
3 NaN B3 C3 D3
4 NaN B4 C4 D4

print(df5); print(df6);
print(pd.concat([df5, df6], join='inner'))

A B C
1 A1 B1 C1
2 A2 B2 C2
B C D
3 B3 C3 D3
4 B4 C4 D4
B C
1 B1 C1
2 B2 C2
3 B3 C3
4 B4 C4

print(df5); print(df6);
print(pd.concat([df5, df6]))

A B C
1 A1 B1 C1
2 A2 B2 C2
B C D
3 B3 C3 D3
4 B4 C4 D4
A B C D
1 A1 B1 C1 NaN
2 A2 B2 C2 NaN
3 NaN B3 C3 D3
4 NaN B4 C4 D4

print(df1); print(df2); print(df1.append(df2))

A B
1 A1 B1
2 A2 B2
A B
3 A3 B3
4 A4 B4
A B
1 A1 B1
2 A2 B2
3 A3 B3

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 20/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
4 A4 B4
<ipython-input-498-f80ce5c761de>:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future
print(df1); print(df2); print(df1.append(df2))

df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],


'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
print(df1); print(df2)

employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
employee hire_date
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014

df3 = pd.merge(df1, df2)


df3

employee group hire_date

0 Bob Accounting 2008

1 Jake Engineering 2012

2 Lisa Engineering 2004

3 Sue HR 2014

df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],


'supervisor': ['Carly', 'Guido', 'Steve']})
print(df3); print(df4); print(pd.merge(df3, df4))

employee group hire_date


0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
group supervisor
0 Accounting Carly
1 Engineering Guido
2 HR Steve
employee group hire_date supervisor
0 Bob Accounting 2008 Carly
1 Jake Engineering 2012 Guido
2 Lisa Engineering 2004 Guido
3 Sue HR 2014 Steve

df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',


'Engineering', 'Engineering', 'HR', 'HR'],'skills': ['math', 'spreadsheets', 'coding', 'linux',
'spreadsheets', 'organization']})
print(df1); print(df5); print(pd.merge(df1, df5))

employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
group skills
0 Accounting math
1 Accounting spreadsheets
2 Engineering coding
3 Engineering linux
4 HR spreadsheets
5 HR organization
employee group skills
0 Bob Accounting math
1 Bob Accounting spreadsheets
2 Jake Engineering coding
3 Jake Engineering linux
4 Lisa Engineering coding
5 Lisa Engineering linux
6 Sue HR spreadsheets
7 Sue HR organization

print(df1); print(df2); print(pd.merge(df1, df2, on='employee'))

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 21/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
employee hire_date
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014
employee group hire_date
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014

df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],


'salary': [70000, 80000, 120000, 90000]})
print(df1); print(df3);
print(pd.merge(df1, df3, left_on="employee", right_on="name"))

employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
name salary
0 Bob 70000
1 Jake 80000
2 Lisa 120000
3 Sue 90000
employee group name salary
0 Bob Accounting Bob 70000
1 Jake Engineering Jake 80000
2 Lisa Engineering Lisa 120000
3 Sue HR Sue 90000

pd.merge(df1, df3, left_on="employee", right_on="name").drop('name', axis=1)

employee group salary

0 Bob Accounting 70000

1 Jake Engineering 80000

2 Lisa Engineering 120000

3 Sue HR 90000

df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
print(df1a); print(df2a)

group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
hire_date
employee
Lisa 2004
Bob 2008
Jake 2012
Sue 2014

print(df1a); print(df2a);
print(pd.merge(df1a, df2a, left_index=True, right_index=True))

group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
hire_date
employee
Lisa 2004
Bob 2008
Jake 2012
Sue 2014
group hire_date
employee
Bob Accounting 2008
Jake Engineering 2012

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 22/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
Lisa Engineering 2004
Sue HR 2014

print(df1a); print(df2a); print(df1a.join(df2a))

group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
hire_date
employee
Lisa 2004
Bob 2008
Jake 2012
Sue 2014
group hire_date
employee
Bob Accounting 2008
Jake Engineering 2012
Lisa Engineering 2004
Sue HR 2014

print(df1a); print(df3);
print(pd.merge(df1a, df3, left_index=True, right_on='name'))

group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
name salary
0 Bob 70000
1 Jake 80000
2 Lisa 120000
3 Sue 90000
group name salary
0 Accounting Bob 70000
1 Engineering Jake 80000
2 Engineering Lisa 120000
3 HR Sue 90000

df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],


'food': ['fish', 'beans', 'bread']},
columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
'drink': ['wine', 'beer']},
columns=['name', 'drink'])
print(df6); print(df7); print(pd.merge(df6, df7))

name food
0 Peter fish
1 Paul beans
2 Mary bread
name drink
0 Mary wine
1 Joseph beer
name food drink
0 Mary bread wine

pd.merge(df6, df7, how='inner')

name food drink

0 Mary bread wine

print(df6); print(df7); print(pd.merge(df6, df7, how='outer'))

name food
0 Peter fish
1 Paul beans
2 Mary bread
name drink
0 Mary wine
1 Joseph beer
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine
3 Joseph NaN beer

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 23/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
print(df6); print(df7); print(pd.merge(df6, df7, how='left'))

name food
0 Peter fish
1 Paul beans
2 Mary bread
name drink
0 Mary wine
1 Joseph beer
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine

df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],


'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [3, 1, 4, 2]})
print(df8); print(df9); print(pd.merge(df8, df9, on="name"))

name rank
0 Bob 1
1 Jake 2
2 Lisa 3
3 Sue 4
name rank
0 Bob 3
1 Jake 1
2 Lisa 4
3 Sue 2
name rank_x rank_y
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2

print(df8); print(df9);
print(pd.merge(df8, df9, on="name", suffixes=["_L", "_R"]))

name rank
0 Bob 1
1 Jake 2
2 Lisa 3
3 Sue 4
name rank
0 Bob 3
1 Jake 1
2 Lisa 4
3 Sue 2
name rank_L rank_R
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2

import seaborn as sns


planets = sns.load_dataset('planets')
planets.shape

(1035, 6)

planets.head()

method number orbital_period mass distance year

0 Radial Velocity 1 269.300 7.10 77.40 2006

1 Radial Velocity 1 874.774 2.21 56.95 2008

2 Radial Velocity 1 763.000 2.60 19.84 2011

3 Radial Velocity 1 326.030 19.40 110.62 2007

4 Radial Velocity 1 516.220 10.50 119.47 2009

rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser

0 0.374540
1 0.950714
2 0.731994
3 0.598658

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 24/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
4 0.156019
dtype: float64

ser.sum()

2.811925491708157

ser.mean()

0.5623850983416314

df = pd.DataFrame({'A': rng.rand(5),
'B': rng.rand(5)})
df

A B

0 0.155995 0.020584

1 0.058084 0.969910

2 0.866176 0.832443

3 0.601115 0.212339

4 0.708073 0.181825

df.mean()

A 0.477888
B 0.443420
dtype: float64

df.mean(axis='columns')

0 0.088290
1 0.513997
2 0.849309
3 0.406727
4 0.444949
dtype: float64

planets.dropna().describe()

number orbital_period mass distance year

count 498.00000 498.000000 498.000000 498.000000 498.000000

mean 1.73494 835.778671 2.509320 52.068213 2007.377510

std 1.17572 1469.128259 3.636274 46.596041 4.167284

min 1.00000 1.328300 0.003600 1.350000 1989.000000

25% 1.00000 38.272250 0.212500 24.497500 2005.000000

50% 1.00000 357.000000 1.245000 39.940000 2009.000000

75% 2.00000 999.600000 2.867500 59.332500 2011.000000

max 6.00000 17337.500000 25.000000 354.000000 2014.000000

df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],


'data': range(6)}, columns=['key', 'data'])
df

key data

0 A 0

1 B 1

2 C 2

3 A 3

4 B 4

5 C 5

df.groupby('key')

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 25/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7b56d3feb220>

df.groupby('key').sum()

data

key

A 3

B 5

C 7

planets.groupby('method')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7b56d4030b20>

planets.groupby('method')['orbital_period']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7b56d4030ee0>

planets.groupby('method')['orbital_period'].median()

method
Astrometry 631.180000
Eclipse Timing Variations 4343.500000
Imaging 27500.000000
Microlensing 3300.000000
Orbital Brightness Modulation 0.342887
Pulsar Timing 66.541900
Pulsation Timing Variations 1170.000000
Radial Velocity 360.200000
Transit 5.714932
Transit Timing Variations 57.011000
Name: orbital_period, dtype: float64

# Iterating a GroupBy yields (group key, sub-DataFrame) pairs; print each
# method name left-aligned in a 30-character field alongside its group's shape.
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method, group.shape))

Astrometry shape=(2, 6)
Eclipse Timing Variations shape=(9, 6)
Imaging shape=(38, 6)
Microlensing shape=(23, 6)
Orbital Brightness Modulation shape=(3, 6)
Pulsar Timing shape=(5, 6)
Pulsation Timing Variations shape=(1, 6)
Radial Velocity shape=(553, 6)
Transit shape=(397, 6)
Transit Timing Variations shape=(4, 6)

planets.groupby('method')['year'].describe().unstack()

method
count Astrometry 2.0
Eclipse Timing Variations 9.0
Imaging 38.0
Microlensing 23.0
Orbital Brightness Modulation 3.0
...
max Pulsar Timing 2011.0
Pulsation Timing Variations 2007.0
Radial Velocity 2014.0
Transit 2014.0
Transit Timing Variations 2014.0
Length: 80, dtype: float64

rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data1': range(6),
'data2': rng.randint(0, 10, 6)},
columns = ['key', 'data1', 'data2'])
df

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 26/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

key data1 data2

0 A 0 5

1 B 1 0

2 C 2 3

3 A 3 3

4 B 4 7

5 C 5 9

df.groupby('key').aggregate(['min', np.median, max])

data1 data2

min median max min median max

key

A 0 1.5 3 3 4.0 5

B 1 2.5 4 0 3.5 7

C 2 3.5 5 3 6.0 9

df.groupby('key').aggregate({'data1': 'min',
'data2': 'max'})

data1 data2

key

A 0 5

B 1 7

C 2 9

def filter_func(x):
    """Return whether the group's 'data2' column has a standard deviation above 4.

    Used as the predicate for ``df.groupby('key').filter(filter_func)``: ``x`` is
    the sub-DataFrame of one group, and groups for which the result is False are
    dropped from the filtered output.
    """
    return x['data2'].std() > 4
print(df); print(df.groupby('key').std());
print(df.groupby('key').filter(filter_func))

key data1 data2


0 A 0 5
1 B 1 0
2 C 2 3
3 A 3 3
4 B 4 7
5 C 5 9
data1 data2
key
A 2.12132 1.414214
B 2.12132 4.949747
C 2.12132 4.242641
key data1 data2
1 B 1 0
2 C 2 3
4 B 4 7
5 C 5 9

df.groupby('key').transform(lambda x: x - x.mean())

def norm_by_data2(x):
    """Return the group with 'data1' divided by the sum of the group's 'data2'.

    ``x`` is the sub-DataFrame of one group, as passed by
    ``df.groupby('key').apply(norm_by_data2)``. The result has the same index and
    columns as ``x``; only 'data1' is rescaled.
    """
    # Work on a copy: mutating the object handed in by groupby.apply is not
    # supported by pandas and can leak changes back into the caller's frame.
    x = x.copy()
    x['data1'] = x['data1'] / x['data2'].sum()
    return x
print(df); print(df.groupby('key').apply(norm_by_data2))

key data1 data2


0 A 0 5
1 B 1 0
2 C 2 3
3 A 3 3
4 B 4 7
5 C 5 9
key data1 data2
0 A 0.000000 5
1 B 0.142857 0
2 C 0.166667 3
3 A 0.375000 3
4 B 0.571429 7
5 C 0.416667 9
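<ipython-input-548-05fb1586265c>:4: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.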
To preserve the previous behavior, use

>>> .groupby(..., group_keys=False)

To adopt the future behavior and silence this warning, use

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 27/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
>>> .groupby(..., group_keys=True)
print(df); print(df.groupby('key').apply(norm_by_data2))

L = [0, 1, 0, 1, 2, 0]
print(df); print(df.groupby(L).sum())

key data1 data2


0 A 0 5
1 B 1 0
2 C 2 3
3 A 3 3
4 B 4 7
5 C 5 9
data1 data2
0 7 17
1 4 3
2 4 7
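<ipython-input-549-b7449fcee4bf>:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.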
print(df); print(df.groupby(L).sum())

print(df); print(df.groupby(df['key']).sum())

key data1 data2


0 A 0 5
1 B 1 0
2 C 2 3
3 A 3 3
4 B 4 7
5 C 5 9
data1 data2
key
A 3 8
B 5 7
C 7 12

df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
print(df2); print(df2.groupby(mapping).sum())

data1 data2
key
A 0 5
B 1 0
C 2 3
A 3 3
B 4 7
C 5 9
data1 data2
key
consonant 12 19
vowel 3 8

print(df2); print(df2.groupby(str.lower).mean())

data1 data2
key
A 0 5
B 1 0
C 2 3
A 3 3
B 4 7
C 5 9
data1 data2
key
a 1.5 4.0
b 2.5 3.5
c 3.5 6.0

df2.groupby([str.lower, mapping]).mean()

data1 data2

key key

a vowel 1.5 4.0

b consonant 2.5 3.5

c consonant 3.5 6.0

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 28/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
decade = 10 * (planets['year'] // 10)
decade = decade.astype(str) + 's'
decade.name = 'decade'
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)

decade 1980s 1990s 2000s 2010s

method

Astrometry 0.0 0.0 0.0 2.0

Eclipse Timing Variations 0.0 0.0 5.0 10.0

Imaging 0.0 0.0 29.0 21.0

Microlensing 0.0 0.0 12.0 15.0

Orbital Brightness Modulation 0.0 0.0 0.0 5.0

Pulsar Timing 0.0 9.0 1.0 1.0

Pulsation Timing Variations 0.0 0.0 1.0 0.0

Radial Velocity 1.0 52.0 475.0 424.0

Transit 0.0 0.0 64.0 712.0

Transit Timing Variations 0.0 0.0 0.0 9.0

import numpy as np
import pandas as pd
import seaborn as sns
titanic = sns.load_dataset('titanic')

titanic.head()

survived pclass sex age sibsp parch fare embarked class who adult

0 0 3 male 22.0 1 0 7.2500 S Third man

1 1 1 female 38.0 1 0 71.2833 C First woman

2 1 3 female 26.0 0 0 7.9250 S Third woman

3 1 1 female 35.0 1 0 53.1000 S First woman

4 0 3 male 35.0 0 0 8.0500 S Third man

titanic.groupby('sex')[['survived']].mean()

survived

sex

female 0.742038

male 0.188908

titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()

class First Second Third

sex

female 0.968085 0.921053 0.500000

male 0.368852 0.157407 0.135447

titanic.pivot_table('survived', index='sex', columns='class')

class First Second Third

sex

female 0.968085 0.921053 0.500000

male 0.368852 0.157407 0.135447

age = pd.cut(titanic['age'], [0, 18, 80])


titanic.pivot_table('survived', ['sex', age], 'class')

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 29/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

class First Second Third

sex age

female (0, 18] 0.909091 1.000000 0.511628

(18, 80] 0.972973 0.900000 0.423729

male (0, 18] 0.800000 0.600000 0.215686

(18, 80] 0.375000 0.071429 0.133663


fare = pd.qcut(titanic['fare'], 2)
titanic.pivot_table('survived', ['sex', age], [fare, 'class'])

fare (-0.001, 14.454] (14.454, 512.329]

class First Second Third First Second Third

sex age

female (0, 18] NaN 1.000000 0.714286 0.909091 1.000000 0.318182

(18, 80] NaN 0.880000 0.444444 0.972973 0.914286 0.391304

male (0, 18] NaN 0.000000 0.260870 0.800000 0.818182 0.178571

(18, 80] 0.0 0.098039 0.125000 0.391304 0.030303 0.192308

titanic.pivot_table(index='sex', columns='class',
aggfunc={'survived':sum, 'fare':'mean'})

fare survived

class First Second Third First Second Third

sex

female 106.125798 21.970121 16.118810 91 70 72

male 67.226127 19.741782 12.661633 45 17 47

titanic.pivot_table('survived', index='sex', columns='class', margins=True)

class First Second Third All

sex

female 0.968085 0.921053 0.500000 0.742038

male 0.368852 0.157407 0.135447 0.188908

All 0.629630 0.472826 0.242363 0.383838

import numpy as np
x = np.array([2, 3, 5, 7, 11, 13])
x * 2

array([ 4, 6, 10, 14, 22, 26])

data = ['peter', 'Paul', 'MARY', 'gUIDO']


[s.capitalize() for s in data]

['Peter', 'Paul', 'Mary', 'Guido']

import pandas as pd
names = pd.Series(data)
names

0 peter
1 Paul
2 MARY
3 gUIDO
dtype: object

names.str.capitalize()

0 Peter
1 Paul
2 Mary

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 30/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
3 Guido
dtype: object

monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',


'Eric Idle', 'Terry Jones', 'Michael Palin'])

monte.str.lower()

0 graham chapman
1 john cleese
2 terry gilliam
3 eric idle
4 terry jones
5 michael palin
dtype: object

monte.str.len()

0 14
1 11
2 13
3 9
4 11
5 13
dtype: int64

monte.str.startswith('T')

0 False
1 False
2 True
3 False
4 True
5 False
dtype: bool

monte.str.split()

0 [Graham, Chapman]
1 [John, Cleese]
2 [Terry, Gilliam]
3 [Eric, Idle]
4 [Terry, Jones]
5 [Michael, Palin]
dtype: object

monte.str.extract('([A-Za-z]+)')

0 Graham

1 John

2 Terry

3 Eric

4 Terry

5 Michael

monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0 [Graham Chapman]
1 []
2 [Terry Gilliam]
3 []
4 [Terry Jones]
5 [Michael Palin]
dtype: object

monte.str[0:3]

0 Gra
1 Joh
2 Ter
3 Eri
4 Ter
5 Mic
dtype: object

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 31/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
monte.str.split().str.get(-1)

0 Chapman
1 Cleese
2 Gilliam
3 Idle
4 Jones
5 Palin
dtype: object

full_monte = pd.DataFrame({'name': monte,


'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C',
'B|C|D']})
full_monte

name info

0 Graham Chapman B|C|D

1 John Cleese B|D

2 Terry Gilliam A|C

3 Eric Idle B|D

4 Terry Jones B|C

5 Michael Palin B|C|D

full_monte['info'].str.get_dummies('|')

A B C D

0 0 1 1 1

1 0 1 0 1

2 1 0 1 0

3 0 1 0 1

4 0 1 1 0

5 0 1 1 1

spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',


'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']

from datetime import datetime


datetime(year=2015, month=7, day=4)

datetime.datetime(2015, 7, 4, 0, 0)

from dateutil import parser


date = parser.parse("4th of July, 2015")
date

datetime.datetime(2015, 7, 4, 0, 0)

date.strftime('%A')

'Saturday'

import numpy as np
date = np.array('2015-07-04', dtype=np.datetime64)
date

array('2015-07-04', dtype='datetime64[D]')

date + np.arange(12)

array(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',


'2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
'2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
dtype='datetime64[D]')

np.datetime64('2015-07-04')

numpy.datetime64('2015-07-04')

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 32/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
np.datetime64('2015-07-04 12:00')

numpy.datetime64('2015-07-04T12:00')

np.datetime64('2015-07-04 12:59:59.50', 'ns')

numpy.datetime64('2015-07-04T12:59:59.500000000')

import pandas as pd
date = pd.to_datetime("4th of July, 2015")
date

Timestamp('2015-07-04 00:00:00')

date.strftime('%A')

'Saturday'

date + pd.to_timedelta(np.arange(12), 'D')

DatetimeIndex(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',


'2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
'2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
dtype='datetime64[ns]', freq=None)

index = pd.DatetimeIndex(['2014-07-04', '2014-08-04',


'2015-07-04', '2015-08-04'])
data = pd.Series([0, 1, 2, 3], index=index)
data

2014-07-04 0
2014-08-04 1
2015-07-04 2
2015-08-04 3
dtype: int64

data['2014-07-04':'2015-07-04']

2014-07-04 0
2014-08-04 1
2015-07-04 2
dtype: int64

data['2015']

2015-07-04 2
2015-08-04 3
dtype: int64

dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',


'2015-Jul-6', '07-07-2015', '20150708'])
dates

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',


'2015-07-08'],
dtype='datetime64[ns]', freq=None)

dates.to_period('D')

PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',


'2015-07-08'],
dtype='period[D]')

dates - dates[0]

TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)

pd.date_range('2015-07-03', '2015-07-10')

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',


'2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
dtype='datetime64[ns]', freq='D')

pd.date_range('2015-07-03', periods=8)

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',


'2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 33/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory
dtype='datetime64[ns]', freq='D')

pd.date_range('2015-07-03', periods=8, freq='H')

DatetimeIndex(['2015-07-03 00:00:00', '2015-07-03 01:00:00',


'2015-07-03 02:00:00', '2015-07-03 03:00:00',
'2015-07-03 04:00:00', '2015-07-03 05:00:00',
'2015-07-03 06:00:00', '2015-07-03 07:00:00'],
dtype='datetime64[ns]', freq='H')

pd.period_range('2015-07', periods=8, freq='M')

PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12',


'2016-01', '2016-02'],
dtype='period[M]')

pd.timedelta_range(0, periods=10, freq='H')

TimedeltaIndex(['0 days 00:00:00', '0 days 01:00:00', '0 days 02:00:00',


'0 days 03:00:00', '0 days 04:00:00', '0 days 05:00:00',
'0 days 06:00:00', '0 days 07:00:00', '0 days 08:00:00',
'0 days 09:00:00'],
dtype='timedelta64[ns]', freq='H')

pd.timedelta_range(0, periods=9, freq="2H30T")

TimedeltaIndex(['0 days 00:00:00', '0 days 02:30:00', '0 days 05:00:00',


'0 days 07:30:00', '0 days 10:00:00', '0 days 12:30:00',
'0 days 15:00:00', '0 days 17:30:00', '0 days 20:00:00'],
dtype='timedelta64[ns]', freq='150T')

from pandas.tseries.offsets import BDay


pd.date_range('2015-07-01', periods=5, freq=BDay())

DatetimeIndex(['2015-07-01', '2015-07-02', '2015-07-03', '2015-07-06',


'2015-07-07'],
dtype='datetime64[ns]', freq='B')

import pandas as pd
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols))
for i in range(4))

%timeit df1 + df2 + df3 + df4

110 ms ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

%timeit pd.eval('df1 + df2 + df3 + df4')

56.8 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

np.allclose(df1 + df2 + df3 + df4,


pd.eval('df1 + df2 + df3 + df4'))

True

df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3)))


for i in range(5))

result1 = -df1 * df2 / (df3 + df4) - df5


result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
np.allclose(result1, result2)

True

result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval('df1 < df2 <= df3 != df4')
np.allclose(result1, result2)

True

result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1, result2)

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 34/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

True

result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1, result3)

True

result1 = df2.T[0] + df3.iloc[1]


result2 = pd.eval('df2.T[0] + df3.iloc[1]')
np.allclose(result1, result2)

True

df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])


df.head()

A B C

0 0.375506 0.406939 0.069938

1 0.069087 0.235615 0.154374

2 0.677945 0.433839 0.652324

3 0.264038 0.808055 0.347197

4 0.589161 0.252418 0.557789

result1 = (df['A'] + df['B']) / (df['C'] - 1)


result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)

True

result3 = df.eval('(A + B) / (C - 1)')


np.allclose(result1, result3)

True

df.head()

A B C

0 0.375506 0.406939 0.069938

1 0.069087 0.235615 0.154374

2 0.677945 0.433839 0.652324

3 0.264038 0.808055 0.347197

4 0.589161 0.252418 0.557789

df.eval('D = (A + B) / C', inplace=True)


df.head()

A B C D

0 0.375506 0.406939 0.069938 11.187620

1 0.069087 0.235615 0.154374 1.973796

2 0.677945 0.433839 0.652324 1.704344

3 0.264038 0.808055 0.347197 3.087857

4 0.589161 0.252418 0.557789 1.508776

df.eval('D = (A - B) / C', inplace=True)


df.head()

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 35/36
11/30/23, 11:37 AM Pandas.ipynb - Colaboratory

A B C D

0 0.375506 0.406939 0.069938 -0.449425

1 0.069087 0.235615 0.154374 -1.078728

2 0.677945 0.433839 0.652324 0.374209

3 0.264038 0.808055 0.347197 -1.566886

4 0.589161 0.252418 0.557789 0.603708

column_mean = df.mean(1)
result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)

True

result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)

True

result2 = df.query('A < 0.5 and B < 0.5')


np.allclose(result1, result2)

True

Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)

True

x = df[(df.A < 0.5) & (df.B < 0.5)]

tmp1 = df.A < 0.5


tmp2 = df.B < 0.5
tmp3 = tmp1 & tmp2
x = df[tmp3]

df.values.nbytes

https://colab.research.google.com/drive/1kPoNQIODs3ToNLdhklAC8BpC5QHfgCmf#printMode=true 36/36

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy