Numpy Merged
Numpy Merged
1
[3] : dtype('int32')
[4] : #ndim
a.ndim
[4]: 1
[5] : #size
a.size
[5]: 3
[6] : #shape
a.shape
[6]: (3,)
2
print(c)
[[1 2 3 4 5]
[6 7 8 9 1]]
[11] : #set
a=[1,2,3,4,5]
c=set(a)
np.array(c)
[11] : array({1, 2, 3, 4, 5}, dtype=object)
[12] : #Set
l={1,2,3,4}
print(np.array(l))
{1, 2, 3, 4}
[13] : #dictionary
import numpy as np
dict={'a':1,'b':2,'c':3}
z=np.array(list(dict.items()))
print(z)
a=np.array(list(dict.keys()))
print(a)
[['a' '1']
['b' '2']
['c' '3']]
['a' 'b' 'c']
3
[16] : b=np.zeros([3,3])
print(b)
[[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]]
[17] : #zeros_like()
x=np.array([[1,3,7],[2,5,9]])
x
[17] : array([[1, 3, 7],
[2, 5, 9]])
[18] : d=np.zeros_like(x)
d
[18] : array([[0, 0, 0],
[0, 0, 0]])
[21] :
#using eye() method
a=np.eye(3)
print(a)
[[1. 0. 0.]
[0. 1. 0.]
[0. 0. 1.]]
[22] :
c=np.eye(3,k=1)
print(c)
[[0. 1. 0.]
[0. 0. 1.]
[0. 0. 0.]]
4
[23] : #using identity() method
a=np.identity(3)
print(a)
[[1. 0. 0.]
[0. 1. 0.]
[0. 0. 1.]]
[24] :
#using full() method
d=np.full((2,2),7)
print(d)
[[7 7]
[7 7]]
[25] : np.arange(15)
[26] : x=np.arange(6,dtype=int)
np.full_like(x,1)
[27] : c=np.full_like(x,0.1)
c
[28] : d=np.full_like(x,0.1,dtype=np.double)
d
[[5.20093491e-090, 5.69847262e-066],
5
[5.51292779e+169, 4.85649086e-033],
[6.48224659e+170, 5.82471487e+257]]])
[31]: #empty_like()
a=([1,2,3],[4,5,6])
np.empty_like(a)
[32]: array([[1, 0, 0,
0],
[0, 2, 0, 0],
[0, 0, 3, 0],
[0, 0, 0, 4]])
6
[35]: #choice() method
y=np.random.bytes(7)
print(y)
a=np.random.choice(['true','false'],size=(2,3))
print(a)
b"t'\n\x16\x14QB"
[['true' 'false' 'false']
['true' 'false' 'true']]
[36]
: #complex number
x = random.rand(1) + random.rand(1)*1j
print(x)
print(x.real)
print(x.imag)
[0.08421058+0.69654499j]
[0.08421058]
[0.69654499]
[37]
#complex number using the rand()method
:
x = random.rand(1,5) + random.rand(1,5)*1j
print(x)
[39] : #permutation()
np.random.permutation(5)
[39] : array([0, 3, 4, 1, 2])
[40] : a=np.array(5)
b=np.random.choice(a,size=5,p=[0.1,0.2,0.3,0.2,0.2])
print(b)
[4 3 0 3 4]
[41] : #randint()
np.random.randint(1,5)
[41]: 3
7
[42] : #randn()
a=np.random.randn(1,10)
print(a)
5 2.1 Indexing
[45] : Indexing in NumPy refers to accessing individual elements or groups of elements within an array
#integer indexing
import numpy as np
x = np.array([[1, 2], [3, 4], [5, 6]])
y = x[[0,1,2], [0,1,0]]
print(x[0,1])
2
[46] : a=[3,4,5,6,7]
print(a[0])
3
[47] : arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
arr3d[0]
[47] : array([[1, 2, 3],
[4, 5, 6]])
[48] : #copy()
old_values = arr3d[0].copy()
arr3d[0] = 42
8
print(arr3d)
[[[42 42 42]
[42 42 42]]
[[ 7 8 9]
[49] : [10 11 12]]]
import numpy as np
arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
print(arr[0, 1, 2])
6
[53] : import numpy as np
9
6 2.2 Slicing
Slicing in NumPy refers to the process of selecting a specific subset of elements from an array. It
allows you to create a new view of the original data without copying it, which can be very efficient
[54] : in terms of memory usage.
#slicing
import numpy as np
arr=np.array([5,6,7,8,9])
print(arr[1:3])
[6 7]
[55] : import numpy as np
arr=np.array([5,6,7,3,6,8,9])
print(arr[1:])
[6 7 3 6 8 9]
[56] : import numpy as np
arr=np.array([1,2,3,4,5,8,9])
print(arr[1:])
[2 3 4 5 8 9]
[57] : arr=np.array([5,6,7,8,9])
print(arr[:3])
[5 6 7]
[58] : arr=np.array([5,6,7,8,9])
print(arr[-3:-1])
[7 8]
[59] : arr=np.array([5,6,7,8,9])
print(arr[:3])
[5 6 7]
[60] : arr=np.array([5,6,7,8,9])
print(arr[:3])
[5 6 7]
[61] : arr=np.array([5,6,7,8,9])
print(arr[-3:-1])
[7 8]
10
[62] : #slicing parameters separated by a
#colon : (start:stop:step) directly to the ndarray object
arr=np.array([5,6,7,8,4,5,6,7,9])
print(arr[1:5:2])
[6 8]
[63] : arr=np.array([5,6,7,8,4,5,6,7,9])
print(arr[-1:-5:-1])
[9 7 6 5]
print(arr[1, 1:4])
[7 8 9]
print(arr[0:2, 2])
[3 8]
print(arr[0:2, 1:4])
[[2 3 4]
[7 8 9]]
[67]
: #string indexing
b = "Hello, World!"
print(b[2:5])
llo
[68]: b = "Hello, World!"
print(b[:5])
Hello
11
[69]: b = "Hello, World!"
print(b[2:])
llo, World!
7 2.3 Re-Shaping
Reshaping in NumPy is the process of changing the shape (i.e., dimensions) of an existing array
without altering the data. This is particularly useful when you need to transform an array to fit a
[70] certain shape for further operations, such as machine learning or data processing task
:
import numpy as np
print(arr.shape)
(2, 4)
[71]: import numpy as np
print(arr)
print('shape of array :', arr.shape)
[[[[[1 2 3 4]]]]]
shape of array : (1, 1, 1, 1, 4)
[72]
: import numpy as np
arr1= arr.reshape(4, 3)
print(arr1)
[[ 1 2 3]
[ 4 5 6]
[ 7 8 9]
[10 11 12]]
[73]: import numpy as np
arr1 = arr.reshape(2, 2, 3)
12
print(arr1)
[[[ 1 2 3]
[ 4 5 6]]
[[ 7 8 9]
[10 11 12]]]
[74]: import numpy as np
a=np.arange(8)
print(a.reshape(4,2))
[[0 1]
[2 3]
[4 5]
[75] [6 7]]
: a=np.arange(12).reshape(4,3)
print(a)
[[ 0 1 2]
[ 3 4 5]
[ 6 7 8]
[ 9 10 11]]
[[0 1 0 1]
[2 3 2 3]
[4 5 4 5]]
[77] #numpy.hstack and vstack
: a = np.array([[1,2],[3,4]])
b = np.array([[5,6],[7,8]])
print(np.stack((a,b)))
[[[1 2]
[3 4]]
13
[[5 6]
[7 8]]]
[78]
: #stack()
print(np.stack((a,b),axis=0))
[[[1 2]
[3 4]]
[[5 6]
[7 8]]
]
[79]: print(np.stack((a,b),axis=1))
[[[1 2]
[5 6]]
[[3 4]
[7 8]]
]
[80]: #hstack()
ch = np.hstack((a,b))
print(ch)
[[1 2 5 6]
[3 4 7 8]]
[81]
: #vstack()
ch = np.vstack((a,b))
print(ch)
[[1 2]
[3 4]
[5 6]
[7 8]]
9 2.5 Splitting
Splitting in NumPy involves dividing an array into multiple sub-arrays. This can be useful when
you need to partition data for different processing purposes or when dealing with chunks of data
[82] in a structured way.
:
import numpy as np
a = np.arange(9)
print(a)
[0 1 2 3 4 5 6 7 8]
14
[83]: #split()
b = np.split(a,3)
print(b)
[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8])]
[84]: #horizontal split-hsplit()
a = np.arange(12).reshape(4,3)
b=np.hsplit(a,3)
print(b)
[array([[0],
[3],
[6],
[9] ]), array([[ 1],
[ 4],
[ 7],
[10] ]), array([[ 2],
[ 5],
[85] [ 8],
: [11] ])]
#verticsl split-vsplit()
b=np.vsplit(a,2)
print(b)
[array([[0, 1, 2],
[3, 4, 5]]), array([[ 6, 7, 8],
[ 9, 10, 11]])]
15
[87] : array([0. , 1. , 1.41421356, 1.73205081, 2. ,
2.23606798, 2.44948974, 2.64575131, 2.82842712, 3. ])
[88] : #exp()
np.exp(arr)
[88] : array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00,
2.00855369e+01, 5.45981500e+01, 1.48413159e+02, 4.03428793e+02,
1.09663316e+03,
2.98095799e+03, 8.10308393e+03])
[89] : #min()
np.min(arr)
[89]: 0
[90] : #max()
np.max(arr)
[90]: 9
[91] : #average()
np.average(arr)
[91]: 4.5
[92] : #abs()
print(np.abs(arr))
[0 1 2 3 4 5 6 7 8 9]
[93] : #fabs()
arr=np.arange(0,-5,-0.5)
print(np.fabs(arr))
[0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
x = np.random.randn(8)
y = np.random.randn(8)
print(x)
[ 1.11097262 -0.26995231 0.0060993 1.04398907 -1.82141342 0.00998652
0.08274781 0.82046885]
16
[95] :
print(y)
[101] : np.multiply(a,b)
[102] : np.divide(a,b)
17
[103] : import numpy as np
a = np.array([10,100,1000])
np.power(a,2)
[103] : array([ 100, 10000, 1000000], dtype=int32)
a = np.array([[3,7,5],[8,4,3],[2,4,9]])
[104]a: array([[3, 7,
5],
[8, 4, 3],
[2, 4, 9]])
[105] : #sum()
a.sum()
[105]: 45
[106] : #percentile()
import numpy as np
a = np.array([[30,40,70],[80,20,10],[50,90,60]])
np.percentile(a,90)
[106]: 82.0
[108] : #mean()
arr.mean()
[108]: -0.14756616582071838
[109] : arr.mean(axis=1)
[110] : #median()
np.median(arr)
[110]: -0.28413298907449897
18
[111] : #standard deviation
np.std(arr)
[111]: 0.9329450218698545
[112] : #variance
np.var(arr)
[112]: 0.8703864138317433
[113] : #sum()
arr.sum(axis=0)
[113] : array([ 0.68253865, -2.88096912, -2.108008 , 1.35511515])
[ 0 1 3 6 10 15 21 28]
[115] : arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
print(arr.cumsum(axis=0))
[[ 0 1 2]
[ 3 5 7]
[ 9 12 15]]
[116] : print(arr.cumprod(axis=1))
[[ 0 0 0]
[ 3 12 60]
[ 6 42 336]]
19
[119] : #greater()
print(np.greater(a,b))
True
[123] :
print(np.less(a,b))
[ True False False False]
[124] : #less_equal()
print(np.less_equal(a,b))
[ True False True False]
20
[127] : np.sort(a,axis=0)
[128] : np.sort(a,axis=1)
21
[134]: 14.3 5.3 Set Operations
#in1d()method
import numpy as np
values = np.array([6, 0, 0, 3, 2, 5, 6])
print(np.in1d(values, [2, 3, 6]))
22
[[[242 242 242]
[242 242 242]
[242 242 242]
…
[195 195 195]
[195 195 195]
[195 195 195]]
25
[11] : crop_img=a[100:900,100:900,:]
img_out=Image.fromarray(crop_img)
img_out
[11]:
[12] : flipped_img=np.flipud(a)
display(Image.fromarray(flipped_img))
26
[ ]:
27
Data Manipulation with Pandas
0 4
1 7
2 -5
3 3
dtype: int64
27
lst = ['G', 'h', 'i',
'j',
'k', 'l', 'm']
# numpy array
data = np.array(['a', 'b', 'c', 'd', 'e'])
# creating series
s = pd.Series(data)
print(s)
0 a
1 b
2 c
3 d
4 e
dtype: object
# numpy array
data = np.array(['a', 'b', 'c', 'd', 'e'])
# creating series
s = pd.Series(data, index =[1000, 1001, 1002, 1003, 1004])
print(s)
28
1000 a
1001 b
1002 c
1003 d
1004 e
[6] : dtype: object
# create a series
series = pd.Series(dictionary)
print(series)
D 10
B 20
C 30
dtype: int64
# import the pandas lib as pd
import pandas as pd
[8] :
# create a dictionary
dictionary = {'A': 50, 'B': 10, 'C': 80}
# create a series
series = pd.Series(dictionary, index=['B','C','A'])
29
print(series)
B 10
C 80
A 50
dtype: int64
[9] :
import pandas as pd
# create a dictionary
dictionary = {'A': 50, 'B': 10, 'C': 80}
# create a series
series = pd.Series(dictionary, index=['B', 'C', 'D', 'A'])
print(series)
B 10.0
C 80.0
D NaN
A 50.0
dtype: float64
30
Day 3 3/1/2018
Day 4 4/1/2018
dtype: object
[12] :
print(sr['Day 1'])
1/1/2018
[19]: 1.0
[26]: s[['b','a','d']]
[26] : b 1.0
a 0.0
d 3.0
dtype: float64
[27] : s['b':'e']
31
[27]: b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
[20] : s[1]
[20]: 1.0
[21] : s[2:4]
[21]: c 2.0
d 3.0
dtype: float64
[23]: s[[1,3]]
[23]: b 1.0
d 3.0
dtype: float64
[28] :
print(s[[0, 2, 4]])
a 0.0
c 2.0
e 4.0
dtype: float64
[4]
: 4.4 2.3 Filtering
import numpy as np
import pandas as pd
s=pd.Series(np.arange(5.),index=['a','b','c','d','e'])
print(s)
a 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
[32]: s[s<2]
[32]: a 0.0
dtype: float64
32
[36]: s[s>2]
[36]: b 5.0
d 3.0
e 4.0
dtype: float64
[35]: s[s!=2]
[35]: a 0.0
b 5.0
d 3.0
e 4.0
dtype: float64
[38]: s[(s>2)&(s<5) ]
[38]: d 3.0
e 4.0
dtype: float64
[33]: s['b':'c']
[33]: b 5.0
c 2.0
dtype: float64
[7]
print(s[1:2]==5)
:
b True
dtype: bool
[42] s[s.isin([2,4])]
:
[42]: c 2.0
e 4.0
dtype: float64
33
0 7
1 9
2 11
3 13
4 15
[4] dtype: int64
:
series3 = series1 - series2
print(series3)
0 -5
1 -5
2 -5
3 -5
4 -5
[5] dtype: int64
:
series3 = series1 *series2
print(series3)
0 6
1 14
2 24
3 36
4 50
[6] dtype: int64
:
series3 = series1 /series2
print(series3)
0 0.166667
1 0.285714
2 0.375000
3 0.444444
4 0.500000
[9] dtype: float64
:
series3 = series1 %series2
print(series3)
0 1
1 2
2 3
3 4
4 5
dtype: int64
34
[10] 4.6 2.5 Ranking
: import pandas as pd
s=pd.Series([121,211,153,214,115,116,237,118,219,120])
s.rank(ascending=True)
[10]: 0 5.0
1 7.0
2 6.0
3 8.0
4 1.0
5 2.0
6 10.0
7 3.0
8 9.0
9 4.0
dtype: float64
[49]: s.rank(ascending=False)
[49]: 0 6.0
1 4.0
2 5.0
3 3.0
4 10.0
5 9.0
6 1.0
7 8.0
8 2.0
9 7.0
dtype: float64
[11]: s.rank(method='min')
[11]: 0 5.0
1 7.0
2 6.0
3 8.0
4 1.0
5 2.0
6 10.0
7 3.0
8 9.0
9 4.0
dtype: float64
[12]: s.rank(method='max')
35
[12]: 0 5.0
1 7.0
2 6.0
3 8.0
4 1.0
5 2.0
6 10.0
7 3.0
8 9.0
9 4.0
dtype: float64
[50]: s.rank(method='first')
[50]: 0 5.0
1 7.0
2 6.0
3 8.0
4 1.0
5 2.0
6 10.0
7 3.0
8 9.0
9 4.0
dtype: float64
0 19.5000
1 16.8000
2 22.7800
3 20.1240
4 18.1002
dtype: float64
37
[53]: sr.sort_values(ascending = True)
[53]: 1 16.8000
4 18.1002
0 19.5000
3 20.1240
2 22.7800
dtype: float64
[55]: sr.sort_index()
[55]: 0 19.5000
1 16.8000
2 22.7800
3 20.1240
4 18.1002
dtype: float64
[58]
: print(sr.sort_values())
1 16.8000
4 18.1002
0 19.5000
3 20.1240
2 22.7800
dtype: float64
[40]
4.8 2.7 checking null values
:
s=pd.Series({'ohio':35000,'teyas':71000,'oregon':16000,'utah':5000})
print(s)
states=['california','ohio','Texas','oregon']
x=pd.Series(s,index=states)
print(x)
ohio 35000
teyas 71000
oregon 16000
utah 5000
dtype: int64
california NaN
ohio 35000.0
Texas NaN
oregon 16000.0
dtype: float64
[42] x.isnull()
:
38
[42]: california True
ohio False
Texas True
oregon False
dtype: bool
[44]: x.notnull()
[44]: california
False ohio
True
Texas False
oregon
True dtype: bool
39
2 C
dtype: object
[21]
: print(pd.concat([series1, series2], ignore_index=True))
0 1
1 2
2 3
3 A
4 B
5 C
[22] dtype: object
:
print(pd.concat([series1, series2], ignore_index=False))
0 1
1 2
2 3
0 A
1 B
[69]
2 C
:
dtype: object
series1 0 1
1 2
2 3
series2 0 A
1 B
2 C
[16] dtype: object
:
4.10 3 .Creating DataFrames from List and Dictionary
4.11 3.1 From List
data = [1, 2, 3, 4, 5]
# Convert to DataFrame
df = pd.DataFrame(data, columns=['Numbers'])
print(df)
Numbers
0 1
1 2
2 3
3 4
4 5
40
[70]: import pandas as pd
nme = ["aparna", "pankaj", "sudhir", "Geeku"]
deg = ["MBA", "BCA", "M.Tech", "MBA"]
scr = [90, 40, 80, 98]
dict = {'name': nme, 'degree': deg, 'score': scr}
df = pd.DataFrame(dict)
print(df)
name degree score
0 aparna MBA 90
1 pankaj BCA 40
2 sudhir M.Tech 80
3 Geeku MBA 98
[38]: import pandas as pd
data = [['G', 10], ['h', 15], ['i', 20]]
# Create the pandas Dataframe
df = pd.DataFrame(data, columns = ['Name', 'Age'])
# print dataframe.
print(df)
Name Age
0 G 10
1 h 15
2 i 20
print(df)
41
[14] : df=pd.DataFrame({'a':[4,5,6],'b':[7,8,9]},index=pd.MultiIndex.
𝗌from_tuples([('d',1),('d',2),('e',2)] ,names=['n','v']))
print(df)
a b
n v
d 1 4 7
2 5 8
e 2 6 9
[71]: df=pd.DataFrame({'ap':{'a':0.0,'c':3.0,'d':6.0},'ts':
{'a':1.0,'c':4.0,'d':7.
𝗌0},'tn':{'a':2.0,'c':5.0,'d':8.0}})
[71]: df.reindex(['a','b','c','d'])
ap ts tn
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0
[10]: id huml humw ulnal ulnaw feml fem tibl tibw tarl tarw \
w
0 0 80.78 6.68 72.01 4.88 41.81 3.70 5.50 4.03 38.70 3.84
1 1 88.91 6.63 80.53 5.59 47.04 4.30 80.22 4.51 41.50 4.01
2 2 79.97 6.37 69.26 5.28 43.07 3.90 75.35 4.04 38.31 3.34
3 3 77.65 5.70 65.76 4.77 40.04 3.52 69.17 3.40 35.78 3.41
4 4 62.80 4.84 52.09 3.73 33.95 2.72 56.27 2.96 31.88 3.13
.. … … … … … … … … … … …
415 415 17.96 1.63 19.25 1.33 18.36 1.54 31.25 1.33 21.99 1.15
416 416 19.21 1.64 20.76 1.49 19.24 1.45 33.21 1.28 23.60 1.15
417 417 18.79 1.63 19.83 1.53 20.96 1.43 34.45 1.41 22.86 1.21
418 418 20.38 1.78 22.53 1.50 21.35 1.48 36.09 1.53 25.98 1.24
419 419 17.89 1.44 19.26 1.10 17.62 1.34 29.81 1.24 21.69 1.05
type
0 SW
1 SW
2 SW
3 SW
42
4 SW
.. …
415 SO
416 SO
417 SO
418 SO
419 SO
[16] : data.tail(5)
[16]: id huml humw ulnal ulnaw feml femw tibl tibw tarl tarw \
415 415 17.96 1.63 19.25 1.33 18.36 1.54 31.25 1.33 21.99 1.15
416 416 19.21 1.64 20.76 1.49 19.24 1.45 33.21 1.28 23.60 1.15
417 417 18.79 1.63 19.83 1.53 20.96 1.43 34.45 1.41 22.86 1.21
418 418 20.38 1.78 22.53 1.50 21.35 1.48 36.09 1.53 25.98 1.24
419 419 17.89 1.44 19.26 1.10 17.62 1.34 29.81 1.24 21.69 1.05
type
415 SO
416 SO
417 SO
418 SO
419 SO
[17] : 4.16 4.3 Get shape,data type,null values,index and column details
data.shape
[17]: (420, 12)
[18] : data.dtypes
43
[18] : id int64
huml float64
humw float64
ulnal float64
ulnaw float64
feml float64
femw float64
tibl float64
tibw float64
tarl float64
tarw float64
type
object dtype:
object
[19] : data.isnull().sum()
[19]: id 0
huml 1
humw 1
ulnal 3
ulnaw 2
feml 2
femw 1
tibl 2
tibw 1
tarl 1
tarw 1
type 0
dtype: int64
[20] : data.columns
[21]: data.index
[24]: id huml humw ulnal ulnaw feml femw tibl tibw tarl tarw \
0 0 80.78 6.68 72.01 4.88 41.81 3.70 5.50 4.03 38.70 3.84
1 1 88.91 6.63 80.53 5.59 47.04 4.30 80.22 4.51 41.50 4.01
2 2 79.97 6.37 69.26 5.28 43.07 3.90 75.35 4.04 38.31 3.34
44
3 3 77.65 5.70 65.76 4.77 40.04 3.52 69.17 3.40 35.78 3.41
4 4 62.80 4.84 52.09 3.73 33.95 2.72 56.27 2.96 31.88 3.13
.. … … … … … … … … … … …
415 415 17.96 1.63 19.25 1.33 18.36 1.54 31.25 1.33 21.99 1.15
416 416 19.21 1.64 20.76 1.49 19.24 1.45 33.21 1.28 23.60 1.15
417 417 18.79 1.63 19.83 1.53 20.96 1.43 34.45 1.41 22.86 1.21
418 418 20.38 1.78 22.53 1.50 21.35 1.48 36.09 1.53 25.98 1.24
419 419 17.89 1.44 19.26 1.10 17.62 1.34 29.81 1.24 21.69 1.05
type
0 SW
1 SW
2 SW
3 SW
4 SW
.. …
415 SO
416 SO
417 SO
418 SO
419 SO
[25] data.drop([0,3])
:
[25]: id huml humw ulnal ulnaw feml fem tibl tibw tarl tarw \
w
1 1 88.91 6.63 80.53 5.59 47.04 4.30 80.22 4.51 41.50 4.01
2 2 79.97 6.37 69.26 5.28 43.07 3.90 75.35 4.04 38.31 3.34
4 4 62.80 4.84 52.09 3.73 33.95 2.72 56.27 2.96 31.88 3.13
5 5 61.92 4.78 50.46 3.47 49.52 4.41 56.95 2.73 29.07 2.83
6 6 79.73 5.94 67.39 4.50 42.07 3.41 71.26 3.56 37.22 3.64
.. … … … … … … … … … … …
415 415 17.96 1.63 19.25 1.33 18.36 1.54 31.25 1.33 21.99 1.15
416 416 19.21 1.64 20.76 1.49 19.24 1.45 33.21 1.28 23.60 1.15
417 417 18.79 1.63 19.83 1.53 20.96 1.43 34.45 1.41 22.86 1.21
418 418 20.38 1.78 22.53 1.50 21.35 1.48 36.09 1.53 25.98 1.24
419 419 17.89 1.44 19.26 1.10 17.62 1.34 29.81 1.24 21.69 1.05
type
1 SW
2 SW
4 SW
5 SW
6 SW
.. …
45
415 SO
416 SO
417 SO
418 SO
419 SO
[27]: data.drop(data[data['huml']>4.3].index)
[27] : id huml humw ulnal ulnaw feml femw tibl tibw tarl tarw type
342 342 NaN NaN NaN NaN 32.54 2.65 55.06 2.81 38.94 2.25 SO
[28] : data.loc[6,'ulnal']
[28]: 67.39
[29] : data.loc[11:15][['huml','humw']]
[30] : 4.18
data
4.5 Sorting and Ranking operations in DataFrame
[30]: id huml humw ulnal ulnaw feml fem tibl tibw tarl tarw \
w
0 0 80.78 6.68 72.01 4.88 41.81 3.70 5.50 4.03 38.70 3.84
1 1 88.91 6.63 80.53 5.59 47.04 4.30 80.22 4.51 41.50 4.01
2 2 79.97 6.37 69.26 5.28 43.07 3.90 75.35 4.04 38.31 3.34
3 3 77.65 5.70 65.76 4.77 40.04 3.52 69.17 3.40 35.78 3.41
4 4 62.80 4.84 52.09 3.73 33.95 2.72 56.27 2.96 31.88 3.13
.. … … … … … … … … … … …
415 415 17.96 1.63 19.25 1.33 18.36 1.54 31.25 1.33 21.99 1.15
416 416 19.21 1.64 20.76 1.49 19.24 1.45 33.21 1.28 23.60 1.15
417 417 18.79 1.63 19.83 1.53 20.96 1.43 34.45 1.41 22.86 1.21
418 418 20.38 1.78 22.53 1.50 21.35 1.48 36.09 1.53 25.98 1.24
419 419 17.89 1.44 19.26 1.10 17.62 1.34 29.81 1.24 21.69 1.05
type
0 SW
1 SW
2 SW
46
3 SW
4 SW
.. …
415 SO
416 SO
417 SO
418 SO
419 SO
[31] : data.sort_index(ascending=False)
[31]: id huml humw ulnal ulnaw feml fem tibl tibw tarl tarw \
w
419 419 17.89 1.44 19.26 1.10 17.62 1.34 29.81 1.24 21.69 1.05
418 418 20.38 1.78 22.53 1.50 21.35 1.48 36.09 1.53 25.98 1.24
417 417 18.79 1.63 19.83 1.53 20.96 1.43 34.45 1.41 22.86 1.21
416 416 19.21 1.64 20.76 1.49 19.24 1.45 33.21 1.28 23.60 1.15
415 415 17.96 1.63 19.25 1.33 18.36 1.54 31.25 1.33 21.99 1.15
.. … … … … … … … … … … …
4 4 62.80 4.84 52.09 3.73 33.95 2.72 56.27 2.96 31.88 3.13
3 3 77.65 5.70 65.76 4.77 40.04 3.52 69.17 3.40 35.78 3.41
2 2 79.97 6.37 69.26 5.28 43.07 3.90 75.35 4.04 38.31 3.34
1 1 88.91 6.63 80.53 5.59 47.04 4.30 80.22 4.51 41.50 4.01
0 0 80.78 6.68 72.01 4.88 41.81 3.70 5.50 4.03 38.70 3.84
type
419 SO
418 SO
417 SO
416 SO
415 SO
.. …
4 SW
3 SW
2 SW
1 SW
0 SW
[32] : data.sort_values(['ulnaw']).head(6)
[32] : id huml humw ulnal ulnaw feml femw tibl tibw tarl tarw \
369 369 13.48 1.27 16.00 1.00 12.67 1.10 23.12 0.88 16.34 0.89
413 413 12.95 1.16 14.09 1.03 13.03 1.03 22.13 0.96 15.19 1.02
395 395 15.62 1.28 18.52 1.06 15.75 1.17 28.63 1.03 21.39 0.88
47
367 367 13.31 1.17 16.47 1.06 12.32 0.93 22.47 0.95 15.97 0.75
414 414 13.63 1.16 15.22 1.06 13.75 0.99 23.13 0.96 15.62 1.01
376 376 13.52 1.28 17.88 1.07 15.10 1.05 25.14 1.23 17.81 0.69
type
369 SO
413 SO
395 SO
367 SO
414 SO
376 SO
[33] : data.sort_values(by=['ulnaw','ulnal']).head(6)
[33]: id huml humw ulnal ulnaw feml femw tibl tibw tarl tarw \
369 369 13.48 1.27 16.00 1.00 12.67 1.10 23.12 0.88 16.34 0.89
413 413 12.95 1.16 14.09 1.03 13.03 1.03 22.13 0.96 15.19 1.02
414 414 13.63 1.16 15.22 1.06 13.75 0.99 23.13 0.96 15.62 1.01
367 367 13.31 1.17 16.47 1.06 12.32 0.93 22.47 0.95 15.97 0.75
395 395 15.62 1.28 18.52 1.06 15.75 1.17 28.63 1.03 21.39 0.88
376 376 13.52 1.28 17.88 1.07 15.10 1.05 25.14 1.23 17.81 0.69
type
369 SO
413 SO
414 SO
367 SO
395 SO
376 SO
[34] : data.rank().head(10)
[34]: id huml humw ulnal ulnaw feml femw tibl tibw tarl tarw \
0 1.0 289.0 344.0 275.0 325.5 289.0 295.0 1.0 302.0 272.0 328.0
1 2.0 308.0 343.0 284.0 343.0 312.0 320.0 308.0 327.5 285.0 333.0
2 3.0 286.0 336.0 268.0 334.0 295.0 303.5 292.0 303.5 271.0 305.5
3 4.0 284.0 308.0 255.0 313.5 279.0 288.0 270.0 272.5 247.0 310.5
4 5.0 248.0 281.0 227.5 258.0 224.0 225.5 231.0 250.0 211.0 294.0
5 6.0 246.0 275.0 223.0 242.0 326.0 322.0 234.0 234.0 181.0 268.5
6 7.0 285.0 321.0 262.0 304.0 292.0 282.5 279.0 280.5 259.0 320.0
7 8.0 304.0 306.0 278.0 306.0 300.0 299.0 296.0 295.5 266.0 324.0
8 9.0 362.0 370.0 354.0 362.0 365.0 356.5 363.5 359.0 352.0 346.0
9 10.0 387.0 399.0 381.5 383.0 382.0 398.0 382.0 397.0 392.0 377.0
typ
0 e
274.5
1 274.5
48
2 274.5
3 274.5
4 274.5
5 274.5
6 274.5
7 274.5
8 274.5
9 274.5
[35] : data.rank().head(2)
[35]: id huml humw ulnal ulnaw feml femw tibl tibw tarl tarw \
0 1.0 289.0 344.0 275.0 325.5 289.0 295.0 1.0 302.0 272.0 328.0
1 2.0 308.0 343.0 284.0 343.0 312.0 320.0 308.0 327.5 285.0 333.0
type
0 274.5
1 274.5
[15]: data.rank(ascending=False).head(5)
[15]: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket \
0 891.0 617.0 246.0 783.0 289.0 497.0 179.0 552.5 220.0
1 890.0 171.5 783.5 701.0 734.5 183.0 179.0 552.5 112.0
2 889.0 171.5 246.0 538.0 734.5 404.5 587.5 552.5 17.0
3 888.0 171.5 783.5 619.0 734.5 226.5 179.0 552.5 824.5
4 887.0 617.0 246.0 876.0 289.0 226.5 587.5 552.5 283.0
49
.. … … … … … … …
968 24 Male 87.1 1.74 187 158 67
969 25 Male 66.6 1.61 184 166 56
970 59 Female 60.4 1.76 194 120 53
971 32 Male 126.4 1.83 198 146 62
972 46 Male 88.7 1.63 166 146 66
BM
0 I
30.20
1 32.00
2 24.71
3 18.41
4 14.39
50
.. …
968 28.77
969 25.69
970 19.50
971 37.74
972 33.38
51
[25]: data['Age'].mean()
[25]: 38.68345323741007
[28]: data['Age'].median()
[28]: 40.0
[29]: data['Age'].std()
[29]: 12.180927866987108
[30] : data['Age'].sum()
[30]: 37639
[31] : data['Age'].var()
[31]: 148.37500370074312
52
Data cleaning and preparation
Student performance
Import any csv file to pandas data frame and perform the following
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('student.csv')
df
51
6603 No 8 81
6604 Yes 6 65
6605 Yes 6 91
6606 Yes 9 94
1 No College Moderate
2 No Postgraduate Near
52
3 No High School Moderate
4 No College Near
Gender Exam_Score
0 Male 67
1 Female 61
2 Male 74
3 Male 71
4 Female 70
... ... ...
6602 Female 68
6603 Female 69
6604 Female 68
6605 Female 68
6606 Male 64
# Display the first few rows of the DataFrame to understand the data
print("Original DataFrame:")
print(df.head())
Original DataFrame:
Hours_Studied Attendance Parental_Involvement Access_to_Resources
\
0 23 84 Low High
1 19 64 Low Medium
2 24 98 Medium Medium
3 29 89 Low Medium
4 19 92 Medium Medium
53
Motivation_Level \
0 No 7 73
Low
1 No 8 59
Low
2 Yes 7 91
Medium
3 Yes 8 98
Medium
4 Yes 6 65
Medium
1 Public Negative 4 No
2 Public Neutral 4 No
3 Public Negative 4 No
4 Public Neutral 4 No
Missing Data:
Hours_Studied Attendance Parental_Involvement
Access_to_Resources \
0 False False False
False
1 False False False
54
False
2 False False False
False
3 False False False
False
4 False False False
False
5 False False False
False
6 False False False
False
7 False False False
False
8 False False False
False
9 False False False
False
55
4 False False False False
56
# No of null values
n=df.isnull().sum()
n
Hours_Studied 0
Attendance 0
Parental_Involvement 0
Access_to_Resources 0
Extracurricular_Activities 0
Sleep_Hours 0
Previous_Scores 0
Motivation_Level 0
Internet_Access 0
Tutoring_Sessions 0
Family_Income 0
Teacher_Quality 78
School_Type 0
Peer_Influence 0
Physical_Activity 0
Learning_Disabilities 0
Parental_Education_Level 90
Distance_from_Home 67
Gender 0
Exam_Score 0
dtype: int64
1 19 64 Low Medium
2 24 98 Medium Medium
3 29 89 Low Medium
4 19 92 Medium Medium
5 19 88 Medium Medium
6 29 84 Medium Low
7 25 78 Low High
57
8 17 94 Medium High
9 23 98 Medium Medium
1 Public Negative 4 No
2 Public Neutral 4 No
3 Public Negative 4 No
58
4 Public Neutral 4 No
5 Public Positive 3 No
6 Private Neutral 2 No
7 Public Negative 2 No
8 Private Neutral 1 No
9 Public Positive 5 No
1 19 64 Low Medium
2 24 98 Medium Medium
3 29 89 Low Medium
4 19 92 Medium Medium
5 19 88 Medium Medium
6 29 84 Medium Low
59
7 25 78 Low High
8 17 94 Medium High
9 23 98 Medium Medium
1 Public Negative 4 No
2 Public Neutral 4 No
60
3 Public Negative 4 No
4 Public Neutral 4 No
5 Public Positive 3 No
6 Private Neutral 2 No
7 Public Negative 2 No
8 Private Neutral 1 No
9 Public Positive 5 No
1 19 64 Low Medium
2 24 98 Medium Medium
3 29 89 Low Medium
4 19 92 Medium Medium
61
0 No 7 73
Low
1 No 8 59
Low
2 Yes 7 91
Medium
3 Yes 8 98
Medium
4 Yes 6 65
Medium
1 Public Negative 4 No
2 Public Neutral 4 No
3 Public Negative 4 No
4 Public Neutral 4 No
# Display the first few rows of the DataFrame to understand the data
print("Original DataFrame:")
print(df.head())
Original DataFrame:
Hours_Studied Attendance Parental_Involvement Access_to_Resources
62
\
0 23 84 Low High
1 19 64 Low Medium
2 24 98 Medium Medium
3 29 89 Low Medium
4 19 92 Medium Medium
1 Public Negative 4 No
2 Public Neutral 4 No
3 Public Negative 4 No
4 Public Neutral 4 No
63
# Assume 'Price' is a column that we want to transform
64
2 Medium Yes 2 Medium
1 No College Moderate
2 No Postgraduate Near
4 No College Near
65
6604 No Postgraduate Near
Gender Exam_Score
0 Male 67
1 Female 61
2 Male 74
3 Male 71
4 Female 70
... ... ...
6602 Female 68
6603 Female 69
6604 Female 68
6605 Female 68
6606 Male 64
1 19 64 Low Medium
2 24 98 Medium Medium
3 29 89 Low Medium
4 19 92 Medium Medium
66
Low
2 Yes NaN 91
Medium
3 Yes NaN 98
Medium
4 Yes NaN 65
Medium
1 Public Negative 4 No
2 Public Neutral 4 No
3 Public Negative 4 No
4 Public Neutral 4 No
67
886 887 0 2
887 888 1 1
888 889 0 3
889 890 1 1
890 891 0 3
68
Titanic
# Display the first few rows of the DataFrame to understand the data
print("Original DataFrame:")
print(df.head())
Original DataFrame:
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
# Select the column to analyze for outliers (replace 'Value' with the
actual column name)
column_name = 'Fare'
# Calculate the z-scores for the selected column
z_scores = np.abs((df[column_name] - df[column_name].mean()) /
df[column_name].std())
z_scores.head(10)
0 0.502163
1 0.786404
2 0.488580
3 0.420494
4 0.486064
5 0.477848
6 0.395591
7 0.223957
69
8 0.424018
9 0.042931
Name: Fare, dtype: float64
70
d) perform vectorized string operations on pandas series
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('titanic.csv')
df
71
.. ... ... ... ... ...
886 0 211536 13.0000 NaN S
887 0 112053 30.0000 B42 S
888 2 W./C. 6607 23.4500 NaN S
889 0 111369 30.0000 C148 C
890 0 370376 7.7500 NaN Q
72
890 370376 7.7500 NaN Q
1 2 1 1 6 female 38.0 1 0
2 3 1 3 6 female 26.0 0 0
3 4 1 1 6 female 35.0 1 0
4 5 0 3 4 male 35.0 0 0
# Split the names based on a delimiter (e.g., space) and create a new
column for the first part of the name
73
df['Name'] = df['Sex'].str.split(' ').str[0]
df
74
1 2 1 1 female female 38.0 1 0
75
Data Wrangling
[3] : X A1Y
0 X0 Y0
1 X1 Y1
[4] : df2
[4] : X Y
0 X2 Y2
1 X3 Y3
[5] : result
[5]: X A1Y Y
0 X0 Y0 NaN
1 X1 Y1 NaN
0 X2 NaN Y2
1 X3 NaN Y3
76
0.0.2 MERGE
Used to merge two data frames based on a key column, similar to SQL joins. Options include
how=’inner’, how=’outer’, how=’left’, and how=’right’ for different types of joins.
[8]
: import pandas as pd
# Create DataFrame 1
df1 = pd.DataFrame({'key': ['x', 'y', 'z'], 'value1': [1, 2, 3]})
# Create DataFrame 2
df2 = pd.DataFrame({'key': ['y', 'z', 'a'], 'value2': [4, 5, 6]})
# Merge DataFrames on 'key' column using inner join
result = pd.merge(df1, df2, on='key', how='inner')
df1
[9] : df2
[10] : result
[12] : df2
77
[12] : key value2
0 y 4
1 z 5
2 a 6
[13] : result
x y
j0 x0 y0
j1 x1 y1
j2 x2 y2
z a
K0 z0 a0
K2 z2 a2
K3 z3 a3
x y z a
j0 x0 y0 NaN NaN
j1 x1 y1 NaN NaN
j2 x2 y2 NaN NaN
78
[21]: #inner join
# Create DataFrame 1
df1 = pd.DataFrame({"x": ["x0", "x1", "x2"], "y": ["y0", "y1", "y2"]},
index=["j0", "j1", "j2"])
# Create DataFrame 2
df2 = pd.DataFrame({"x": ["x0", "x1", "x3"], "z": ["z0", "z2", "z3"],
"a": ["a0", "a2", "a3"]},
index=["K0", "K2", "K3"])
df4 = df1.merge(df2,on="x", how='inner')
print(df4)
x y z a
0 x0 y0 z0 a0
1 x1 y1 z2 a2
[22]
: 0.0.5 FULL OUTER JOIN
Returns all rows from both DataFrames.
# full outer join
df5 = df1.merge(df2,on="x", how='outer')
print(df5)
x y z a
0 x0 y0 z0 a0
1 x1 y1 z2 a2
2 x2 y2 NaN NaN
3 x3 NaN z3 a3
x y z a
0 x0 y0 z0 a0
1 x1 y1 z2 a2
2 x3 NaN z3 a3
0.0.8 RESHAPE
Reshaping functions like pivot and melt are used to transform the layout of data frames.
79
[30]: import pandas as pd
# Create Series 1
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
# Create Series 2
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
# Concatenate Series into DataFrame
df = pd.concat([s1, s2], keys=['one', 'two'])
print(df)
one a 0
b 1
c 2
d 3
two c 4
d 5
e 6
dtype: int64
[31]
: print(df.unstack())
a b c d e
one 0.0 1.0 2.0 3.0 NaN
two NaN NaN 4.0 5.0 6.0
[ ]:
80
81
84
85
86
87
88
89
90
91
92
93