What Can You Do with DataFrames Using Pandas? Pandas Is a High-Level Data Manipulation Tool Developed by Wes McKinney
DataFrames allow you to store and manipulate tabular data in rows of observations and
columns of variables.
Pandas is an open-source Python package that is widely used for data science, data analysis, and machine learning tasks.
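A minimal sketch of that idea (the column names and values here are illustrative, not part of the examples that follow):
import pandas as pd
# each row is one observation, each column one variable
pd.DataFrame({'name': ['ASHA', 'RAVI'], 'age': [30, 25]})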
Series: a Series is similar to a NumPy array, except that we can give it a named or datetime index instead of the default numerical index (a datetime-indexed sketch follows the basic examples below).
import numpy as np
import pandas as pd
labels = ['a', 'b', 'c']
lst = [10, 20, 30]
arr = np.array([10, 20, 30])
d = {'a': 10, 'b': 20, 'c': 30}
pd.Series(lst)            # default numerical index
pd.Series(lst, labels)    # named index
pd.Series(arr, labels)    # works the same with a NumPy array
pd.Series(d)              # dict keys become the index
pd.Series([sum, print, len])   # a Series can even hold Python functions
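As noted above, the index can also hold datetimes rather than labels; a minimal sketch (the dates are illustrative):
pd.Series([10, 20, 30], index=pd.to_datetime(['2017-01-01', '2017-01-02', '2017-01-03']))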
ser1 = pd.Series([1, 2, 3, 4], ['USA', 'CHINA', 'FRANCE', 'GERMANY'])
ser2 = pd.Series([1, 2, 3, 4], ['USA', 'CHINA', 'INDIA', 'SINGAPORE'])
ser1
ser2
ser1['USA']
ser1 + ser2   # addition aligns on index labels; labels present in only one Series give NaN
DataFrames are built directly on top of Series and are widely used for working with financial data.
import numpy as np
import pandas as pd
from numpy.random import randn
np.random.seed(101)
df = pd.DataFrame(randn(5, 4), ['A', 'B', 'C', 'D', 'E'], ['W', 'X', 'Y', 'Z'])
df['W']
type(df['W'])
type(df)
df.W   # attribute-style column access; equivalent to df['W'], but bracket notation is safer
df[['W','X']]
df['new'] =df['Y']+df['Z']
df.drop('new')           # KeyError: drop defaults to the row axis (axis=0)
df.drop('new', axis=1)   # drops the column, but not in place
df
df.drop('new', axis=1, inplace=True)
df.drop('E')   # not in place, so df keeps row E (needed for the STATE column below)
df.loc['A']
df.iloc[2]
df.loc[['A','B']]
df.loc[['A','B'],['W','Y']]
df.iloc[2:,:]
df.iloc[2:,2:]
df.iloc[2:,:2]
df.iloc[:2,:2]
df.iloc[1:3, 1:3]
df.iloc[-2:,-2:]
df.iloc[0:2,0:2]
df > 0
booldf = df > 0   # boolean DataFrame
df[booldf]        # NaN wherever the condition is False
df[df > 0]        # same thing in one step
df['W']>0
df[df['W']>0]
resultdf =df[df['W']<0]
resultdf
resultdf[['X','Z']]
df[df['W']<0][['X','Z']]
df.reset_index()
lst=['TN','AP','KA','MH','TS']
df['STATE']=lst
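Presumably the new STATE column is meant to become the index; a minimal sketch:
df.set_index('STATE')   # returns a re-indexed copy; pass inplace=True to modify df itself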
outside =['G1','G1','G1','G2','G2','G2']
inside =[1,2,3,1,2,3]
hier_index=list(zip(outside,inside))
hier_index=pd.MultiIndex.from_tuples(hier_index)
df =pd.DataFrame(randn(6,2),hier_index,['A','B'])
df.loc['G1']
df.loc['G1']['A']
df.index.names
df.index.names=['Groups','Num']
df.loc['G2'].loc[2]['B']
Cross Section
df.xs('G1')
df.xs(1,level='Num')
df.xs(('G1',2))
Missing data
d ={'A':[1,2,np.nan],'B':[5,np.nan,np.nan],'C':[1,2,3]}
df = pd.DataFrame(d)
df.dropna()           # drop rows containing any NaN
df.dropna(axis=1)     # drop columns containing any NaN
df.dropna(thresh=2)   # keep rows with at least 2 non-NaN values
Fill values
df.fillna(value=0)
df['A'].fillna(df['A'].mean())
df['A'].fillna(df['A'].mean(),inplace=True)
Grouping
d ={'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
'Person':['RAM','SHAM','SUNIL','SUDEEP','RAHEEM','SHEETAL'],
'Sales':[250,400,200,150,350,100]}
df =pd.DataFrame(d)
bycomp = df.groupby('Company')   # group rows by the Company column
bycomp.mean()
bycomp.max()
bycomp.std()
bycomp.min()
bycomp.sum()
bycomp.sum().loc['FB']
bycomp.describe()
bycomp.describe().transpose()
df.groupby('Company').describe().transpose()['FB']
Merging, joining, and concatenation
df1 =pd.DataFrame({'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3'],
'C':['C0','C1','C2','C3']},
index =[0,1,2,3])
df2=pd.DataFrame({'A':['A4','A5','A6','A7'],
'B':['B4','B5','B6','B7'],
'C':['C4','C5','C6','C7']},
index =[4,5,6,7])
df3=pd.DataFrame({'A':['A8','A9','A10','A11'],
'B':['B8','B9','B10','B11'],
'C':['C8','C9','C10','C11']},
index =[8,9,10,11])
Concatenate
pd.concat([df1, df2, df3])           # stack the frames vertically
pd.concat([df1, df2, df3], axis=1)   # align them side by side on the index
left =pd.DataFrame({'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right =pd.DataFrame({'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
pd.merge(left,right,how='inner',on='key')
emp = pd.DataFrame({'EMPNO': ['E001', 'E002', 'E003', 'E004'],
                    'ENAME': ['BABJEE', 'RAM', 'SUNIL', 'SHAM'],
                    'DEPTNO': [10, 10, 20, 30]})
dept = pd.DataFrame({'DNAME': ['Accounts', 'Admin', 'IT'], 'DEPTNO': [10, 20, 50]})
pd.merge(emp, dept, how='inner', on='DEPTNO')   # only DEPTNOs present in both survive
emp = pd.DataFrame({'EMPNO': ['E001', 'E002', 'E003', 'E004'],
                    'ENAME': ['BABJEE', 'RAM', 'SUNIL', 'SHAM']},
                   index=[10, 10, 20, 30])
dept = pd.DataFrame({'DNAME': ['Accounts', 'Admin', 'IT'],
                     'LOCATION': ['CHENNAI', 'MUMBAI', 'PUNE']},
                    index=[10, 20, 50])
emp.join(dept,how='inner')
emp.join(dept,how='outer')
df =pd.DataFrame({'Col1':[1,2,3,4],
'Col2':[444,555,666,444],
'Col3':['abc','def','ghi','xyz']})
df.head(2)
df.tail(2)
df['Col2'].unique()
len(df['Col2'].unique())
df['Col2'].nunique()
df['Col2'].value_counts()
df[df['Col1']>2]
df['Col1'].sum()
Custom functions
def times2(x):
    return x * 2

df['Col1'].apply(times2)   # apply a custom function element-wise
df['Col3'].apply(len)
df['Col2'].apply(lambda x: x *x)
df.drop('Col1',axis=1)
df.columns
df.index
df.sort_values(by='Col2',ascending=False)
df.isnull()
pwd   # IPython magic: print the current working directory
pd.read_csv('d:/demo/example.csv')
pd.read_excel('d:/demo/example.xlsx')
df.to_csv('d:/demo/myoutput.csv', index=False)
pd.read_excel('d:/demo/example.xlsx', sheet_name='Sheet1')
df.to_excel('d:/demo/example1.xlsx', sheet_name='Sheet2', index=False)
import pandas as pd
from sqlalchemy import create_engine

# connection string: dialect+driver://user:password@host:port/database
cnx = create_engine('mysql+pymysql://root:admin123@localhost:3306/demo').connect()
sql = 'SELECT * FROM mytable'   # hypothetical query string; substitute your own SQL
df = pd.read_sql(sql, cnx)
The pandas-datareader is a subpackage that allows one to create a DataFrame from various internet data sources, currently including:
Yahoo! Finance
Google Finance
St. Louis Fed (FRED)
Kenneth French's data library
World Bank
Google Analytics
import datetime as dt
import pandas_datareader.data as web

start = dt.datetime(2015, 1, 1)
end = dt.datetime(2015, 12, 31)
facebook = web.DataReader('FB', 'yahoo', start, end)
Datetime index
import pandas as pd
import numpy as np
from datetime import datetime

first_two = [datetime(2017, 1, 1), datetime(2017, 1, 2)]
dt_ind = pd.DatetimeIndex(first_two)
data = np.random.randn(2, 2)
df = pd.DataFrame(data, dt_ind, ['a', 'b'])
df.index.argmax()   # integer position of the latest timestamp
df.index.argmin()   # integer position of the earliest timestamp
df.index.max()      # the latest timestamp itself
Time resampling
df = pd.read_csv('d:/demo/walmart_stock.csv')
df.head()
df.info()
df['Date']=pd.to_datetime(df['Date'])
df.info()
df.set_index('Date',inplace=True)
df.index   # now a DatetimeIndex (index is an attribute, not a method)
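With a DatetimeIndex in place, the data can be resampled; a minimal sketch, assuming the Walmart file has numeric price columns including 'Close':
df.resample('A').mean()           # yearly (year-end) mean of every numeric column
df['Close'].resample('M').max()   # monthly maximum of the assumed 'Close' column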