Laptop Price Predictor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('laptop_data.csv')
df.head()
df.shape
(1303, 12)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 1303 non-null int64
1 Company 1303 non-null object
2 TypeName 1303 non-null object
3 Inches 1303 non-null float64
4 ScreenResolution 1303 non-null object
5 Cpu 1303 non-null object
6 Ram 1303 non-null object
7 Memory 1303 non-null object
8 Gpu 1303 non-null object
9 OpSys 1303 non-null object
10 Weight 1303 non-null object
11 Price 1303 non-null float64
dtypes: float64(2), int64(1), object(9)
memory usage: 122.3+ KB
df.duplicated().sum()
df.isnull().sum()
Unnamed: 0 0
Company 0
TypeName 0
Inches 0
ScreenResolution 0
Cpu 0
Ram 0
Memory 0
Gpu 0
OpSys 0
Weight 0
Price 0
dtype: int64
df.drop(columns=['Unnamed: 0'],inplace=True)
df.head()
df['Ram'] = df['Ram'].str.replace('GB','')
df['Weight'] = df['Weight'].str.replace('kg','')
df.head()
df['Ram'] = df['Ram'].astype('int32')
df['Weight'] = df['Weight'].astype('float32')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Company 1303 non-null object
1 TypeName 1303 non-null object
2 Inches 1303 non-null float64
3 ScreenResolution 1303 non-null object
4 Cpu 1303 non-null object
5 Ram 1303 non-null int32
6 Memory 1303 non-null object
7 Gpu 1303 non-null object
8 OpSys 1303 non-null object
9 Weight 1303 non-null float32
10 Price 1303 non-null float64
dtypes: float32(1), float64(2), int32(1), object(7)
memory usage: 101.9+ KB
sns.distplot(df['Price'])
<AxesSubplot:xlabel='Price', ylabel='Density'>
df['Company'].value_counts().plot(kind='bar')
<AxesSubplot:>
sns.barplot(x=df['Company'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()
df['TypeName'].value_counts().plot(kind='bar')
<AxesSubplot:>
sns.barplot(x=df['TypeName'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()
sns.distplot(df['Inches'])
<AxesSubplot:xlabel='Inches', ylabel='Density'>
sns.scatterplot(x=df['Inches'],y=df['Price'])
<AxesSubplot:xlabel='Inches', ylabel='Price'>
df['ScreenResolution'].value_counts()
Full HD 1920x1080 507
1366x768 281
IPS Panel Full HD 1920x1080 230
IPS Panel Full HD / Touchscreen 1920x1080 53
Full HD / Touchscreen 1920x1080 47
1600x900 23
Touchscreen 1366x768 16
Quad HD+ / Touchscreen 3200x1800 15
IPS Panel 4K Ultra HD 3840x2160 12
IPS Panel 4K Ultra HD / Touchscreen 3840x2160 11
4K Ultra HD / Touchscreen 3840x2160 10
Touchscreen 2560x1440 7
IPS Panel 1366x768 7
4K Ultra HD 3840x2160 7
IPS Panel Quad HD+ / Touchscreen 3200x1800 6
Touchscreen 2256x1504 6
IPS Panel Retina Display 2304x1440 6
IPS Panel Retina Display 2560x1600 6
IPS Panel Touchscreen 2560x1440 5
IPS Panel 2560x1440 4
IPS Panel Retina Display 2880x1800 4
IPS Panel Touchscreen 1920x1200 4
1440x900 4
Quad HD+ 3200x1800 3
IPS Panel Quad HD+ 2560x1440 3
1920x1080 3
Touchscreen 2400x1600 3
IPS Panel Touchscreen 1366x768 3
2560x1440 3
IPS Panel Full HD 2160x1440 2
IPS Panel Touchscreen / 4K Ultra HD 3840x2160 2
IPS Panel Quad HD+ 3200x1800 2
Touchscreen / Full HD 1920x1080 1
IPS Panel Retina Display 2736x1824 1
IPS Panel Full HD 1920x1200 1
IPS Panel Full HD 1366x768 1
Touchscreen / 4K Ultra HD 3840x2160 1
IPS Panel Touchscreen 2400x1600 1
IPS Panel Full HD 2560x1440 1
Touchscreen / Quad HD+ 3200x1800 1
Name: ScreenResolution, dtype: int64
df.sample(5)
ScreenResolution \
1154 IPS Panel Touchscreen / 4K Ultra HD 3840x2160
750 Touchscreen 1366x768
1246 1366x768
879 Full HD 1920x1080
1021 Full HD 1920x1080
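The cell that derives the Touchscreen flag used below is missing from this export; a minimal sketch, assuming the flag is read off the ScreenResolution text:
df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)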
df['Touchscreen'].value_counts().plot(kind='bar')
<AxesSubplot:>
sns.barplot(x=df['Touchscreen'],y=df['Price'])
<AxesSubplot:xlabel='Touchscreen', ylabel='Price'>
df.head()
Company TypeName Inches ScreenResolution \
0 Apple Ultrabook 13.3 IPS Panel Retina Display 2560x1600
1 Apple Ultrabook 13.3 1440x900
2 HP Notebook 15.6 Full HD 1920x1080
3 Apple Ultrabook 15.4 IPS Panel Retina Display 2880x1800
4 Apple Ultrabook 13.3 IPS Panel Retina Display 2560x1600
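The cell creating the Ips flag is likewise missing; a sketch under the same assumption:
df['Ips'] = df['ScreenResolution'].apply(lambda x: 1 if 'IPS' in x else 0)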
df['Ips'].value_counts().plot(kind='bar')
<AxesSubplot:>
sns.barplot(x=df['Ips'],y=df['Price'])
<AxesSubplot:xlabel='Ips', ylabel='Price'>
new = df['ScreenResolution'].str.split('x',n=1,expand=True)
df['X_res'] = new[0]
df['Y_res'] = new[1]
df.sample(5)
X_res Y_res
141 IPS Panel Full HD 1920 1080
1055 1366 768
75 Full HD 1920 1080
984 1366 768
337 Full HD 1920 1080
df['X_res'] = df['X_res'].str.replace(',','').str.findall(r'(\d+\.?\d+)').apply(lambda x:x[0])
df.head()
X_res Y_res
0 2560 1600
1 1440 900
2 1920 1080
3 2880 1800
4 2560 1600
df['X_res'] = df['X_res'].astype('int')
df['Y_res'] = df['Y_res'].astype('int')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Company 1303 non-null object
1 TypeName 1303 non-null object
2 Inches 1303 non-null float64
3 ScreenResolution 1303 non-null object
4 Cpu 1303 non-null object
5 Ram 1303 non-null int32
6 Memory 1303 non-null object
7 Gpu 1303 non-null object
8 OpSys 1303 non-null object
9 Weight 1303 non-null float32
10 Price 1303 non-null float64
11 Touchscreen 1303 non-null int64
12 Ips 1303 non-null int64
13 X_res 1303 non-null int32
14 Y_res 1303 non-null int32
dtypes: float32(1), float64(2), int32(3), int64(2), object(7)
memory usage: 132.5+ KB
df.corr()['Price']
Inches 0.068197
Ram 0.743007
Weight 0.210370
Price 1.000000
Touchscreen 0.191226
Ips 0.252208
X_res 0.556529
Y_res 0.552809
Name: Price, dtype: float64
df['ppi'] = (((df['X_res']**2) + (df['Y_res']**2))**0.5/df['Inches']).astype('float')
df.corr()['Price']
Inches 0.068197
Ram 0.743007
Weight 0.210370
Price 1.000000
Touchscreen 0.191226
Ips 0.252208
X_res 0.556529
Y_res 0.552809
ppi 0.473487
Name: Price, dtype: float64
df.drop(columns=['ScreenResolution'],inplace=True)
df.head()
df.drop(columns=['Inches','X_res','Y_res'],inplace=True)
df.head()
ppi
0 226.983005
1 127.677940
2 141.211998
3 220.534624
4 226.983005
df['Cpu'].value_counts()
df.head()
def fetch_processor(text):
    if text == 'Intel Core i7' or text == 'Intel Core i5' or text == 'Intel Core i3':
        return text
    else:
        if text.split()[0] == 'Intel':
            return 'Other Intel Processor'
        else:
            return 'AMD Processor'
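The cells that build the 'Cpu Name' and 'Cpu brand' columns used below are missing from this export; a minimal reconstruction, assuming 'Cpu Name' is the first three tokens of the Cpu string:
df['Cpu Name'] = df['Cpu'].apply(lambda x: " ".join(x.split()[0:3]))
df['Cpu brand'] = df['Cpu Name'].apply(fetch_processor)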
df.head()
df['Cpu brand'].value_counts().plot(kind='bar')
<AxesSubplot:>
sns.barplot(x=df['Cpu brand'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()
df.drop(columns=['Cpu','Cpu Name'],inplace=True)
df.head()
df['Ram'].value_counts().plot(kind='bar')
<AxesSubplot:>
sns.barplot(x=df['Ram'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()
df['Memory'].value_counts()
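The cells that normalize the Memory strings and build the Layer1*/Layer2* indicator columns referenced below are missing from this export; the sketch that follows is an assumed reconstruction written so that the surviving lines can run:
# Assumed reconstruction of the missing Memory-preprocessing cell:
# strip units, split each string on '+' into two storage layers, and flag the
# storage type of each layer before reducing it to a bare capacity number.
df['Memory'] = (df['Memory'].astype(str).str.replace(r'\.0', '', regex=True)
                            .str.replace('GB', '').str.replace('TB', '000'))
new = df['Memory'].str.split('+', n=1, expand=True)
new[1] = new[1].fillna('0')

for layer, col in [('Layer1', 0), ('Layer2', 1)]:
    df[layer + 'HDD'] = new[col].str.contains('HDD').astype(int)
    df[layer + 'SSD'] = new[col].str.contains('SSD').astype(int)
    df[layer + 'Hybrid'] = new[col].str.contains('Hybrid').astype(int)
    df[layer + 'Flash_Storage'] = new[col].str.contains('Flash Storage').astype(int)

# keep only the numeric capacity so the astype(int) casts below succeed
new[0] = new[0].str.replace(r'\D', '', regex=True)
new[1] = new[1].str.replace(r'\D', '', regex=True)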
df["first"]= new[0]
df["first"]=df["first"].str.strip()
df["second"]= new[1]
df["first"] = df["first"].astype(int)
df["second"] = df["second"].astype(int)
df["HDD"]=(df["first"]*df["Layer1HDD"]+df["second"]*df["Layer2HDD"])
df["SSD"]=(df["first"]*df["Layer1SSD"]+df["second"]*df["Layer2SSD"])
df["Hybrid"]=(df["first"]*df["Layer1Hybrid"]
+df["second"]*df["Layer2Hybrid"])
df["Flash_Storage"]=(df["first"]*df["Layer1Flash_Storage"]
+df["second"]*df["Layer2Flash_Storage"])
df.sample(5)
df.drop(columns=['Memory'],inplace=True)
df.head()
(wide df.head() output truncated in this export; row 0: Apple Ultrabook, Ram 8, Intel Iris Plus Graphics 640, macOS, Weight 1.37, with the new Flash_Storage column reading 0, 128, 0, 0, 0 for the first five rows)
df.corr()['Price']
Ram 0.743007
Weight 0.210370
Price 1.000000
Touchscreen 0.191226
Ips 0.252208
ppi 0.473487
HDD -0.096441
SSD 0.670799
Hybrid 0.007989
Flash_Storage -0.040511
Name: Price, dtype: float64
df.drop(columns=['Hybrid','Flash_Storage'],inplace=True)
df.head()
(wide df.head() output truncated in this export; row 0: Apple Ultrabook, Ram 8, Intel Iris Plus Graphics 640, macOS, Weight 1.37)
df['Gpu'].value_counts()
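The cell extracting the GPU brand is missing from this export; a reconstruction consistent with the head() output below:
df['Gpu brand'] = df['Gpu'].apply(lambda x: x.split()[0])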
df.head()
Gpu brand
0 Intel
1 Intel
2 Intel
3 AMD
4 Intel
df['Gpu brand'].value_counts()
Intel 722
Nvidia 400
AMD 180
ARM 1
Name: Gpu brand, dtype: int64
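Between the two value_counts outputs the single ARM row is removed; that cell is missing from the export, so this is an assumed reconstruction:
df = df[df['Gpu brand'] != 'ARM']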
df['Gpu brand'].value_counts()
Intel 722
Nvidia 400
AMD 180
Name: Gpu brand, dtype: int64
sns.barplot(x=df['Gpu brand'],y=df['Price'],estimator=np.median)
plt.xticks(rotation='vertical')
plt.show()
df.drop(columns=['Gpu'],inplace=True)
df.head()
df['OpSys'].value_counts()
Windows 10 1072
No OS 66
Linux 62
Windows 7 45
Chrome OS 26
macOS 13
Windows 10 S 8
Mac OS X 8
Android 2
Name: OpSys, dtype: int64
sns.barplot(x=df['OpSys'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()
def cat_os(inp):
    if inp == 'Windows 10' or inp == 'Windows 7' or inp == 'Windows 10 S':
        return 'Windows'
    elif inp == 'macOS' or inp == 'Mac OS X':
        return 'Mac'
    else:
        return 'Others/No OS/Linux'
df['os'] = df['OpSys'].apply(cat_os)
df.head()
df.drop(columns=['OpSys'],inplace=True)
sns.barplot(x=df['os'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()
sns.distplot(df['Weight'])
<AxesSubplot:xlabel='Weight', ylabel='Density'>
sns.scatterplot(x=df['Weight'],y=df['Price'])
<AxesSubplot:xlabel='Weight', ylabel='Price'>
df.corr()['Price']
Ram 0.742905
Weight 0.209867
Price 1.000000
Touchscreen 0.192917
Ips 0.253320
ppi 0.475368
HDD -0.096891
SSD 0.670660
Name: Price, dtype: float64
sns.heatmap(df.corr())
<AxesSubplot:>
sns.distplot(np.log(df['Price']))
<AxesSubplot:xlabel='Price', ylabel='Density'>
X = df.drop(columns=['Price'])
y = np.log(df['Price'])
0 11.175755
1 10.776777
2 10.329931
3 11.814476
4 11.473101
...
1298 10.433899
1299 11.288115
1300 9.409283
1301 10.614129
1302 9.886358
Name: Price, Length: 1302, dtype: float64
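The train/test split cell is missing from this export; a minimal sketch in which test_size=0.15 and random_state=2 are assumptions, not the notebook's original values:
from sklearn.model_selection import train_test_split

# hold out a test set for evaluating the pipelines below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=2)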
X_train
Linear Regression
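The import cell for the modelling section is also missing; the list below is reconstructed from the estimators and metrics used in the cells that follow:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor,
                              AdaBoostRegressor, GradientBoostingRegressor)
from xgboost import XGBRegressor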
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
step2 = LinearRegression()
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8073277448418521
MAE 0.21017827976429174
Ridge Regression
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
step2 = Ridge(alpha=10)
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8127331031311811
MAE 0.20926802242582954
Lasso Regression
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
step2 = Lasso(alpha=0.001)
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8071853945317105
MAE 0.21114361613472565
KNN
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
step2 = KNeighborsRegressor(n_neighbors=3)
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8021984604448553
MAE 0.19319716721521116
Decision Tree
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
step2 = DecisionTreeRegressor(max_depth=8)
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8466456692979233
MAE 0.1806340977609143
SVM
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
step2 = SVR(kernel='rbf',C=10000,epsilon=0.1)
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8083180902257614
MAE 0.20239059427481307
Random Forest
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
step2 = RandomForestRegressor(n_estimators=100,
random_state=3,
max_samples=0.5,
max_features=0.75,
max_depth=15)
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8873402378382488
MAE 0.15860130110457718
ExtraTrees
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
step2 = ExtraTreesRegressor(n_estimators=100,
random_state=3,
max_samples=0.5,
max_features=0.75,
max_depth=15)
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8753793123440623
MAE 0.15979519126758127
AdaBoost
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
step2 = AdaBoostRegressor(n_estimators=15,learning_rate=1.0)
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.7929652659237908
MAE 0.23296532406396742
Gradient Boost
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
step2 = GradientBoostingRegressor(n_estimators=500)
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8823244736036472
MAE 0.15929506744611283
XGBoost
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
step2 = XGBRegressor(n_estimators=45,max_depth=5,learning_rate=0.5)
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8811773435850243
MAE 0.16496203512600974
Voting Regressor
from sklearn.ensemble import VotingRegressor,StackingRegressor
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
rf = RandomForestRegressor(n_estimators=350,random_state=3,max_samples=0.5,max_features=0.75,max_depth=15)
gbdt = GradientBoostingRegressor(n_estimators=100,max_features=0.5)
xgb = XGBRegressor(n_estimators=25,learning_rate=0.3,max_depth=5)
et = ExtraTreesRegressor(n_estimators=100,random_state=3,max_samples=0.5,max_features=0.75,max_depth=10)
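# NOTE: the line building the VotingRegressor itself is missing from this export;
# the estimator names below (and the absence of weights) are assumptions made so
# the pipeline has a step2 to fit.
step2 = VotingRegressor([('rf', rf), ('gbdt', gbdt), ('xgb', xgb), ('et', et)])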
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8901036732986811
MAE 0.15847265699907628
Stacking
from sklearn.ensemble import VotingRegressor,StackingRegressor
step1 = ColumnTransformer(transformers=[
('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])
],remainder='passthrough')
estimators = [
    ('rf', RandomForestRegressor(n_estimators=350,random_state=3,max_samples=0.5,max_features=0.75,max_depth=15)),
    ('gbdt', GradientBoostingRegressor(n_estimators=100,max_features=0.5)),
    ('xgb', XGBRegressor(n_estimators=25,learning_rate=0.3,max_depth=5))
]
step2 = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=100))
pipe = Pipeline([
('step1',step1),
('step2',step2)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))
R2 score 0.8816958647512341
MAE 0.1663048975120589
import pickle

pickle.dump(df,open('df.pkl','wb'))
pickle.dump(pipe,open('pipe.pkl','wb'))
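For completeness, a minimal sketch of reloading the saved pipeline for a single prediction; the query columns mirror X above, the sample values are invented, and np.exp undoes the log applied to the target:
import pickle
import numpy as np
import pandas as pd

pipe = pickle.load(open('pipe.pkl', 'rb'))
# one hypothetical laptop, in the same column order used for training
query = pd.DataFrame([['Dell', 'Notebook', 8, 1.8, 0, 1, 141.21,
                       'Intel Core i5', 1000, 256, 'Intel', 'Windows']],
                     columns=['Company', 'TypeName', 'Ram', 'Weight', 'Touchscreen',
                              'Ips', 'ppi', 'Cpu brand', 'HDD', 'SSD', 'Gpu brand', 'os'])
print(np.exp(pipe.predict(query))[0])   # back-transform from log(price)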
df
os
0 Mac
1 Mac
2 Others/No OS/Linux
3 Mac
4 Mac
... ...
1298 Windows
1299 Windows
1300 Windows
1301 Windows
1302 Windows
X_train