SOURCE CODE Telecom
SOURCE CODE Telecom
SOURCE CODE Telecom
setwd("F:/project")
> str(Telecom)
$ AccountWeeks : num 128 107 137 84 75 118 121 147 117 141 ...
$ RoamMins : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
> dim(Telecom)
[1] 3333 11
> sum(is.na(Telecom))
[1] 0
[1] 1
[1] 243
> summary(Telecom)
Min. :0.0000 Min. :0.000 Min. : 0.0 Min. : 0.0 Min. : 14.00
1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:143.7 1st Qu.: 87.0 1st Qu.: 45.00
Median :0.0000 Median :1.000 Median :179.4 Median :101.0 Median : 53.50
Mean :0.8165 Mean :1.563 Mean :179.8 Mean :100.4 Mean : 56.31
3rd Qu.:1.7800 3rd Qu.:2.000 3rd Qu.:216.4 3rd Qu.:114.0 3rd Qu.: 66.20
Max. :5.4000 Max. :9.000 Max. :350.8 Max. :165.0 Max. :111.30
OverageFee RoamMins
> boxplot(Telecom)
> par(mfrow=c(3,3))
> hist(Telecom$AccountWeeks)
> hist(Telecom$DataUsage)
> hist(Telecom$CustServCalls)
> hist(Telecom$DayMins)
> hist(Telecom$DayCalls)
> hist(Telecom$DataUsage)
> hist(Telecom$MonthlyCharge)
> hist(Telecom$OverageFee)
> hist(Telecom$RoamMins)
> attach(Telecom)
> library(corrplot)
> corrplot(cor(Telecom))
> qplot(AccountWeeks,DataUsage,data=Telecom)
> qplot(AccountWeeks,DayMins,data=Telecom)
> qplot(AccountWeeks,MonthlyCharge,data=Telecom)
> qplot(AccountWeeks,OverageFee,data=Telecom)
> qplot(AccountWeeks,RoamMins,data=Telecom)
> hist(Churn,CustServCalls,Telecom)
Error in freq && !equidist : invalid 'x' type in 'x && y'
> str(Telecom)
$ AccountWeeks : num 128 107 137 84 75 118 121 147 117 141 ...
$ RoamMins : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
> churnrate
0 1
0.8550855 0.1449145
> library(class)
> library(gmodels)
> library(datasets)
> summary(Telecom)
Min. :0.0000 Min. : 1.0 Min. :0.0000 Min. :0.0000 Min. :0.000
1st Qu.:0.0000 1st Qu.: 74.0 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:1.000
Median :0.0000 Median :101.0 Median :1.0000 Median :0.0000 Median :1.000
Mean :0.1449 Mean :101.1 Mean :0.9031 Mean :0.2766 Mean :1.563
3rd Qu.:0.0000 3rd Qu.:127.0 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:2.000
Max. :1.0000 Max. :243.0 Max. :1.0000 Max. :1.0000 Max. :9.000
Min. : 0.0 Min. : 0.0 Min. : 14.00 Min. : 0.00 Min. : 0.00
1st Qu.:143.7 1st Qu.: 87.0 1st Qu.: 45.00 1st Qu.: 8.33 1st Qu.: 8.50
Median :179.4 Median :101.0 Median : 53.50 Median :10.07 Median :10.30
Mean :179.8 Mean :100.4 Mean : 56.31 Mean :10.05 Mean :10.24
3rd Qu.:216.4 3rd Qu.:114.0 3rd Qu.: 66.20 3rd Qu.:11.77 3rd Qu.:12.10
Max. :350.8 Max. :165.0 Max. :111.30 Max. :18.19 Max. :20.00
> # Seed the RNG before drawing the partition: createDataPartition() samples
> # row indices at random, so seeding afterwards leaves the split irreproducible.
> set.seed(123)
> mydata=createDataPartition(y=Telecom$Churn,p=0.7,list=FALSE)
> trainData=Telecom[mydata,]
> testData=Telecom[-mydata,]
> dim(trainData)
[1] 2334 10
> dim(testData)
[1] 999 10
> print(sum(trainData$Churn=="1")/nrow(trainData))
[1] 0.1452442
> lmodel=glm(Churn~.,family = binomial(link = "logit"),data=trainData)
> print(summary(lmodel))
Call:
Deviance Residuals:
Coefficients:
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
AIC: 1535.6
Number of Fisher Scoring iterations: 6
> vif(lmodel)
> lrtest(lmodel)
Model 2: Churn ~ 1
1 10 -757.81
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Response: Churn
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> # R is case-sensitive: the pseudo-R2 helper is pR2() from pscl, not pr2().
> # Namespace-qualified so the call does not depend on library(pscl) being loaded yet.
> pscl::pR2(lmodel)
> ROCRpred=prediction(predTest,testData$Churn)
> print("Confusion Matrix for Logistic Regression"); table(testing$Churn, fitted.results > 0.5)
FALSE TRUE
0 827 28
1 112 32
> # prediction() needs numeric scores, not the fitted glm object, so score the
> # hold-out rows with the model first and pass the predicted probabilities.
> ROCRpred=prediction(predict(lmodel,newdata=testData,type="response"),testData$Churn)
> summary(lmodel2)
Call:
data = trainData)
Deviance Residuals:
Coefficients:
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
AIC: 1570.7
Model 2: Churn ~ 1
1 8 -777.35
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> exp(coefficients(lmodel2))
> exp(coefficients(lmodel))
> library(pscl)
> vif(lmodel2)
OverageFee RoamMins
1.027652 1.013871
OR 2.5 % 97.5 %
> pR2(lmodel2)
> confusionMatrix(trainData$Churn)
Error in is.factor(reference) :
> predictglm=predict(lmodel2,type="response",newdata=training)
> predictglm
1 2 3 4 5 6 7
8 9 10 11 12 13 14
15 16 17 18 19 20 21
22 23 24 25 26 27 28
29 30 31 32 33 34 35
36 37 38 39 40 41 42
50 51 52 53 54 55 56
57 58 59 60 61 62 63
64 65 66 67 68 69 70
71 72 73 74 75 76 77
78 79 80 81 82 83 84
85 86 87 88 89 90 91
92 93 94 95 96 97 98
> testing$Churn=as.character(testing$Churn)
> fitted.results=predict(lmodel,newdata=testing,type='response')
> fitted.results=ifelse(fitted.results > 0.5,1,0)
> print("Confusion Matrix for Logistic Regression"); table(testing$Churn, fitted.results > 0.5)
FALSE TRUE
0 827 28
1 112 32
> sum(Telecom$Churn=="1")/nrow(Telecom)
[1] 0.1449145
> str(Telecom)
$ AccountWeeks : num 128 107 137 84 75 118 121 147 117 141 ...
$ RoamMins : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
> Trainknn=trainData
> Testknn=testData
> TrainBayes=trainData
> TestBayes=testData
> print(summary(lmodel))
Call:
Deviance Residuals:
Coefficients:
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
AIC: 1535.6
> view(lmodel2)
> print(summary(lmodel2))
Call:
data = trainData)
Deviance Residuals:
Coefficients:
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
AIC: 1570.7
[1] 2334 10
> dim(Testknn)
[1] 999 10
> head(Trainknn)
# A tibble: 6 x 10
40 84 0 0 2 299. 71
50 75 0 0 3 167. 113
60 118 0 0 0 223. 98
# ... with 3 more variables: MonthlyCharge <dbl>, OverageFee <dbl>, RoamMins <dbl>
+ test = Testknn,
+ cl= Telecom_train_labels,
+ k = 3,
+ prob=TRUE)
> # knn() requires cl to hold one label per training row. Trainknn[1] is a
> # one-column tibble (length 1), so extract the label column itself with [[ ]].
> # NOTE(review): column 1 is presumed to be Churn - confirm against str(Trainknn).
> pred=knn(Trainknn[-1],Testknn[-1],Trainknn[[1]],k=19)
> sum(diag(table.knn)/sum(table.knn))
> dim(TrainBayes)
[1] 2334 10
> dim(TestBayes)
[1] 999 10
> train.NB=train.NB[-c(8,10)]
> train.NB=train.NB[-c(8,9)]
> test.NB=test.NB[-c(8,9)]
> train.NB$Churn=as.factor(train.NB$Churn)
> test.NB$Churn=as.factor(test.NB$Churn)
> NB
Call:
A-priori probabilities:
TrainBayes$Churn
0 1
0.8547558 0.1452442
Conditional probabilities:
AccountWeeks
TrainBayes$Churn [,1] [,2]
0 101.1975 39.29269
1 101.9794 39.24309
ContractRenewal
0 0.9328321 0.2503753
1 0.7109145 0.4540078
DataPlan
0 0.2972431 0.4571591
1 0.1563422 0.3637163
CustServCalls
0 1.445614 1.142954
1 2.259587 1.809865
DayMins
0 175.3046 49.97021
1 206.3634 69.97969
DayCalls
0 100.5719 19.50113
1 101.4926 22.51053
MonthlyCharge
1 58.84041 16.13284
OverageFee
0 9.928416 2.503162
1 10.614248 2.594425
RoamMins
0 10.18050 2.76674
1 10.76047 2.84053
> y_pred.NB=predict(NB,newdata=TestBayes[-1])
> tab.NB=table(test.NB[,1],y_pred.NB)
> tab.NB
[1] 0.8358358
> mean(predNB==TestBayes$Churn)
[1] 0.8438438
> tab.NB=table(test.NB[,1],y_pred.NB)
> CM
Reference
Prediction 0 1
0 785 86
1 70 58
Accuracy : 0.8438
Kappa : 0.3365
Sensitivity : 0.9181
Specificity : 0.4028
Prevalence : 0.8559
'Positive' Class : 0
> CM2=confusionMatrix(predNB1,TrainBayes[["Churn"]])
> CM2
Reference
Prediction 0 1
0 1837 192
1 158 147
Accuracy : 0.85
Kappa : 0.3698
Sensitivity : 0.9208
Specificity : 0.4336
Prevalence : 0.8548
'Positive' Class : 0
> # Min-max rescale a numeric vector x onto [0, 1].
> # A constant vector has zero range, so (x-min)/(max-min) would be 0/0 = NaN;
> # in that case every element is mapped to 0 instead. Other inputs (including
> # vectors containing NA) behave exactly as before.
> normalize=function(x){
+   lo=min(x)
+   span=max(x)-lo
+   if(!is.na(span) && span==0) return(rep(0,length(x)))
+   (x-lo)/span
+ }
> Telecom$norm.Churn=normalize(Churn)
> Telecom$norm.Accountweeks=normalize(AccountWeeks)
> Telecom$norm.daycalls=normalize(DayCalls)
> Telecom$norm.daymins=normalize(DayMins)
> Telecom$norm.overagefee=normalize(OverageFee)
> Telecom$norm.contractrenewal=normalize(ContractRenewal)
> Telecom$norm.dataplan=normalize(DataPlan)
> Telecom$norm.datausage=normalize(DataUsage)
> Telecom$norm.Cust=normalize(CustServCalls)
> Telecom$norm.monthlycharge=normalize(MonthlyCharge)
> Telecom$norm.roammins=normalize(RoamMins)
> view(Telecom)
> Telecom
# A tibble: 3,333 x 21
40 84 0 0 2 299. 71
50 75 0 0 3 167. 113
60 118 0 0 0 223. 98
70 121 1 1 3 218. 88
80 147 0 0 0 157 79
90 117 1 0 1 184. 97
10 0 141 0 1 0 259. 84
# ... with 3,323 more rows, and 14 more variables: MonthlyCharge <dbl>,
> set.seed(1234)
> test1=Telecom[pd==2,]
> # Telecom has 21 columns here (10 original + 11 norm.*), so position 22 does
> # not exist and c(1,13:22) fails. Select Churn plus the normalized predictors
> # by name instead; norm.Churn is left out because it is only the rescaled target.
> norm.cols=setdiff(grep("^norm\\.",names(Telecom),value=TRUE),"norm.Churn")
> train.NB=train1[,c("Churn",norm.cols)]
> test.NB=test1[,c("Churn",norm.cols)]
> str(train.NB)
> table(train.NB$Churn)
0 1
2034 328
> table(test.NB$Churn)
0 1
816 155