Janani Prakash Loan Prediction Study
29.03.2019
─
Janani Prakash
PGPBABI-Online
GreatLearning, Great Lakes Institute of Management
Project Objective 3
>dim(Loan)
[1] 20000 40
The dim function returns the dimensions (rows and columns) of the dataset.
>names(Loan)
[1] "CUST_ID" "TARGET"
[3] "AGE" "GENDER"
[5] "BALANCE" "OCCUPATION"
[7] "AGE_BKT" "SCR"
[9] "HOLDING_PERIOD" "ACC_TYPE"
[11] "ACC_OP_DATE" "LEN_OF_RLTN_IN_MNTH"
[13] "NO_OF_L_CR_TXNS" "NO_OF_L_DR_TXNS"
[15] "TOT_NO_OF_L_TXNS" "NO_OF_BR_CSH_WDL_DR_TXNS"
[17] "NO_OF_ATM_DR_TXNS" "NO_OF_NET_DR_TXNS"
[19] "NO_OF_MOB_DR_TXNS" "NO_OF_CHQ_DR_TXNS"
[21] "FLG_HAS_CC" "AMT_ATM_DR"
[23] "AMT_BR_CSH_WDL_DR" "AMT_CHQ_DR"
[25] "AMT_NET_DR" "AMT_MOB_DR"
[27] "AMT_L_DR" "FLG_HAS_ANY_CHGS"
[29] "AMT_OTH_BK_ATM_USG_CHGS" "AMT_MIN_BAL_NMC_CHGS"
[31] "NO_OF_IW_CHQ_BNC_TXNS" "NO_OF_OW_CHQ_BNC_TXNS"
[33] "AVG_AMT_PER_ATM_TXN" "AVG_AMT_PER_CSH_WDL_TXN"
[35] "AVG_AMT_PER_CHQ_TXN" "AVG_AMT_PER_NET_TXN"
[37] "AVG_AMT_PER_MOB_TXN" "FLG_HAS_NOMINEE"
[39] "FLG_HAS_OLD_LOAN" "random"
From the names function, the column names of the dataset can be found.
> head(Loan,5)
CUST_ID TARGET AGE GENDER BALANCE OCCUPATION AGE_BKT SCR
1 C7927 0 27 M 3383.75 SELF-EMP 26-30 776
2 C6877 0 47 M 287489.04 SAL 46-50 324
3 C19922 0 40 M 18216.88 SELF-EMP 36-40 603
4 C8183 0 53 M 71720.48 SAL >50 196
5 C12123 0 36 M 1671622.89 PROF 36-40 167
HOLDING_PERIOD ACC_TYPE ACC_OP_DATE LEN_OF_RLTN_IN_MNTH
NO_OF_L_CR_TXNS
1 30 SA 3/23/2005 146 7
2 28 SA 10-11-08 104 8
3 2 SA 4/26/2012 61 10
4 13 CA 07-04-08 107 36
5 24 SA 12/29/2001 185 20
NO_OF_L_DR_TXNS TOT_NO_OF_L_TXNS
NO_OF_BR_CSH_WDL_DR_TXNS
1 3 10 0
2 2 10 0
3 5 15 1
4 14 50 4
5 1 21 1
NO_OF_ATM_DR_TXNS NO_OF_NET_DR_TXNS NO_OF_MOB_DR_TXNS
NO_OF_CHQ_DR_TXNS
1 1 2 0 0
2 1 1 0 0
3 1 1 0 2
4 2 3 1 4
5 0 0 0 0
FLG_HAS_CC AMT_ATM_DR AMT_BR_CSH_WDL_DR AMT_CHQ_DR
AMT_NET_DR AMT_MOB_DR
1 0 13100 0 0 973557 0
2 0 6600 0 0 799813 0
3 0 11200 561120 49320 997570 0
4 0 26100 673590 60780 741506 71388
5 0 0 808480 0 0 0
AMT_L_DR FLG_HAS_ANY_CHGS AMT_OTH_BK_ATM_USG_CHGS
AMT_MIN_BAL_NMC_CHGS
1 986657 0 0 0
2 806413 1 0 0
3 1619210 1 0 0
4 1573364 0 0 0
5 808480 0 0 0
NO_OF_IW_CHQ_BNC_TXNS NO_OF_OW_CHQ_BNC_TXNS
AVG_AMT_PER_ATM_TXN
1 0 0 13100
2 0 0 6600
3 0 1 11200
4 0 0 13050
5 0 0 0
AVG_AMT_PER_CSH_WDL_TXN AVG_AMT_PER_CHQ_TXN
AVG_AMT_PER_NET_TXN
1 0.0 0 486778.5
2 0.0 0 799813.0
3 561120.0 24660 997570.0
4 168397.5 15195 247168.7
5 808480.0 0 0.0
AVG_AMT_PER_MOB_TXN FLG_HAS_NOMINEE FLG_HAS_OLD_LOAN
random
1 0 1 1 0.000011400
2 0 1 0 0.000111373
3 0 1 1 0.000119954
4 71388 1 0 0.000136825
5 0 1 0 0.000173976
>table(TARGET)
TARGET
0 1
17488 2512
The table function of the target shows that 2512 were the number of responders
and 17488 were the number of non responders out of 20000 customers who
were contacted by the bank.
>fancyRpartPlot(m1)
The fancyRpartPlot plots a fancy RPart decision tree using the pretty rpart plotter.
>printcp(m2)
Classification tree:
rpart(formula = cart.train$TARGET ~ ., data = cart.train[, -c(1,
11)], method = "class", control = r1.ctrl)
From the above result, it can be seen that the xerror value decreases till cp =
0.0017 and steadily increases thereafter.
>plotcp(m2)
Ranking Code
>tmp_DT = data.table(cart.train)
>rank <- tmp_DT[, list(cnt=length(TARGET),
cnt_resp=sum(TARGET==1),
cnt_non_resp=sum(TARGET==0)
), by=deciles][order(-deciles)]
>rank$rrate <- round(rank$cnt_resp / rank$cnt,4);
>rank$cum_resp <- cumsum(rank$cnt_resp)
>rank$cum_non_resp <- cumsum(rank$cnt_non_resp)
>rank$cum_rel_resp <- round(rank$cum_resp / sum(rank$cnt_resp),4);
>rank$cum_rel_non_resp <- round(rank$cum_non_resp /
>sum(rank$cnt_non_resp),4);
>rank$ks <- abs(rank$cum_rel_resp - rank$cum_rel_non_resp) * 100;
>rank$rrate <- percent(rank$rrate)
>rank$cum_rel_resp <- percent(rank$cum_rel_resp)
>rank$cum_rel_non_resp <- percent(rank$cum_rel_non_resp)
>rank
Interpretation:
● The response rate in top deciles is above 26.3%.
● The KS is around 23.9%, indicating that this is not a good model.
KS and Area under Curve
>plot(perf)
>KS
[1] 0.2456805
>auc
[1] 0.6350559
>gini
[1] 0.2192173
>printcp(m2)
Classification tree:
rpart(formula = TARGET ~ ., data = cart.train.over[, -c(1, 11,
41, 42, 43)], method = "class", control = r1.ctrl)
n= 24544
>plotcp(m2)
n= 24544
Deciling Code
# Assign each element of x to a decile bucket: 1 = lowest 10%, 10 = top 10%.
#
# Args:
#   x: numeric vector of model scores; NA inputs propagate to NA buckets.
#
# Returns:
#   A vector the same length as x with bucket labels in 1..10.
decile <- function(x) {
  # Decile cut points (10th .. 90th percentile). A single vectorised
  # quantile() call avoids the fragile deciles[i * 10] indexing of the
  # original loop, where i came from seq(0.1, 1, .1) and the fractional
  # product was silently truncated when used as an index.
  breaks <- quantile(x, seq_len(9) / 10, na.rm = TRUE)
  # findInterval() reproduces the nested-ifelse boundaries exactly:
  # x < breaks[1] -> 1, breaks[k] <= x < breaks[k + 1] -> k + 1,
  # x >= breaks[9] -> 10.
  findInterval(x, breaks) + 1
}
Ranking Code
>tmp_DT = data.table(cart.train.over)
>rank <- tmp_DT[, list(cnt=length(TARGET),
cnt_resp=sum(TARGET==1),
cnt_non_resp=sum(TARGET==0)
), by=deciles][order(-deciles)]
>rank$rrate <- round(rank$cnt_resp / rank$cnt,4);
>rank$cum_resp <- cumsum(rank$cnt_resp)
>rank$cum_non_resp <- cumsum(rank$cnt_non_resp)
>rank$cum_rel_resp <- round(rank$cum_resp / sum(rank$cnt_resp),4);
>rank$cum_rel_non_resp <- round(rank$cum_non_resp /
sum(rank$cnt_non_resp),4);
>rank$ks <- abs(rank$cum_rel_resp - rank$cum_rel_non_resp) * 100;
>rank$rrate <- percent(rank$rrate)
>rank$cum_rel_resp <- percent(rank$cum_rel_resp)
>rank$cum_rel_non_resp <- percent(rank$cum_rel_non_resp)
>rank
>plot(perf)
>KS
[1] 0.7202575
>auc
[1] 0.9162399
>gini
[1] 0.4162399
Summary - CART Model Performance(Oversampled Training Dataset):
● The KS = 72.18% and the AUC = 91.46%, which indicates a good
model.
● The Gini Coefficient = 41.46% also indicates a good model, with
scope for improvement.
● Confusion matrix:
1. Accuracy = (10106 + 11025)/(10106 + 1247 + 2166 + 11025) =
86.09%
2. Classification Error Rate = 1 - Accuracy = 13.91%
Ranking Code
>tmp_DT = data.table(cart.test)
>h_rank <- tmp_DT[, list(cnt=length(TARGET),
cnt_resp=sum(TARGET==1),
cnt_non_resp=sum(TARGET==0)
), by=deciles][order(-deciles)]
>h_rank$rrate <- round(h_rank$cnt_resp / h_rank$cnt,4);
>h_rank$cum_resp <- cumsum(h_rank$cnt_resp)
>h_rank$cum_non_resp <- cumsum(h_rank$cnt_non_resp)
>h_rank$cum_rel_resp <- round(h_rank$cum_resp / sum(h_rank$cnt_resp),4);
>h_rank$cum_rel_non_resp <- round(h_rank$cum_non_resp /
sum(h_rank$cnt_non_resp),4);
>h_rank$ks <- abs(h_rank$cum_rel_resp - h_rank$cum_rel_non_resp) * 100;
>h_rank$rrate <- percent(h_rank$rrate)
>h_rank$cum_rel_resp <- percent(h_rank$cum_rel_resp)
>h_rank$cum_rel_non_resp <- percent(h_rank$cum_rel_non_resp)
>h_rank
deciles cnt cnt_resp cnt_non_resp rrate cum_resp cum_non_resp
1: 10 637 270 367 42.4% 270 367
2: 9 605 199 406 32.9% 469 773
3: 8 573 108 465 18.8% 577 1238
4: 7 621 66 555 10.6% 643 1793
5: 6 664 33 631 5.0% 676 2424
6: 5 519 41 478 7.9% 717 2902
7: 4 759 26 733 3.4% 743 3635
8: 3 563 9 554 1.6% 752 4189
9: 2 1059 32 1027 3.0% 784 5216
cum_rel_resp cum_rel_non_resp ks
1: 34.4% 7.0% 27.40
2: 59.8% 14.8% 45.00
3: 73.6% 23.7% 49.87
4: 82.0% 34.4% 47.64
5: 86.2% 46.5% 39.75
6: 91.4% 55.6% 35.81
7: 94.8% 69.7% 25.08
8: 95.9% 80.3% 15.61
9: 100.0% 100.0% 0.00
CART - Conclusion
Comparative Summary of the CART Model on Training and Testing Dataset
is as follows:
Measures Train Test %Deviation
KS 72.18 47.40 35%
AUC 91.46 79.23 13%
Gini 41.46 60.42 -46%
Accuracy 86.9 78.35 10%
CeR 13.91 21.65 -56%
It is observed that as the number of trees increases, the OOB error rate starts
decreasing till it reaches around 155th tree with OOB = 0.0757 (the minimum
value). After this, the OOB doesn’t decrease further and remains largely steady.
Hence, the optimal number of trees would be 155.
Variable Importance
To understand the important variables in Random Forest, the following measures
are generally used:
> Mean Decrease in Accuracy is based on permutation >> Randomly permute
values of a variable for which importance is to be computed in the OOB sample
>> Compute the Error Rate with permuted values >> Compute decrease in OOB
Error rate (Permuted - Not permuted) >> Average the decrease over all the trees
> Mean Decrease in Gini is computed as “total decrease in node impurities from
splitting on the variable, averaged over all trees”.
Scoring
>rf.train$predict.class <- predict(tRF, rf.train, type = "class")
>rf.train$predict.score <- predict(tRF, rf.train, type = "prob")
Deciling code
# Assign each element of x to a decile bucket: 1 = lowest 10%, 10 = top 10%.
#
# Args:
#   x: numeric vector of model scores; NA inputs propagate to NA buckets.
#
# Returns:
#   A vector the same length as x with bucket labels in 1..10.
decile <- function(x) {
  # Decile cut points (10th .. 90th percentile). A single vectorised
  # quantile() call avoids the fragile deciles[i * 10] indexing of the
  # original loop, where i came from seq(0.1, 1, .1) and the fractional
  # product was silently truncated when used as an index.
  breaks <- quantile(x, seq_len(9) / 10, na.rm = TRUE)
  # findInterval() reproduces the nested-ifelse boundaries exactly:
  # x < breaks[1] -> 1, breaks[k] <= x < breaks[k + 1] -> k + 1,
  # x >= breaks[9] -> 10.
  findInterval(x, breaks) + 1
}
>rf.train$deciles <- decile(rf.train$predict.score[,2])
Rank order table
>tmp_DT = data.table(rf.train)
>rank <- tmp_DT[, list(
cnt = length(TARGET),
cnt_resp = sum(TARGET==1),
cnt_non_resp = sum(TARGET == 0)) ,
by=deciles][order(-deciles)]
>rank$rrate <- round (rank$cnt_resp / rank$cnt,2);
>rank$cum_resp <- cumsum(rank$cnt_resp)
>rank$cum_non_resp <- cumsum(rank$cnt_non_resp)
>rank$cum_rel_resp <- round(rank$cum_resp / sum(rank$cnt_resp),2);
>rank$cum_rel_non_resp <- round(rank$cum_non_resp /
sum(rank$cnt_non_resp),2);
>rank$ks <- abs(rank$cum_rel_resp - rank$cum_rel_non_resp); library (scales)
>rank$rrate <- percent(rank$rrate)
>rank$cum_rel_resp <- percent(rank$cum_rel_resp)
>rank$cum_rel_non_resp <- percent(rank$cum_rel_non_resp)
>rank
deciles cnt cnt_resp cnt_non_resp rrate cum_resp cum_non_resp cum_rel_resp
1: 10 1404 1404 0 100% 1404 0 81.0%
2: 9 1435 324 1111 23% 1728 1111 100.0%
3: 8 1431 0 1431 0% 1728 2542 100.0%
4: 7 1402 0 1402 0% 1728 3944 100.0%
5: 6 1446 0 1446 0% 1728 5390 100.0%
6: 5 1522 0 1522 0% 1728 6912 100.0%
7: 4 1493 0 1493 0% 1728 8405 100.0%
8: 3 1495 0 1495 0% 1728 9900 100.0%
9: 2 1327 0 1327 0% 1728 11227 100.0%
10: 1 1045 0 1045 0% 1728 12272 100.0%
cum_rel_non_resp ks
1: 0% 0.81
2: 9% 0.91
3: 21% 0.79
4: 32% 0.68
5: 44% 0.56
6: 56% 0.44
7: 68% 0.32
8: 81% 0.19
9: 91% 0.09
10: 100% 0.00
Classification Error
> with(rf.train, table(TARGET, predict.class))
predict.class
TARGET 0 1
0 12270 2
1 171 1557
KS=99.75% AUC=99.99% Gini=74.85% Accuracy=98.76% CeR=1.24%
>tmp_DT = data.table(rf.test)
>h_rank <- tmp_DT[, list(
cnt = length(TARGET),
cnt_resp = sum(TARGET),
cnt_non_resp = sum(TARGET == 0)) ,
by=deciles][order(-deciles)]
>h_rank$rrate <- round (h_rank$cnt_resp / h_rank$cnt,2);
>h_rank$cum_resp <- cumsum(h_rank$cnt_resp)
>h_rank$cum_non_resp <- cumsum(h_rank$cnt_non_resp)
>h_rank$cum_rel_resp <- round(h_rank$cum_resp / sum(h_rank$cnt_resp),2);
>h_rank$cum_rel_non_resp <- round(h_rank$cum_non_resp /
sum(h_rank$cnt_non_resp),2);
>h_rank$ks <- abs(h_rank$cum_rel_resp - h_rank$cum_rel_non_resp);
>h_rank$rrate <- percent(h_rank$rrate)
>h_rank$cum_rel_resp <- percent(h_rank$cum_rel_resp)
>h_rank$cum_rel_non_resp <- percent(h_rank$cum_rel_non_resp)
>h_rank
deciles cnt cnt_resp cnt_non_resp rrate cum_resp cum_non_resp
1: 10 602 542 60 90% 542 60
2: 9 607 139 468 23% 681 528
3: 8 591 51 540 9% 732 1068
4: 7 627 27 600 4% 759 1668
5: 6 592 10 582 2% 769 2250
6: 5 592 12 580 2% 781 2830
7: 4 657 1 656 0% 782 3486
8: 3 647 0 647 0% 782 4133
9: 2 616 0 616 0% 782 4749
10: 1 469 2 467 0% 784 5216
cum_rel_resp cum_rel_non_resp ks
1: 69% 1% 0.68
2: 87% 10% 0.77
3: 93% 20% 0.73
4: 97% 32% 0.65
5: 98% 43% 0.55
6: 100% 54% 0.46
7: 100% 67% 0.33
8: 100% 79% 0.21
9: 100% 91% 0.09
10: 100% 100% 0.00
Interpretation:
● The baseline Response Rate is 12.34%, whereas the response rate in top
three deciles is 90%, 23%, 9% respectively.
● With two deciles, the KS is 77% indicating it to be a good model.
KS and AUC
>pred1 <- prediction(rf.test$predict.score[,2], rf.test$TARGET)
>perf1 <- performance(pred1, "tpr", "fpr")
>plot(perf1)
>KS1 <- max(attr(perf1, 'y.values')[[1]]-attr(perf1, 'x.values')[[1]])
>KS1
[1] 0.774446
Area Under Curve
>auc1 <- performance(pred1,"auc");
>auc1 <- as.numeric(auc1@y.values)
>auc1
[1] 0.9584023
Gini Coefficient
>gini1 = ineq(rf.test$predict.score[,2], type="Gini")
>gini1
[1] 0.6463728
Classification Error
>with(rf.test, table(TARGET, predict.class))
predict.class
TARGET 0 1
0 5211 5
1 343 441
KS = 77.44% AUC = 95.84% Gini = 64.63% Accuracy = 94.2% CeR = 5.8%
>dim(NNInput)
[1] 20000 56
Creating Training and Test Datasets
>trainIndex <- createDataPartition(TARGET,
p = .7,
list = FALSE,
times = 1)
>NN.train.data <- NNInput[trainIndex,]
>NN.test.data <- NNInput[-trainIndex,]
>dim(NN.train.data)
[1] 14000 56
dim(NN.test.data)
[1] 6000 56
Scaling the Dataset and Variables
>x <- subset(NN.train.data,
select = c("AGE",
"BALANCE",
"SCR",
"HOLDING_PERIOD",
"LEN_OF_RLTN_IN_MNTH",
"NO_OF_L_CR_TXNS",
"NO_OF_L_DR_TXNS",
"TOT_NO_OF_L_TXNS",
"NO_OF_BR_CSH_WDL_DR_TXNS",
"NO_OF_ATM_DR_TXNS",
"NO_OF_NET_DR_TXNS",
"NO_OF_MOB_DR_TXNS",
"NO_OF_CHQ_DR_TXNS",
"FLG_HAS_CC",
"AMT_ATM_DR",
"AMT_BR_CSH_WDL_DR",
"AMT_CHQ_DR",
"AMT_NET_DR",
"AMT_MOB_DR",
"AMT_L_DR",
"FLG_HAS_ANY_CHGS",
"AMT_OTH_BK_ATM_USG_CHGS",
"AMT_MIN_BAL_NMC_CHGS",
"NO_OF_IW_CHQ_BNC_TXNS",
"NO_OF_OW_CHQ_BNC_TXNS",
"AVG_AMT_PER_ATM_TXN",
"AVG_AMT_PER_CSH_WDL_TXN",
"AVG_AMT_PER_CHQ_TXN",
"AVG_AMT_PER_NET_TXN",
"AVG_AMT_PER_MOB_TXN",
"FLG_HAS_NOMINEE",
"FLG_HAS_OLD_LOAN",
"random",
"GENDERF",
"GENDERM",
"GENDERO",
"OCCUPATIONPROF",
"OCCUPATIONSAL",
"OCCUPATIONSELF.EMP",
"OCCUPATIONSENP",
"AGE_BKT.25",
"AGE_BKT.50",
"AGE_BKT26.30",
"AGE_BKT31.35",
"AGE_BKT36.40",
"AGE_BKT41.45",
"AGE_BKT46.50",
"ACC_TYPECA",
"ACC_TYPESA" )
)
>nn.devscaled <- scale(x)
>nn.devscaled <- cbind(NN.train.data[2], nn.devscaled)
Building the NN Model
>nn2 <- neuralnet(formula = TARGET ~
AGE +
BALANCE +
SCR +
HOLDING_PERIOD +
LEN_OF_RLTN_IN_MNTH +
NO_OF_L_CR_TXNS +
NO_OF_L_DR_TXNS +
TOT_NO_OF_L_TXNS +
NO_OF_BR_CSH_WDL_DR_TXNS +
NO_OF_ATM_DR_TXNS +
NO_OF_NET_DR_TXNS +
NO_OF_MOB_DR_TXNS +
NO_OF_CHQ_DR_TXNS +
FLG_HAS_CC +
AMT_ATM_DR +
AMT_BR_CSH_WDL_DR +
AMT_CHQ_DR +
AMT_NET_DR +
AMT_MOB_DR +
AMT_L_DR +
FLG_HAS_ANY_CHGS +
AMT_OTH_BK_ATM_USG_CHGS +
AMT_MIN_BAL_NMC_CHGS +
NO_OF_IW_CHQ_BNC_TXNS +
NO_OF_OW_CHQ_BNC_TXNS +
AVG_AMT_PER_ATM_TXN +
AVG_AMT_PER_CSH_WDL_TXN +
AVG_AMT_PER_CHQ_TXN +
AVG_AMT_PER_NET_TXN +
AVG_AMT_PER_MOB_TXN +
FLG_HAS_NOMINEE +
FLG_HAS_OLD_LOAN +
random +
GENDERF +
GENDERM +
GENDERO +
OCCUPATIONPROF +
OCCUPATIONSAL +
OCCUPATIONSELF.EMP +
OCCUPATIONSENP +
AGE_BKT.25 +
AGE_BKT.50 +
AGE_BKT26.30 +
AGE_BKT31.35 +
AGE_BKT36.40 +
AGE_BKT41.45 +
AGE_BKT46.50 +
ACC_TYPECA +
ACC_TYPESA ,
data = nn.devscaled,
hidden = 3,
err.fct = "sse",
linear.output = FALSE,
lifesign = "full",
lifesign.step = 10,
threshold = 0.1,
stepmax = 2000
)
>plot (nn2)
Measuring Model Performance
Prediction on Train Data
>NN.train.data$Prob = nn2$net.result[[1]]
Probabilities in Training Dataset
>quantile(NN.train.data$Prob, c(0,1,5,10,25,50,75,90,95,98,99,100)/100)
0% 1% 5% 10% 25%
0.005340550 0.005340550 0.005341659 0.005787379 0.031114205
50% 75%
0.036584905 0.180627476
90% 95% 98% 99% 100%
0.188810014 0.513345635 0.581857453 0.581921405 0.581921674
>hist(NN.train.data$Prob)
Ranking code
>tmp_DT = data.table(NN.train.data)
>rank <- tmp_DT[, list(
cnt = length(TARGET),
cnt_resp = sum(TARGET==1),
cnt_non_resp = sum(TARGET == 0)) ,
by=deciles][order(-deciles)]
>rank$rrate <- round (rank$cnt_resp / rank$cnt,2);
>rank$cum_resp <- cumsum(rank$cnt_resp)
>rank$cum_non_resp <- cumsum(rank$cnt_non_resp)
>rank$cum_rel_resp <- round(rank$cum_resp / sum(rank$cnt_resp),2);
>rank$cum_rel_non_resp <- round(rank$cum_non_resp /
sum(rank$cnt_non_resp),2);
>rank$ks <- abs(rank$cum_rel_resp - rank$cum_rel_non_resp)
>rank$rrate <- percent(rank$rrate)
>rank$cum_rel_resp <- percent(rank$cum_rel_resp)
>rank$cum_rel_non_resp <- percent(rank$cum_rel_non_resp)
>rank
deciles cnt cnt_resp cnt_non_resp rrate cum_resp cum_non_resp cum_rel_resp
1: 10 1400 549 851 39.0% 549 851 32.0%
2: 9 1400 251 1149 18.0% 800 2000 46.0%
3: 8 1400 251 1149 18.0% 1051 3149 61.0%
4: 7 1400 228 1172 16.0% 1279 4321 74.0%
5: 6 1400 62 1338 4.0% 1341 5659 78.0%
6: 5 1400 74 1326 5.0% 1415 6985 82.0%
7: 4 1400 72 1328 5.0% 1487 8313 86.0%
8: 3 1400 96 1304 7.0% 1583 9617 92.0%
9: 2 1400 85 1315 6.0% 1668 10932 97.0%
10: 1 1400 60 1340 4.0% 1728 12272 100.0%
cum_rel_non_resp ks
1: 7.0% 0.25
2: 16.0% 0.30
3: 26.0% 0.35
4: 35.0% 0.39
5: 46.0% 0.32
6: 57.0% 0.25
7: 68.0% 0.18
8: 78.0% 0.14
9: 89.0% 0.08
10: 100.0% 0.00
Interpretation:
● The baseline Response Rate is 12.34%, whereas the response rate in the top
three deciles is 39%, 18% and 18% respectively.
● With top 4 deciles, the KS is 39%, which is close to a good fitness model
indicator.
Assigning 0 / 1 class based on certain threshold
>NN.train.data$Class = ifelse(NN.train.data$Prob>0.5,1,0)
>with( NN.train.data, table(TARGET, as.factor(Class) ))
TARGET 0 1
0 11981 291
1 1296 432
Error Computation
>sum((NN.train.data$TARGET - NN.train.data$Prob)^2)/2
[1] 649.5382956
Other Model Performance Measures
>pred3 <- prediction(NN.train.data$Prob, NN.train.data$TARGET)
>perf3 <- performance(pred3, "tpr", "fpr")
>plot(perf3)
>KS3 <- max(attr(perf3, 'y.values')[[1]]-attr(perf3, 'x.values')[[1]])
>KS3
[1] 0.3913591313
>auc3 <- performance(pred3,"auc");
>auc3 <- as.numeric(auc3@y.values)
>auc3
[1] 0.7290062405
>gini3 = ineq(NN.train.data$Prob, type="Gini")
>auc3
[1] 0.7290062405
>KS3
[1] 0.3913591313
>gini3
[1] 0.5473745079
Summary: Model Performance Measures (Training Dataset)
Measure | Value
———-|———
KS | 39.13%
AUC | 72.90%
Gini | 54.73%
Accuracy | 88.66%
CeR | 11.00%
The Gini Coefficient is the ratio of the area between the line of perfect equality
and the observed Lorenz curve to the area between the line of perfect equality
and the line of perfect inequality.
The higher the coefficient, the more unequal the distribution is. Gini coefficient
can be straight away derived from the AUC ROC number.
Gini above 60% is a good model.
The lower the classification error rate, the higher the model accuracy, resulting in a
better model. The classification error rate could be reduced if more
independent variables were available for modeling.
The model is observed to perform on par with expectations on the majority of the
model performance measures, indicating it to be a good model, with scope for
improvement.
> hist(NN.test.data$Predict.score)
Deciling
>NN.test.data$deciles <- decile(NN.test.data$Predict.score)
Rank ordering
>tmp_DT = data.table(NN.test.data)
>h_rank <- tmp_DT[, list(
cnt = length(TARGET),
cnt_resp = sum(TARGET),
cnt_non_resp = sum(TARGET == 0)) ,
by=deciles][order(-deciles)]
>h_rank$rrate <- round (h_rank$cnt_resp / h_rank$cnt,2);
>h_rank$cum_resp <- cumsum(h_rank$cnt_resp)
>h_rank$cum_non_resp <- cumsum(h_rank$cnt_non_resp)
>h_rank$cum_rel_resp <- round(h_rank$cum_resp / sum(h_rank$cnt_resp),2);
>h_rank$cum_rel_non_resp <- round(h_rank$cum_non_resp /
sum(h_rank$cnt_non_resp),2);
>h_rank$ks <- abs(h_rank$cum_rel_resp - h_rank$cum_rel_non_resp);
>h_rank$rrate <- percent(h_rank$rrate)
>h_rank$cum_rel_resp <- percent(h_rank$cum_rel_resp)
>h_rank$cum_rel_non_resp <- percent(h_rank$cum_rel_non_resp)
>h_rank
TARGET 0 1
0 5065 151
1 639 145
Error Computation
>sum((NN.test.data$TARGET - NN.test.data$Prob)^2)/2
[1] 308.6009414
Other Model Performance Measures
>pred4 <- prediction(NN.test.data$Prob, NN.test.data$TARGET)
>perf4 <- performance(pred4, "tpr", "fpr")
>plot(perf4)
>KS4 <- max(attr(perf4, 'y.values')[[1]]-attr(perf4, 'x.values')[[1]])
>KS4
[1] 0.3545409728
>auc4 <- performance(pred4,"auc");
>auc4 <- as.numeric(auc4@y.values)
>auc4
[1] 0.7213823293
gini4 = ineq(NN.test.data$Prob, type="Gini")
gini4
[1] 0.5462225246
The predictive models were developed using the following Machine Learning
techniques:
1. Classification Tree - CART
2. Random Forest
3. Neural Network
The snapshot of the performance of all the models on accuracy, over-fitting and
other model performance measures is provided below:
CART
Measures Train Test %Deviation
KS 72.18 47.40 35%
AUC 91.46 79.23 13%
Gini 41.46 60.42 -46%
Accuracy 86.9 78.35 10%
CeR 13.91 21.65 -56%
Interpretation:
● The CART method has given poor performance compared to Random
Forest and ANN. Looking at the percentage deviation between Training
and Testing Dataset, it looks like the Model is over fit.
● The Random Forest method has the best performance (best accuracy)
among all the three models. The percentage deviation between Training
and Testing Dataset also is reasonably under control, suggesting a robust
model.
● Neural Network has given intermediate performance — weaker than
Random Forest but better than CART. Its percentage deviation
between the Training and Testing Datasets is the smallest among the
three models.
4.2. The source code for importing dataset has been written below
>Loan <- read.csv("Loan.csv", header=TRUE)
4.3. The source code for Exploratory Data Analysis and Descriptive Statistics
has been written below
>head(Loan)
# Find out Class of each Feature, along with internal structure
>str(Loan)
# Find out Total Number of Rows and Columns
>dim(Loan)
# Find out Names of the Columns (Features)
>names(Loan)
>summary(Loan)
4.4. The source code for Creating Training and Testing Dataset has been
written below
>trainIndex <- createDataPartition(Loan$TARGET,
p=0.7,
list = FALSE,
times = 1)
>Loan.dev <- Loan[trainIndex, ]
>Loan.holdout <- Loan[-trainIndex,]
>dim(Loan.dev)
>dim(Loan.holdout)
>table(Loan.dev$TARGET)
>table(Loan.holdout$TARGET)
4.5. The source code for CART technique has been written below
#Build the model on Training Dataset (Unbalanced)
>cart.train <- Loan.dev
#Setting the control parameter inputs for rpart
>r1.ctrl = rpart.control(minsplit=100, minbucket = 10, cp = 0, xval = 10)
>m2 <- rpart(formula = cart.train$TARGET ~ .,
data = cart.train[,-c(1,11)], method = "class",
control = r1.ctrl)
>m2
>fancyRpartPlot(m2)
>printcp(m2)
>plotcp(m2)
# Pruning Code
>ptree1<- prune(m2, cp= 0.0017 ,"CP")
>printcp(ptree1)
>fancyRpartPlot(ptree1, uniform=TRUE, main="Pruned Classification Tree")
# Deciling code
# Assign each element of x to a decile bucket: 1 = lowest 10%, 10 = top 10%.
#
# Args:
#   x: numeric vector of model scores; NA inputs propagate to NA buckets.
#
# Returns:
#   A vector the same length as x with bucket labels in 1..10.
decile <- function(x) {
  # Decile cut points (10th .. 90th percentile). A single vectorised
  # quantile() call avoids the fragile deciles[i * 10] indexing of the
  # original loop, where i came from seq(0.1, 1, .1) and the fractional
  # product was silently truncated when used as an index.
  breaks <- quantile(x, seq_len(9) / 10, na.rm = TRUE)
  # findInterval() reproduces the nested-ifelse boundaries exactly:
  # x < breaks[1] -> 1, breaks[k] <= x < breaks[k + 1] -> k + 1,
  # x >= breaks[9] -> 10.
  findInterval(x, breaks) + 1
}
# Deciling
>cart.train$deciles <- decile(cart.train$predict.score[,2])
>View(cart.train)
# Ranking code
>tmp_DT = data.table(cart.train)
>rank <- tmp_DT[, list(cnt=length(TARGET),
cnt_resp=sum(TARGET==1),
cnt_non_resp=sum(TARGET==0)
), by=deciles][order(-deciles)]
>rank$rrate <- round(rank$cnt_resp / rank$cnt,4);
>rank$cum_resp <- cumsum(rank$cnt_resp)
>rank$cum_non_resp <- cumsum(rank$cnt_non_resp)
>rank$cum_rel_resp <- round(rank$cum_resp / sum(rank$cnt_resp),4);
>rank$cum_rel_non_resp <- round(rank$cum_non_resp /
sum(rank$cnt_non_resp),4);
>rank$ks <- abs(rank$cum_rel_resp - rank$cum_rel_non_resp) * 100;
>rank$rrate <- percent(rank$rrate)
>rank$cum_rel_resp <- percent(rank$cum_rel_resp)
>rank$cum_rel_non_resp <- percent(rank$cum_rel_non_resp)
>rank
#Deciling Code
# Assign each element of x to a decile bucket: 1 = lowest 10%, 10 = top 10%.
#
# Args:
#   x: numeric vector of model scores; NA inputs propagate to NA buckets.
#
# Returns:
#   A vector the same length as x with bucket labels in 1..10.
decile <- function(x) {
  # Decile cut points (10th .. 90th percentile). A single vectorised
  # quantile() call avoids the fragile deciles[i * 10] indexing of the
  # original loop, where i came from seq(0.1, 1, .1) and the fractional
  # product was silently truncated when used as an index.
  breaks <- quantile(x, seq_len(9) / 10, na.rm = TRUE)
  # findInterval() reproduces the nested-ifelse boundaries exactly:
  # x < breaks[1] -> 1, breaks[k] <= x < breaks[k + 1] -> k + 1,
  # x >= breaks[9] -> 10.
  findInterval(x, breaks) + 1
}
# Deciling
>cart.train.over$deciles <- decile(cart.train.over$predict.score[,2])
#Ranking Code
>tmp_DT = data.table(cart.train.over)
>rank <- tmp_DT[, list(cnt=length(TARGET),
cnt_resp=sum(TARGET==1),
cnt_non_resp=sum(TARGET==0)
), by=deciles][order(-deciles)]
>rank$rrate <- round(rank$cnt_resp / rank$cnt,4);
>rank$cum_resp <- cumsum(rank$cnt_resp)
>rank$cum_non_resp <- cumsum(rank$cnt_non_resp)
>rank$cum_rel_resp <- round(rank$cum_resp / sum(rank$cnt_resp),4);
>rank$cum_rel_non_resp <- round(rank$cum_non_resp /
sum(rank$cnt_non_resp),4);
>rank$ks <- abs(rank$cum_rel_resp - rank$cum_rel_non_resp) * 100;
>rank$rrate <- percent(rank$rrate)
>rank$cum_rel_resp <- percent(rank$cum_rel_resp)
>rank$cum_rel_non_resp <- percent(rank$cum_rel_non_resp)
>rank
#Ranking Code
>tmp_DT = data.table(cart.test)
>h_rank <- tmp_DT[, list(cnt=length(TARGET),
cnt_resp=sum(TARGET==1),
cnt_non_resp=sum(TARGET==0)
), by=deciles][order(-deciles)]
>h_rank$rrate <- round(h_rank$cnt_resp / h_rank$cnt,4);
>h_rank$cum_resp <- cumsum(h_rank$cnt_resp)
>h_rank$cum_non_resp <- cumsum(h_rank$cnt_non_resp)
>h_rank$cum_rel_resp <- round(h_rank$cum_resp / sum(h_rank$cnt_resp),4);
>h_rank$cum_rel_non_resp <- round(h_rank$cum_non_resp /
sum(h_rank$cnt_non_resp),4);
>h_rank$ks <- abs(h_rank$cum_rel_resp - h_rank$cum_rel_non_resp) * 100;
>h_rank$rrate <- percent(h_rank$rrate)
>h_rank$cum_rel_resp <- percent(h_rank$cum_rel_resp)
>h_rank$cum_rel_non_resp <- percent(h_rank$cum_rel_non_resp)
>h_rank
4.6. The source code for Random Forest technique has been written below
#Random Forest Model - Train Dataset
>rf.train <- Loan.dev
>RF=randomForest(as.factor(TARGET)~.,
data = rf.train[,-c(1,11)],
ntree = 501, mtry = 3, nodesize = 10,
importance=TRUE)
>print(RF)
>plot(RF, main="")
>legend("topright", c("OOB", "0", "1"), text.col=1:6, lty=1:3, col=1:3)
>title(main="Error Rates Random Forest Loan Training data")
#Deciling code
# Assign each element of x to a decile bucket: 1 = lowest 10%, 10 = top 10%.
#
# Args:
#   x: numeric vector of model scores; NA inputs propagate to NA buckets.
#
# Returns:
#   A vector the same length as x with bucket labels in 1..10.
decile <- function(x) {
  # Decile cut points (10th .. 90th percentile). A single vectorised
  # quantile() call avoids the fragile deciles[i * 10] indexing of the
  # original loop, where i came from seq(0.1, 1, .1) and the fractional
  # product was silently truncated when used as an index.
  breaks <- quantile(x, seq_len(9) / 10, na.rm = TRUE)
  # findInterval() reproduces the nested-ifelse boundaries exactly:
  # x < breaks[1] -> 1, breaks[k] <= x < breaks[k + 1] -> k + 1,
  # x >= breaks[9] -> 10.
  findInterval(x, breaks) + 1
}
>rf.train$deciles <- decile(rf.train$predict.score[,2])
# Gini Coefficient
>gini = ineq(rf.train$predict.score[,2], type="Gini")
>gini
# Classification Error
>with(rf.train, table(TARGET, predict.class))
# Scoring syntax
>rf.test <- Loan.holdout
>rf.test$predict.class <- predict(tRF, rf.test, type="class")
>rf.test$predict.score <- predict(tRF, rf.test, type="prob")
>rf.test$deciles <- decile(rf.test$predict.score[,2])
#Rank order
>tmp_DT = data.table(rf.test)
>h_rank <- tmp_DT[, list(
cnt = length(TARGET),
cnt_resp = sum(TARGET),
cnt_non_resp = sum(TARGET == 0)) ,
by=deciles][order(-deciles)]
>h_rank$rrate <- round (h_rank$cnt_resp / h_rank$cnt,2);
>h_rank$cum_resp <- cumsum(h_rank$cnt_resp)
>h_rank$cum_non_resp <- cumsum(h_rank$cnt_non_resp)
>h_rank$cum_rel_resp <- round(h_rank$cum_resp / sum(h_rank$cnt_resp),2);
>h_rank$cum_rel_non_resp <- round(h_rank$cum_non_resp /
sum(h_rank$cnt_non_resp),2);
>h_rank$ks <- abs(h_rank$cum_rel_resp - h_rank$cum_rel_non_resp);
>h_rank$rrate <- percent(h_rank$rrate)
>h_rank$cum_rel_resp <- percent(h_rank$cum_rel_resp)
>h_rank$cum_rel_non_resp <- percent(h_rank$cum_rel_non_resp)
>h_rank
# Gini Coefficient
>gini1 = ineq(rf.test$predict.score[,2], type="Gini")
>gini1
# Classification Error
>with(rf.test, table(TARGET, predict.class))
4.7. The source code for Artificial Neural Network technique has been written
below
>NNInput = read.table("Loan.csv", sep = ",", header = T)
>attach(NNInput)
>dim(NNInput)
# Gender
>GEN.matrix <- model.matrix(~ GENDER - 1, data = NNInput)
>NNInput <- data.frame(NNInput, GEN.matrix)
# Occupation
>occ.matrix <- model.matrix(~ OCCUPATION - 1, data = NNInput)
>NNInput <- data.frame(NNInput, occ.matrix)
# AGE_BKT
>AGEBKT.matrix <- model.matrix(~ AGE_BKT - 1, data = NNInput)
>NNInput <- data.frame(NNInput, AGEBKT.matrix)
# ACC_TYPE
>ACCTYP.matrix <- model.matrix(~ ACC_TYPE - 1, data = NNInput)
>NNInput <- data.frame(NNInput, ACCTYP.matrix)
#Deciling code
# Assign each element of x to a decile bucket: 1 = lowest 10%, 10 = top 10%.
#
# Args:
#   x: numeric vector of model scores; NA inputs propagate to NA buckets.
#
# Returns:
#   A vector the same length as x with bucket labels in 1..10.
decile <- function(x) {
  # Decile cut points (10th .. 90th percentile). A single vectorised
  # quantile() call avoids the fragile deciles[i * 10] indexing of the
  # original loop, where i came from seq(0.1, 1, .1) and the fractional
  # product was silently truncated when used as an index.
  breaks <- quantile(x, seq_len(9) / 10, na.rm = TRUE)
  # findInterval() reproduces the nested-ifelse boundaries exactly:
  # x < breaks[1] -> 1, breaks[k] <= x < breaks[k + 1] -> k + 1,
  # x >= breaks[9] -> 10.
  findInterval(x, breaks) + 1
}
>NN.train.data$deciles <- decile(NN.train.data$Prob)
# Ranking code
>tmp_DT = data.table(NN.train.data)
>rank <- tmp_DT[, list(
cnt = length(TARGET),
cnt_resp = sum(TARGET==1),
cnt_non_resp = sum(TARGET == 0)) ,
by=deciles][order(-deciles)]
>rank$rrate <- round (rank$cnt_resp / rank$cnt,2);
>rank$cum_resp <- cumsum(rank$cnt_resp)
>rank$cum_non_resp <- cumsum(rank$cnt_non_resp)
>rank$cum_rel_resp <- round(rank$cum_resp / sum(rank$cnt_resp),2);
>rank$cum_rel_non_resp <- round(rank$cum_non_resp /
sum(rank$cnt_non_resp),2);
>rank$ks <- abs(rank$cum_rel_resp - rank$cum_rel_non_resp)
>rank$rrate <- percent(rank$rrate)
>rank$cum_rel_resp <- percent(rank$cum_rel_resp)
>rank$cum_rel_non_resp <- percent(rank$cum_rel_non_resp)
>rank
#Error Computation
>sum((NN.train.data$TARGET - NN.train.data$Prob)^2)/2
# Deciling
>NN.test.data$deciles <- decile(NN.test.data$Predict.score)
#Rank ordering
>tmp_DT = data.table(NN.test.data)
>h_rank <- tmp_DT[, list(
cnt = length(TARGET),
cnt_resp = sum(TARGET),
cnt_non_resp = sum(TARGET == 0)) ,
by=deciles][order(-deciles)]
>h_rank$rrate <- round (h_rank$cnt_resp / h_rank$cnt,2);
>h_rank$cum_resp <- cumsum(h_rank$cnt_resp)
>h_rank$cum_non_resp <- cumsum(h_rank$cnt_non_resp)
>h_rank$cum_rel_resp <- round(h_rank$cum_resp / sum(h_rank$cnt_resp),2);
>h_rank$cum_rel_non_resp <- round(h_rank$cum_non_resp /
sum(h_rank$cnt_non_resp),2); h_rank$ks <- abs(h_rank$cum_rel_resp -
>h_rank$cum_rel_non_resp); # library(scales)
>h_rank$rrate <- percent(h_rank$rrate)
>h_rank$cum_rel_resp <- percent(h_rank$cum_rel_resp)
>h_rank$cum_rel_non_resp <- percent(h_rank$cum_rel_non_resp)
>h_rank
# Error Computation
>sum((NN.test.data$TARGET - NN.test.data$Prob)^2)/2