Default of Credit Card Clients Data Set

23 features are used to predict the target column. Some features are categorical and some are numeric. Categorical features are converted into dummy variables for the penalized regression model. The original data set (without dummy encoding) is used by the other learners, such as decision tree and random forest.

Features

X1: Amount of the given credit (NT dollar)

X2: Gender (1 = male; 2 = female)

X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others)

X4: Marital status (1 = married; 2 = single; 3 = others)

X5: Age (year)

X6 - X11: History of past payment

X12-X17: Amount of bill statement (NT dollar)

X18-X23: Amount of previous payment (NT dollar)

Target

Default Payment (Yes = 1, No = 0)

library("caret")
library("dplyr")
library("glmnet")
library(Metrics)
require(rpart)
library(rattle)
library(RColorBrewer)
require(data.table,quietly = TRUE)
library(data.table)
library(randomForest)
library(gbm)
library(xgboost)
library(e1071)

Data Manipulation Part

# Load the raw table.
# NOTE(review): the data is read from the system clipboard, so the sheet has to
# be copied before this line runs -- consider reading from a file path instead.
credit <- read.table(file = "clipboard", sep = "\t", header = TRUE)
credit <- credit[, -1] # drop the first column (presumably a row ID -- confirm)

# The first data row holds the descriptive column names. Pull it out as a plain
# character vector: assigning the one-row data.frame directly yields integer
# level codes as names whenever read.table() produced factor columns.
colnames(credit) <- vapply(
  credit[1, ], as.character, character(1),
  USE.NAMES = FALSE
)
names(credit)[ncol(credit)] <- "default_payment"
credit <- credit[-1, ]

# Rebuild the target from its text so the header label removed above does not
# survive as an unused factor level.
credit$default_payment <- factor(as.character(credit$default_payment))


# convert categorical value into factor and numerical
#
# Columns 1 (credit amount), 5 (age) and 12-23 (bill / payment amounts) are
# numeric; columns 2-4 and 6-11 are categorical.
numeric_cols <- c(1, 5, 12:23)
factor_cols <- c(2:4, 6:11)

# Go through as.character() first: calling as.numeric() directly on a factor
# column would return the internal level codes instead of the recorded values.
credit[numeric_cols] <- lapply(
  credit[numeric_cols],
  function(column) as.numeric(as.character(column))
)

# Rebuilding each factor from its text also drops levels that no longer occur
# in the data.
credit[factor_cols] <- lapply(
  credit[factor_cols],
  function(column) factor(as.character(column))
)
# convert categorical variables to dummy
#
# The encoder is built from every column except the trailing target column;
# the target is attached again, unchanged, next to the encoded predictors.
Dummies <- dummyVars(
  "~.",
  data = credit[, -ncol(credit)],
  fullRank = TRUE,
  drop2nd = TRUE
)

credit_dummy <- cbind(
  data.frame(predict(Dummies, newdata = credit[, -ncol(credit)])),
  default_payment = credit$default_payment
)

# split data as training and test
#
# Every row is labelled 1 (training, probability 0.75) or 2 (test, probability
# 0.25). The same labels index both the raw and the dummy-encoded table, so the
# two versions of each split contain the same rows.

set.seed(121)

idx <- sample(
  1:2,
  size = nrow(credit),
  replace = TRUE,
  prob = c(0.75, 0.25)
)

train_credit <- credit[which(idx == 1), ]
dummy_train <- credit_dummy[which(idx == 1), ]

test_credit <- credit[which(idx == 2), ]
dummy_test <- credit_dummy[which(idx == 2), ]

# to solve imbalance problem
#
# Undersample so that both classes contribute the same number of rows.
# Rows are drawn from the TRAINING split only: sampling from the full data set
# would place test rows inside the "balanced" training sets and inflate every
# test accuracy measured for models fitted on them.
# NOTE: console output recorded further down in this document was produced with
# the earlier sampling (from the full data set) and will differ after a re-run.

only_0 <- as.data.frame(train_credit[train_credit$default_payment == "0", ])
only_1 <- as.data.frame(train_credit[train_credit$default_payment == "1", ])

only_0_dummy <- as.data.frame(dummy_train[dummy_train$default_payment == "0", ])
only_1_dummy <- as.data.frame(dummy_train[dummy_train$default_payment == "1", ])

# Up to 5000 rows per class, capped by the size of the smaller class so the
# draw without replacement cannot ask for more rows than exist.
n_per_class <- min(5000, nrow(only_0), nrow(only_1))

# Draw the row positions once and reuse them for the raw and the dummy-encoded
# tables (their rows are aligned), so both balanced sets hold the same clients.
rows_1 <- sample.int(nrow(only_1), n_per_class)
rows_0 <- sample.int(nrow(only_0), n_per_class)

balanced_train <- rbind(only_1[rows_1, ], only_0[rows_0, ])
train_balanced_dummy <- rbind(only_1_dummy[rows_1, ], only_0_dummy[rows_0, ])

Penalized Linear Regression

# Cross-validated penalized binomial models, scored by misclassification error.
# The predictor matrix is every column except the trailing target column.

# fitted on the unbalanced training set
glmmod <- cv.glmnet(
  x = as.matrix(dummy_train[, 1:(ncol(dummy_train) - 1)]),
  y = dummy_train$default_payment,
  family = "binomial",
  type.measure = "class"
)

# fitted on the balanced training set
glmmod_balanced <- cv.glmnet(
  x = as.matrix(train_balanced_dummy[, 1:(ncol(train_balanced_dummy) - 1)]),
  y = train_balanced_dummy$default_payment,
  family = "binomial",
  type.measure = "class"
)

# Compare both penalized models on the test split at one penalty value.
#
# x: the lambda value handed to predict() as `s`.
#
# Reads the globals `glmmod`, `glmmod_balanced` and `dummy_test`. Prints the
# confusion matrix and accuracy of each model; nothing useful is returned.
linear_regression <- function(x) {
  # Test predictors: every column except the trailing target column.
  test_features <- as.matrix(dummy_test[, 1:(ncol(dummy_test) - 1)])

  # Predict classes with one fitted model and tabulate them against the truth.
  evaluate <- function(fit) {
    predicted <- predict(fit, test_features, type = "class", s = x)
    confusionMatrix(as.factor(predicted), dummy_test$default_payment)
  }

  result_unbalanced <- evaluate(glmmod)
  result_balanced <- evaluate(glmmod_balanced)

  print("Confusion matrix for model trained unbalanced data set")
  print(result_unbalanced$table)
  print(paste(
    "Accuracy for model trained unbalanced data set",
    result_unbalanced$overall[["Accuracy"]]
  ))

  print("Confusion matrix for the model trained balanced data set")
  print(result_balanced$table)
  print(paste(
    "Accuracy for model trained balanced data set",
    result_balanced$overall[["Accuracy"]]
  ))
}
linear_regression(0.009)
## [1] "Confusion matrix for model trained unbalanced data set"
##           Reference
## Prediction    0    1
##          0 5576 1088
##          1  264  559
## [1] "Accuracy for model trained unbalanced data set 0.819420328569521"
## [1] "Confusion matrix for the model trained balanced data set"
##           Reference
## Prediction    0    1
##          0 4871  716
##          1  969  931
## [1] "Accuracy for model trained balanced data set 0.774943234940564"
linear_regression(0.01)
## [1] "Confusion matrix for model trained unbalanced data set"
##           Reference
## Prediction    0    1
##          0 5582 1093
##          1  258  554
## [1] "Accuracy for model trained unbalanced data set 0.819553893415253"
## [1] "Confusion matrix for the model trained balanced data set"
##           Reference
## Prediction    0    1
##          0 4869  714
##          1  971  933
## [1] "Accuracy for model trained balanced data set 0.774943234940564"
linear_regression(0.05)
## [1] "Confusion matrix for model trained unbalanced data set"
##           Reference
## Prediction    0    1
##          0 5739 1345
##          1  101  302
## [1] "Accuracy for model trained unbalanced data set 0.806865233070656"
## [1] "Confusion matrix for the model trained balanced data set"
##           Reference
## Prediction    0    1
##          0 5132  827
##          1  708  820
## [1] "Accuracy for model trained balanced data set 0.794977961800454"

The target values are not equally distributed in the data set, which causes a class-imbalance problem. Therefore, two training sets were examined: the original data set, and a balanced set created from the original data by sampling an equal number of rows from each class. For the model trained with unbalanced data, different lambda values did not make a noticeable difference in terms of accuracy. Thus, the model trained with equally distributed data was examined. The best result was obtained with lambda = 0.05, where the test accuracy was 0.79.

Decision Tree

# Classification Tree



# Fit classification trees on the unbalanced and the balanced training sets
# and report how they score.
#
# cp:       complexity parameter forwarded to rpart().
# minsplit: minimum node size for a split, forwarded to rpart().
#
# Reads the globals `train_credit`, `balanced_train` and `test_credit`. Prints
# test confusion matrices / accuracies for both trees plus the training
# accuracy of the balanced tree.
Decision_tree <- function(cp, minsplit) {
  # Grow one tree on the given training data with the requested settings.
  grow <- function(training_data) {
    rpart(
      default_payment ~ .,
      training_data,
      method = "class",
      cp = cp,
      minsplit = minsplit
    )
  }

  # Predict classes for `data` and tabulate them against its true labels.
  score <- function(tree, data) {
    predicted <- predict(tree, data, type = "class")
    confusionMatrix(predicted, data$default_payment)
  }

  # Fit order is kept (unbalanced first, then balanced).
  tree_unbalanced <- grow(train_credit)
  tree_balanced <- grow(balanced_train)

  test_unbalanced <- score(tree_unbalanced, test_credit)
  test_balanced <- score(tree_balanced, test_credit)
  train_balanced <- score(tree_balanced, balanced_train)

  print("Confusion matrix for D.T.  trained with unbalanced data set")
  print(test_unbalanced$table)
  print(paste(
    "Accuracy for D.T. trained with unbalanced data set",
    test_unbalanced$overall[["Accuracy"]]
  ))

  print("Confusion matrix for D.T. test data set")
  print(test_balanced$table)
  print(paste("Accuracy for test data set", test_balanced$overall[["Accuracy"]]))

  print(paste(
    "Accuracy for balanced training data",
    train_balanced$overall[["Accuracy"]]
  ))
}
Decision_tree(0.0009,30)
## [1] "Confusion matrix for D.T.  trained with unbalanced data set"
##           Reference
## Prediction    0    1
##          0 5541 1057
##          1  299  590
## [1] "Accuracy for D.T. trained with unbalanced data set 0.81888606918659"
## [1] "Confusion matrix for D.T. test data set"
##           Reference
## Prediction    0    1
##          0 4471  550
##          1 1369 1097
## [1] "Accuracy for test data set 0.743689061039134"
## [1] "Accuracy for balanced training data 0.7444"
Decision_tree(0.0001,30)
## [1] "Confusion matrix for D.T.  trained with unbalanced data set"
##           Reference
## Prediction    0    1
##          0 5244  981
##          1  596  666
## [1] "Accuracy for D.T. trained with unbalanced data set 0.789368238279685"
## [1] "Confusion matrix for D.T. test data set"
##           Reference
## Prediction    0    1
##          0 4008  367
##          1 1832 1280
## [1] "Accuracy for test data set 0.706290904234006"
## [1] "Accuracy for balanced training data 0.8006"
Decision_tree(0.0001,10)
## [1] "Confusion matrix for D.T.  trained with unbalanced data set"
##           Reference
## Prediction    0    1
##          0 4971 1010
##          1  869  637
## [1] "Accuracy for D.T. trained with unbalanced data set 0.749031654868439"
## [1] "Confusion matrix for D.T. test data set"
##           Reference
## Prediction    0    1
##          0 4006  273
##          1 1834 1374
## [1] "Accuracy for test data set 0.718578870041405"
## [1] "Accuracy for balanced training data 0.8948"

The highest training accuracy (0.89) was obtained when cp and minsplit were equal to 0.0001 and 10. Although this model had high training accuracy, it had lower prediction accuracy (0.71) than other models using different hyperparameter values. This means that the model tends to overfit. The best trade-off was achieved when cp and minsplit were set to 0.0009 and 30 (training accuracy = 0.744 and test accuracy = 0.743).

Random Forest

# Fit random forests on the unbalanced and the balanced training sets and
# report how they score.
#
# numfeat: value passed to randomForest() as `mtry`.
#
# Reads the globals `train_credit`, `balanced_train` and `test_credit`. Prints
# test confusion matrices / accuracies for both forests plus the accuracy of
# the balanced forest on its own training rows.
Random_forest <- function(numfeat) {
  # Grow a forest; predictors are all columns except the trailing target.
  grow <- function(training_data) {
    randomForest(
      training_data[, 1:(ncol(training_data) - 1)],
      training_data$default_payment,
      type = "class",
      mtry = numfeat
    )
  }

  # Predict classes for `data` and tabulate them against its true labels.
  score <- function(forest, data) {
    predicted <- predict(forest, data, type = "class")
    confusionMatrix(predicted, data$default_payment)
  }

  # Fitting and scoring keep their original order so that a fixed seed
  # reproduces the same forests and predictions.
  forest_unbalanced <- grow(train_credit)
  forest_balanced <- grow(balanced_train)

  # Test data
  test_unbalanced <- score(forest_unbalanced, test_credit)
  test_balanced <- score(forest_balanced, test_credit)

  # Training data of the balanced forest
  train_balanced <- score(forest_balanced, balanced_train)

  print("Confusion matrix for R.F.  trained with unbalanced  data set")
  print(test_unbalanced$table)
  print(paste(
    "Accuracy for R.F. trained with unbalanced data set ",
    test_unbalanced$overall[["Accuracy"]]
  ))

  print("Confusion matrix for test data set")
  print(test_balanced$table)
  print(paste("Accuracy for test data set", test_balanced$overall[["Accuracy"]]))

  print(paste(
    "Accuracy for balanced training data",
    train_balanced$overall[["Accuracy"]]
  ))
}
Random_forest(5)
## [1] "Confusion matrix for R.F.  trained with unbalanced  data set"
##           Reference
## Prediction    0    1
##          0 5508 1039
##          1  332  608
## [1] "Accuracy for R.F. trained with unbalanced data set  0.816882596500601"
## [1] "Confusion matrix for test data set"
##           Reference
## Prediction    0    1
##          0 4714  139
##          1 1126 1508
## [1] "Accuracy for test data set 0.831040470148257"
## [1] "Accuracy for balanced training data 0.9946"
Random_forest(10)
## [1] "Confusion matrix for R.F.  trained with unbalanced  data set"
##           Reference
## Prediction    0    1
##          0 5505 1043
##          1  335  604
## [1] "Accuracy for R.F. trained with unbalanced data set  0.815947642580473"
## [1] "Confusion matrix for test data set"
##           Reference
## Prediction    0    1
##          0 4723  142
##          1 1117 1505
## [1] "Accuracy for test data set 0.831841859222653"
## [1] "Accuracy for balanced training data 0.9994"
Random_forest(15)
## [1] "Confusion matrix for R.F.  trained with unbalanced  data set"
##           Reference
## Prediction    0    1
##          0 5502 1032
##          1  338  615
## [1] "Accuracy for R.F. trained with unbalanced data set  0.817016161346334"
## [1] "Confusion matrix for test data set"
##           Reference
## Prediction    0    1
##          0 4708  141
##          1 1132 1506
## [1] "Accuracy for test data set 0.829971951382396"
## [1] "Accuracy for balanced training data 0.9996"

This simulation showed that the random forest algorithm tended to overfit this data: training accuracy was about 1, but test accuracy was 0.83. Nevertheless, the best test accuracy was obtained by the random forest algorithm. When the number of features randomly selected as split candidates was increased, it was observed that the model tended to overfit even more.

Stochastic Gradient Boosting

# stochastic gradient boosting
#
# 5-fold cross-validation repeated twice, over a grid of tree count,
# interaction depth and shrinkage with a fixed minimum node size.
control <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  allowParallel = TRUE
)
tunning <- expand.grid(
  n.trees = c(100, 150, 200),
  interaction.depth = 1:3,
  shrinkage = c(0.01, 0.05, 0.1),
  n.minobsinnode = 20
)

# capture.output() swallows what the fit writes to the console; the fitted
# model itself is kept in `stochastic`.
unwantedoutput <- capture.output(
  stochastic <- train(
    default_payment ~ .,
    data = balanced_train,
    trControl = control,
    method = "gbm",
    tuneGrid = tunning
  )
)
print(stochastic)
## Stochastic Gradient Boosting 
## 
## 10000 samples
##    23 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 8000, 8000, 8000, 8000, 8000, 8000, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  Accuracy  Kappa 
##   0.01       1                  100      0.67240   0.3448
##   0.01       1                  150      0.67815   0.3563
##   0.01       1                  200      0.68650   0.3730
##   0.01       2                  100      0.68555   0.3711
##   0.01       2                  150      0.69490   0.3898
##   0.01       2                  200      0.70060   0.4012
##   0.01       3                  100      0.70165   0.4033
##   0.01       3                  150      0.70525   0.4105
##   0.01       3                  200      0.70915   0.4183
##   0.05       1                  100      0.70045   0.4009
##   0.05       1                  150      0.70550   0.4110
##   0.05       1                  200      0.70685   0.4137
##   0.05       2                  100      0.70990   0.4198
##   0.05       2                  150      0.71265   0.4253
##   0.05       2                  200      0.71395   0.4279
##   0.05       3                  100      0.71100   0.4220
##   0.05       3                  150      0.71395   0.4279
##   0.05       3                  200      0.71530   0.4306
##   0.10       1                  100      0.70660   0.4132
##   0.10       1                  150      0.70815   0.4163
##   0.10       1                  200      0.70675   0.4135
##   0.10       2                  100      0.71190   0.4238
##   0.10       2                  150      0.71145   0.4229
##   0.10       2                  200      0.71185   0.4237
##   0.10       3                  100      0.71335   0.4267
##   0.10       3                  150      0.71345   0.4269
##   0.10       3                  200      0.71225   0.4245
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 20
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
##  3, shrinkage = 0.05 and n.minobsinnode = 20.
# Tune the same GBM grid on the original (unbalanced) training split.
# capture.output() swallows what the fit writes to the console.
unwantedoutput <- capture.output(
  stochastic_unbalanced <- train(
    default_payment ~ .,
    data = train_credit,
    trControl = control,
    method = "gbm",
    tuneGrid = tunning
  )
)
# Show the model that was just fitted. This previously printed `stochastic`
# (the balanced model) a second time, so the summary recorded below is a copy
# of the balanced results rather than the unbalanced ones.
print(stochastic_unbalanced)
## Stochastic Gradient Boosting 
## 
## 10000 samples
##    23 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 8000, 8000, 8000, 8000, 8000, 8000, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  Accuracy  Kappa 
##   0.01       1                  100      0.67240   0.3448
##   0.01       1                  150      0.67815   0.3563
##   0.01       1                  200      0.68650   0.3730
##   0.01       2                  100      0.68555   0.3711
##   0.01       2                  150      0.69490   0.3898
##   0.01       2                  200      0.70060   0.4012
##   0.01       3                  100      0.70165   0.4033
##   0.01       3                  150      0.70525   0.4105
##   0.01       3                  200      0.70915   0.4183
##   0.05       1                  100      0.70045   0.4009
##   0.05       1                  150      0.70550   0.4110
##   0.05       1                  200      0.70685   0.4137
##   0.05       2                  100      0.70990   0.4198
##   0.05       2                  150      0.71265   0.4253
##   0.05       2                  200      0.71395   0.4279
##   0.05       3                  100      0.71100   0.4220
##   0.05       3                  150      0.71395   0.4279
##   0.05       3                  200      0.71530   0.4306
##   0.10       1                  100      0.70660   0.4132
##   0.10       1                  150      0.70815   0.4163
##   0.10       1                  200      0.70675   0.4135
##   0.10       2                  100      0.71190   0.4238
##   0.10       2                  150      0.71145   0.4229
##   0.10       2                  200      0.71185   0.4237
##   0.10       3                  100      0.71335   0.4267
##   0.10       3                  150      0.71345   0.4269
##   0.10       3                  200      0.71225   0.4245
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 20
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
##  3, shrinkage = 0.05 and n.minobsinnode = 20.
# Score the GBM tuned on the unbalanced training split against the test split.
prediction_unbalanced <- predict(stochastic_unbalanced, newdata = test_credit)
error_unbalanced <- confusionMatrix(
  data = prediction_unbalanced,
  reference = test_credit$default_payment
)

error_unbalanced[["table"]]
##           Reference
## Prediction    0    1
##          0 5519 1043
##          1  321  604
error_unbalanced$overall[1]
##  Accuracy 
## 0.8178176
# Score the GBM tuned on the balanced training set against the test split.
prediction_balanced <- predict(stochastic, newdata = test_credit)
error_balanced <- confusionMatrix(
  data = prediction_balanced,
  reference = test_credit$default_payment
)

error_balanced[["table"]]
##           Reference
## Prediction    0    1
##          0 4605  592
##          1 1235 1055
print(paste("Accuracy for test set",error_balanced$overall[1]))
## [1] "Accuracy for test set 0.755977026846534"
# Score the balanced GBM on the rows it was trained on, for comparison with
# its test-split result.
train_prediction <- predict(stochastic, newdata = balanced_train)
error_train <- confusionMatrix(
  data = train_prediction,
  reference = balanced_train$default_payment
)

error_train[["table"]]
##           Reference
## Prediction    0    1
##          0 4004 1725
##          1  996 3275
print(paste("Accuracy for training data",error_train$overall[1]))
## [1] "Accuracy for training data 0.7279"

The number of trees, interaction depth and shrinkage values were chosen as 200, 3 and 0.05 to acquire the best prediction result. However, training accuracy was inconsistent with test accuracy: training error was higher than test error. I could not characterize the model as overfitting or underfitting. This might result from a coding mistake on my part. To summarize, the random forest algorithm gave the best accuracy among the learners.