23 features are used to predict the target column. Some features are categorical and some are numeric. Categorical features are converted into dummy variables for linear regression. The original data set is used by the other learners, such as the decision tree and random forest.
Features
X1: Amount of the given credit (NT dollar)
X2: Gender (1 = male; 2 = female)
X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others)
X4: Marital status (1 = married; 2 = single; 3 = others)
X5: Age (year)
X6 - X11: History of past payment
X12-X17: Amount of bill statement (NT dollar)
X18-X23: Amount of previous payment (NT dollar)
Target
Default Payment (Yes = 1, No = 0)
# Load modeling and utility packages.
# NOTE: library() is used throughout instead of require() -- require() only
# returns FALSE on a missing package, while library() fails loudly.
# The duplicate load of data.table (previously require() then library()) was
# collapsed into a single library() call.
library("caret")
library("dplyr")
library("glmnet")
library(Metrics)
library(rpart)
library(rattle)
library(RColorBrewer)
library(data.table)
library(randomForest)
library(gbm)
library(xgboost)
library(e1071)
# Read the credit-default data from the system clipboard (tab-separated).
# NOTE(review): clipboard input is interactive-only and non-reproducible --
# consider reading from a file path instead.
credit = read.table(file = "clipboard", sep = "\t",header = T)
# Drop the first column (presumably a row-ID column -- confirm against the raw export).
credit = credit[,-1]
# The pasted sheet appears to carry two header rows: promote the first data
# row (the X1..X23 labels) to column names, then delete that row below.
colnames(credit) <- credit[1,]
# Give the last column (the target) a descriptive name.
names(credit)[ncol(credit)] = "default_payment"
credit <- credit[-1,]
# Target as factor so the learners treat this as a classification problem.
credit$default_payment = as.factor(credit$default_payment)
# Convert predictor columns to appropriate types:
#   columns 2:4 and 6:11 (gender, education, marital status, payment history)
#   are categorical -> factor; all others (credit amount, age, bill/payment
#   amounts) -> numeric.
# The original loop had a separate as.double(as.numeric(...)) branch for
# column 5 (age); as.numeric() already returns a double, so that redundant
# branch is merged into the numeric case (behavior unchanged).
categorical_cols <- c(2:4, 6:11)
for (i in seq_len(23)) {
  if (i %in% categorical_cols) {
    credit[, i] <- as.factor(credit[, i])
  } else {
    credit[, i] <- as.numeric(credit[, i])
  }
}
# convert categorical variables to dummy
# glmnet needs a numeric matrix, so one-hot encode the factor predictors.
# fullRank = T drops one reference level per factor to avoid collinearity.
Dummies = dummyVars("~.", data = credit[,-ncol(credit)],fullRank = T,drop2nd = T)
credit_dummy = data.frame(predict(Dummies, newdata = credit[,-ncol(credit)]))
# Re-attach the factor target, which was excluded from the encoding above.
credit_dummy$default_payment <- credit$default_payment
# Split data into training (~75%) and test (~25%) sets; the same row index is
# used for both the raw and the dummy-encoded frames so they stay aligned.
set.seed(121)
idx <- sample(seq(1, 2), size = nrow(credit), replace = TRUE, prob = c(.75,.25))
train_credit <- credit[idx == 1,]
test_credit <- credit[idx == 2,]
dummy_train <- credit_dummy[idx == 1,]
dummy_test <- credit_dummy[idx == 2,]
# To address the class-imbalance problem, build class-balanced training sets.
# BUG FIX: the balanced sets were previously sampled from the FULL data set
# (credit / credit_dummy), so test-set rows leaked into the balanced training
# data and inflated the reported test accuracies. They are now sampled from
# the training split only.
only_0 <- as.data.frame(train_credit[train_credit$default_payment == "0",])
only_1 <- as.data.frame(train_credit[train_credit$default_payment == "1",])
only_0_dummy <- as.data.frame(dummy_train[dummy_train$default_payment == "0",])
only_1_dummy <- as.data.frame(dummy_train[dummy_train$default_payment == "1",])
# Take at most 5000 rows per class, capped by the rarer class's availability
# in the training split (sample_n() errors if asked for more rows than exist).
n_per_class <- min(5000, nrow(only_0), nrow(only_1))
balanced_train <- rbind(sample_n(only_1, n_per_class),
                        sample_n(only_0, n_per_class))
train_balanced_dummy <- rbind(sample_n(only_1_dummy, n_per_class),
                              sample_n(only_0_dummy, n_per_class))
# Cross-validated L1-regularised logistic regression (family = "binomial");
# type.measure = "class" selects lambda by misclassification rate.
# unbalanced training set
glmmod = cv.glmnet(as.matrix(dummy_train[,1:(ncol(dummy_train)-1)]),dummy_train$default_payment, family = "binomial",type.measure = "class")
# balanced training set
glmmod_balanced = cv.glmnet(as.matrix(train_balanced_dummy[,1:(ncol(train_balanced_dummy)-1)]),train_balanced_dummy$default_payment, family = "binomial",type.measure = "class")
# Evaluate test-set performance of the two penalised logistic models at a
# caller-supplied lambda (passed through to predict() as s = x): one model was
# fitted on the unbalanced training data, the other on the balanced data.
# Prints confusion matrices and accuracies for both; uses the file-level
# globals glmmod, glmmod_balanced and dummy_test.
linear_regression <- function(x){
  # Test-set design matrix (all columns except the target).
  feature_matrix <- as.matrix(dummy_test[, 1:(ncol(dummy_test) - 1)])
  # Model trained on the unbalanced data set.
  pred_unbalanced <- predict(glmmod, feature_matrix, type = "class", s = x)
  cm_unbalanced <- confusionMatrix(as.factor(pred_unbalanced),
                                   dummy_test$default_payment)
  # Model trained on the balanced data set.
  pred_balanced <- predict(glmmod_balanced, feature_matrix, type = "class", s = x)
  cm_balanced <- confusionMatrix(as.factor(pred_balanced),
                                 dummy_test$default_payment)
  print("Confusion matrix for model trained unbalanced data set")
  print(cm_unbalanced$table)
  print(paste("Accuracy for model trained unbalanced data set", cm_unbalanced$overall[1]))
  print("Confusion matrix for the model trained balanced data set")
  print(cm_balanced$table)
  print(paste("Accuracy for model trained balanced data set", cm_balanced$overall[1]))
}
# Evaluate both glmnet models at three lambda values; the "##" lines are the
# captured console output from a previous run.
linear_regression(0.009)
## [1] "Confusion matrix for model trained unbalanced data set"
## Reference
## Prediction 0 1
## 0 5576 1088
## 1 264 559
## [1] "Accuracy for model trained unbalanced data set 0.819420328569521"
## [1] "Confusion matrix for the model trained balanced data set"
## Reference
## Prediction 0 1
## 0 4871 716
## 1 969 931
## [1] "Accuracy for model trained balanced data set 0.774943234940564"
linear_regression(0.01)
## [1] "Confusion matrix for model trained unbalanced data set"
## Reference
## Prediction 0 1
## 0 5582 1093
## 1 258 554
## [1] "Accuracy for model trained unbalanced data set 0.819553893415253"
## [1] "Confusion matrix for the model trained balanced data set"
## Reference
## Prediction 0 1
## 0 4869 714
## 1 971 933
## [1] "Accuracy for model trained balanced data set 0.774943234940564"
linear_regression(0.05)
## [1] "Confusion matrix for model trained unbalanced data set"
## Reference
## Prediction 0 1
## 0 5739 1345
## 1 101 302
## [1] "Accuracy for model trained unbalanced data set 0.806865233070656"
## [1] "Confusion matrix for the model trained balanced data set"
## Reference
## Prediction 0 1
## 0 5132 827
## 1 708 820
## [1] "Accuracy for model trained balanced data set 0.794977961800454"
The data set does not have equally distributed target values, which causes an imbalance problem. Therefore, two data sets were examined: one is the original data set, and the other was created from the original data set with equally distributed target values. Looking at the model trained with the unbalanced data, different lambda values did not make a difference in terms of accuracy. Thus, the model trained with the equally distributed data was examined. The best results were obtained with lambda = 0.05, achieving a test accuracy of 0.79.
# Classification Tree
# Fit rpart classification trees on both the unbalanced and the class-balanced
# training sets at the given complexity parameter (cp) and minimum split size,
# then print test-set confusion matrices / accuracies for both, plus the
# balanced model's training accuracy (to gauge overfitting). Uses the
# file-level globals train_credit, balanced_train and test_credit.
Decision_tree <- function(cp,minsplit){
  tree_unbalanced <- rpart(default_payment ~ ., train_credit, method = "class",
                           cp = cp, minsplit = minsplit)
  tree_balanced <- rpart(default_payment ~ ., balanced_train, method = "class",
                         cp = cp, minsplit = minsplit)
  # Test-set performance of each model.
  pred_unbalanced <- predict(tree_unbalanced, test_credit, type = "class")
  cm_unbalanced <- confusionMatrix(pred_unbalanced, test_credit$default_payment)
  pred_balanced <- predict(tree_balanced, test_credit, type = "class")
  cm_balanced <- confusionMatrix(pred_balanced, test_credit$default_payment)
  # Training accuracy of the balanced model.
  pred_train <- predict(tree_balanced, balanced_train, type = "class")
  cm_train <- confusionMatrix(pred_train, balanced_train$default_payment)
  print("Confusion matrix for D.T. trained with unbalanced data set")
  print(cm_unbalanced$table)
  print(paste("Accuracy for D.T. trained with unbalanced data set", cm_unbalanced$overall[1]))
  print("Confusion matrix for D.T. test data set")
  print(cm_balanced$table)
  print(paste("Accuracy for test data set", cm_balanced$overall[1]))
  print(paste("Accuracy for balanced training data", cm_train$overall[1]))
}
# Tree runs at three (cp, minsplit) settings; "##" lines are captured output.
Decision_tree(0.0009,30)
## [1] "Confusion matrix for D.T. trained with unbalanced data set"
## Reference
## Prediction 0 1
## 0 5541 1057
## 1 299 590
## [1] "Accuracy for D.T. trained with unbalanced data set 0.81888606918659"
## [1] "Confusion matrix for D.T. test data set"
## Reference
## Prediction 0 1
## 0 4471 550
## 1 1369 1097
## [1] "Accuracy for test data set 0.743689061039134"
## [1] "Accuracy for balanced training data 0.7444"
Decision_tree(0.0001,30)
## [1] "Confusion matrix for D.T. trained with unbalanced data set"
## Reference
## Prediction 0 1
## 0 5244 981
## 1 596 666
## [1] "Accuracy for D.T. trained with unbalanced data set 0.789368238279685"
## [1] "Confusion matrix for D.T. test data set"
## Reference
## Prediction 0 1
## 0 4008 367
## 1 1832 1280
## [1] "Accuracy for test data set 0.706290904234006"
## [1] "Accuracy for balanced training data 0.8006"
Decision_tree(0.0001,10)
## [1] "Confusion matrix for D.T. trained with unbalanced data set"
## Reference
## Prediction 0 1
## 0 4971 1010
## 1 869 637
## [1] "Accuracy for D.T. trained with unbalanced data set 0.749031654868439"
## [1] "Confusion matrix for D.T. test data set"
## Reference
## Prediction 0 1
## 0 4006 273
## 1 1834 1374
## [1] "Accuracy for test data set 0.718578870041405"
## [1] "Accuracy for balanced training data 0.8948"
The highest training accuracy (0.89) was obtained when cp and minsplit were equal to 0.0001 and 10. Although this model had high training accuracy, it had lower prediction accuracy (0.71) than the models using other hyperparameter values. This means that this model tends to overfit. The optimal results were achieved when cp and minsplit were set to 0.0009 and 30 (training accuracy = 0.744 and test accuracy = 0.743).
# Random forest with a caller-chosen number of candidate features per split
# (mtry = numfeat). Trains on both the unbalanced and the balanced training
# sets and prints test-set confusion matrices / accuracies for both, plus the
# balanced model's training accuracy. Uses the file-level globals
# train_credit, balanced_train and test_credit.
Random_forest <- function(numfeat){
  rf_unbalanced <- randomForest(train_credit[, 1:(ncol(train_credit) - 1)],
                                train_credit$default_payment,
                                type = "class", mtry = numfeat)
  rf_balanced <- randomForest(balanced_train[, 1:(ncol(balanced_train) - 1)],
                              balanced_train$default_payment,
                              type = "class", mtry = numfeat)
  # Test-set performance of each model.
  pred_unbalanced <- predict(rf_unbalanced, test_credit, type = "class")
  cm_unbalanced <- confusionMatrix(pred_unbalanced, test_credit$default_payment)
  pred_balanced <- predict(rf_balanced, test_credit, type = "class")
  cm_balanced <- confusionMatrix(pred_balanced, test_credit$default_payment)
  # Training accuracy of the balanced model.
  pred_train <- predict(rf_balanced, balanced_train, type = "class")
  cm_train <- confusionMatrix(pred_train, balanced_train$default_payment)
  print("Confusion matrix for R.F. trained with unbalanced data set")
  print(cm_unbalanced$table)
  print(paste("Accuracy for R.F. trained with unbalanced data set ", cm_unbalanced$overall[1]))
  print("Confusion matrix for test data set")
  print(cm_balanced$table)
  print(paste("Accuracy for test data set", cm_balanced$overall[1]))
  print(paste("Accuracy for balanced training data", cm_train$overall[1]))
}
# Random-forest runs at three mtry values; "##" lines are captured output.
Random_forest(5)
## [1] "Confusion matrix for R.F. trained with unbalanced data set"
## Reference
## Prediction 0 1
## 0 5508 1039
## 1 332 608
## [1] "Accuracy for R.F. trained with unbalanced data set 0.816882596500601"
## [1] "Confusion matrix for test data set"
## Reference
## Prediction 0 1
## 0 4714 139
## 1 1126 1508
## [1] "Accuracy for test data set 0.831040470148257"
## [1] "Accuracy for balanced training data 0.9946"
Random_forest(10)
## [1] "Confusion matrix for R.F. trained with unbalanced data set"
## Reference
## Prediction 0 1
## 0 5505 1043
## 1 335 604
## [1] "Accuracy for R.F. trained with unbalanced data set 0.815947642580473"
## [1] "Confusion matrix for test data set"
## Reference
## Prediction 0 1
## 0 4723 142
## 1 1117 1505
## [1] "Accuracy for test data set 0.831841859222653"
## [1] "Accuracy for balanced training data 0.9994"
Random_forest(15)
## [1] "Confusion matrix for R.F. trained with unbalanced data set"
## Reference
## Prediction 0 1
## 0 5502 1032
## 1 338 615
## [1] "Accuracy for R.F. trained with unbalanced data set 0.817016161346334"
## [1] "Confusion matrix for test data set"
## Reference
## Prediction 0 1
## 0 4708 141
## 1 1132 1506
## [1] "Accuracy for test data set 0.829971951382396"
## [1] "Accuracy for balanced training data 0.9996"
This simulation showed that the random forest algorithm tended to overfit on this data: training accuracy was about 1 but test accuracy was equal to 0.83. Nevertheless, the best test accuracy was obtained by the random forest algorithm. When the number of features randomly selected as split candidates (mtry) was increased, the model tended even more strongly to overfit.
# stochastic gradient boosting
# 5-fold cross-validation repeated twice; caret grid search over
# n.trees / interaction.depth / shrinkage (n.minobsinnode held at 20).
control <- trainControl(method = "repeatedcv",number = 5, repeats = 2, allowParallel = T)
tunning <- expand.grid(n.trees = c(100,150,200), interaction.depth=c(1:3), shrinkage=c(0.01,0.05,0.1), n.minobsinnode=c(20))
# capture.output() swallows gbm's verbose per-iteration training log.
unwantedoutput = capture.output(stochastic <- train(default_payment~., data = balanced_train,trControl = control,method = "gbm",tuneGrid = tunning))
print(stochastic)
## Stochastic Gradient Boosting
##
## 10000 samples
## 23 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 8000, 8000, 8000, 8000, 8000, 8000, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees Accuracy Kappa
## 0.01 1 100 0.67240 0.3448
## 0.01 1 150 0.67815 0.3563
## 0.01 1 200 0.68650 0.3730
## 0.01 2 100 0.68555 0.3711
## 0.01 2 150 0.69490 0.3898
## 0.01 2 200 0.70060 0.4012
## 0.01 3 100 0.70165 0.4033
## 0.01 3 150 0.70525 0.4105
## 0.01 3 200 0.70915 0.4183
## 0.05 1 100 0.70045 0.4009
## 0.05 1 150 0.70550 0.4110
## 0.05 1 200 0.70685 0.4137
## 0.05 2 100 0.70990 0.4198
## 0.05 2 150 0.71265 0.4253
## 0.05 2 200 0.71395 0.4279
## 0.05 3 100 0.71100 0.4220
## 0.05 3 150 0.71395 0.4279
## 0.05 3 200 0.71530 0.4306
## 0.10 1 100 0.70660 0.4132
## 0.10 1 150 0.70815 0.4163
## 0.10 1 200 0.70675 0.4135
## 0.10 2 100 0.71190 0.4238
## 0.10 2 150 0.71145 0.4229
## 0.10 2 200 0.71185 0.4237
## 0.10 3 100 0.71335 0.4267
## 0.10 3 150 0.71345 0.4269
## 0.10 3 200 0.71225 0.4245
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 20
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
## 3, shrinkage = 0.05 and n.minobsinnode = 20.
# Fit the same tuning grid on the unbalanced (original) training split.
unwantedoutput = capture.output(stochastic_unbalanced <- train(default_payment~., data = train_credit,trControl = control,method = "gbm",tuneGrid = tunning))
# BUG FIX: the original code printed `stochastic` (the balanced model) again
# here instead of `stochastic_unbalanced`. The captured "##" output below is
# therefore a stale byte-for-byte duplicate of the balanced model's summary
# and should be regenerated on the next run.
print(stochastic_unbalanced)
## Stochastic Gradient Boosting
##
## 10000 samples
## 23 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 8000, 8000, 8000, 8000, 8000, 8000, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees Accuracy Kappa
## 0.01 1 100 0.67240 0.3448
## 0.01 1 150 0.67815 0.3563
## 0.01 1 200 0.68650 0.3730
## 0.01 2 100 0.68555 0.3711
## 0.01 2 150 0.69490 0.3898
## 0.01 2 200 0.70060 0.4012
## 0.01 3 100 0.70165 0.4033
## 0.01 3 150 0.70525 0.4105
## 0.01 3 200 0.70915 0.4183
## 0.05 1 100 0.70045 0.4009
## 0.05 1 150 0.70550 0.4110
## 0.05 1 200 0.70685 0.4137
## 0.05 2 100 0.70990 0.4198
## 0.05 2 150 0.71265 0.4253
## 0.05 2 200 0.71395 0.4279
## 0.05 3 100 0.71100 0.4220
## 0.05 3 150 0.71395 0.4279
## 0.05 3 200 0.71530 0.4306
## 0.10 1 100 0.70660 0.4132
## 0.10 1 150 0.70815 0.4163
## 0.10 1 200 0.70675 0.4135
## 0.10 2 100 0.71190 0.4238
## 0.10 2 150 0.71145 0.4229
## 0.10 2 200 0.71185 0.4237
## 0.10 3 100 0.71335 0.4267
## 0.10 3 150 0.71345 0.4269
## 0.10 3 200 0.71225 0.4245
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 20
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
## 3, shrinkage = 0.05 and n.minobsinnode = 20.
# Test-set evaluation of the gbm model trained on the unbalanced split.
prediction_unbalanced <- predict(stochastic_unbalanced,test_credit)
error_unbalanced <- confusionMatrix(prediction_unbalanced,test_credit$default_payment)
error_unbalanced$table
## Reference
## Prediction 0 1
## 0 5519 1043
## 1 321 604
error_unbalanced$overall[1]
## Accuracy
## 0.8178176
# Test-set evaluation of the gbm model trained on the balanced split.
prediction_balanced <- predict(stochastic,test_credit)
error_balanced <- confusionMatrix(prediction_balanced,test_credit$default_payment)
error_balanced$table
## Reference
## Prediction 0 1
## 0 4605 592
## 1 1235 1055
print(paste("Accuracy for test set",error_balanced$overall[1]))
## [1] "Accuracy for test set 0.755977026846534"
# Training accuracy of the balanced model, for an overfit/underfit check.
train_prediction<- predict(stochastic,balanced_train)
error_train <- confusionMatrix(train_prediction,balanced_train$default_payment)
error_train$table
## Reference
## Prediction 0 1
## 0 4004 1725
## 1 996 3275
print(paste("Accuracy for training data",error_train$overall[1]))
## [1] "Accuracy for training data 0.7279"
The number of trees, depth and shrinkage values were chosen as 200, 3 and 0.05 to acquire the best prediction result. However, training accuracy was inconsistent with test accuracy: training error was higher than test error. I did not characterize the model as overfitting or underfitting; this might result from a coding mistake. To summarize, the random forest algorithm gave the best accuracy among the learners.