All features are integers in the range 0–16. The target column is the class label 0–9.
# ---- Setup: libraries, data, resampling control ----
# library() errors on failure; require() only returns FALSE, so it is
# not used here. data.table was previously loaded twice (require + library).
library("caret")
library("dplyr")
library("glmnet")
library(Metrics)
library(rpart)
library(rattle)
library(RColorBrewer)
library(data.table)
library(randomForest)
library(gbm)
library(xgboost)
library(e1071)

# NOTE(review): hard-coded working directory; adjust for your machine.
setwd("C:/Users/asus/Desktop")

# Optdigits data: 64 integer pixel features (V1..V64, range 0..16) plus
# the class label V65 (digit 0..9), converted to a factor for
# classification.
digits_train <- read.table("optdigits.tra.csv", sep = ",")
digits_train$V65 <- as.factor(digits_train$V65)
digits_test <- read.table("optdigits.tes.csv", sep = ",")
digits_test$V65 <- as.factor(digits_test$V65)

# Columns 1 and 40 are constant (all zero) in both sets, so they carry
# no information and are dropped.
digits_train <- digits_train[, -c(1, 40)]
digits_test <- digits_test[, -c(1, 40)]

# 5-fold cross-validation repeated twice, shared by the caret models
# below. TRUE spelled out (T is a reassignable binding, not a keyword).
control <- trainControl(method = "repeatedcv", number = 5, repeats = 2,
                        allowParallel = TRUE)
# Penalized multinomial (logistic) regression via glmnet.
#
# Fits the full regularization path once on the training digits, then
# evaluates class predictions at penalty `s = lambda` on both the test
# and training sets. Prints the test-set confusion matrix and both
# accuracies (side-effect only; no return value of interest).
#
# NOTE(review): the original call passed `type.measure` (a cv.glmnet
# argument) and `trcontrol` (a misspelling of caret::train's trControl);
# neither is a glmnet() argument, so both are dropped here.
PRA <- function(lambda) {
  glmmod <- glmnet(as.matrix(digits_train[, 1:(ncol(digits_train) - 1)]),
                   digits_train$V65, family = "multinomial")
  # Class predictions for the test and training sets at penalty lambda.
  prediction_test <- predict(glmmod,
                             as.matrix(digits_test[, 1:(ncol(digits_test) - 1)]),
                             type = "class", s = lambda)
  test_error <- confusionMatrix(as.factor(prediction_test), digits_test$V65)
  train_prediction <- predict(glmmod,
                              as.matrix(digits_train[, 1:(ncol(digits_train) - 1)]),
                              type = "class", s = lambda)
  error_train <- confusionMatrix(as.factor(train_prediction), digits_train$V65)
  print("Confusion matrix PLR performing on test data ")
  print(test_error$table)
  print(paste("Accuracy for test data", test_error$overall[1]))
  print(paste("Accuracy for training data", error_train$overall[1]))
}
# Evaluate penalized multinomial regression at three penalty strengths;
# accuracy improves as lambda shrinks (0.1 -> 0.05 -> 0.01).
PRA(0.1)
## [1] "Confusion matrix PLR performing on test data "
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 177 3 10 3 2 37 6 2 10 75
## 1 0 137 6 16 9 9 6 0 49 18
## 2 0 27 100 10 0 0 5 0 12 23
## 3 0 0 19 113 0 0 0 9 14 1
## 4 0 5 6 1 142 1 9 4 9 4
## 5 0 0 0 1 2 72 3 0 5 0
## 6 0 2 31 29 0 6 152 0 20 10
## 7 1 8 5 10 18 57 0 163 13 15
## 8 0 0 0 0 0 0 0 0 15 2
## 9 0 0 0 0 8 0 0 1 27 32
## [1] "Accuracy for test data 0.613800779076238"
## [1] "Accuracy for training data 0.636934344755428"
PRA(0.05)
## [1] "Confusion matrix PLR performing on test data "
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 178 1 1 0 0 1 0 0 0 9
## 1 0 148 1 4 17 2 6 0 21 8
## 2 0 23 141 9 0 0 1 1 9 14
## 3 0 0 7 160 0 0 0 2 9 9
## 4 0 0 2 0 152 0 1 4 1 6
## 5 0 1 2 4 2 167 2 7 8 1
## 6 0 2 10 0 0 1 169 0 4 0
## 7 0 2 2 5 7 2 0 157 3 8
## 8 0 2 10 1 0 1 1 1 106 1
## 9 0 3 1 0 3 8 1 7 13 124
## [1] "Accuracy for test data 0.835837506956038"
## [1] "Accuracy for training data 0.867120062777923"
PRA(0.01)
## [1] "Confusion matrix PLR performing on test data "
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 176 0 1 0 0 0 1 0 0 1
## 1 0 156 0 3 3 1 4 0 12 3
## 2 0 10 167 5 0 0 0 0 2 0
## 3 0 0 4 163 0 1 0 0 2 5
## 4 1 0 0 0 174 0 1 3 1 3
## 5 1 1 0 2 0 176 0 4 6 2
## 6 0 1 0 0 0 1 174 0 1 0
## 7 0 0 2 6 0 0 0 161 2 1
## 8 0 5 3 3 3 1 1 2 138 1
## 9 0 9 0 1 1 2 0 9 10 164
## [1] "Accuracy for test data 0.917640511964385"
## [1] "Accuracy for training data 0.948208213444939"
Penalized linear regression gave better results when the lambda value was tuned. PLR with lambda = 0.1 gave the worst results among the tried lambda values (0.1, 0.05, 0.01) in terms of model accuracy. The best results were achieved at lambda = 0.01; this model also has high prediction accuracy (0.92). The test error (≈0.08) is, as expected, slightly higher than the training error (≈0.05), and the difference between the two is relatively small. Therefore, the model does not tend to overfit, while still fitting the training data well. It is safe to say that penalized linear regression with lambda = 0.01 is a good choice.
# Classification Tree
# Classification tree (rpart) on the training digits.
#
# `cp` is the complexity parameter and `minsplit` the minimum number of
# observations required to attempt a split. Prints the test-set
# confusion matrix and the test / training accuracies.
Decision_tree <- function(cp, minsplit) {
  fitted_tree <- rpart(V65 ~ ., digits_train, method = "class",
                       cp = cp, minsplit = minsplit)
  # Hard class predictions for both data sets.
  pred_test <- predict(fitted_tree, digits_test, type = "class")
  pred_train <- predict(fitted_tree, digits_train, type = "class")
  cm_test <- confusionMatrix(as.factor(pred_test), digits_test$V65)
  cm_train <- confusionMatrix(as.factor(pred_train), digits_train$V65)
  print("Confusion matrix for DT performing on test data ")
  print(cm_test$table)
  print(paste("Accuracy for test data", cm_test$overall[1]))
  print(paste("Accuracy for training data", cm_train$overall[1]))
}
# Evaluate the tree at two cp values and two minsplit values; note that
# the minsplit change (20 vs 40) produced identical results.
Decision_tree(0.1,20)
## [1] "Confusion matrix for DT performing on test data "
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 174 0 4 1 6 25 5 0 3 57
## 1 4 182 173 182 175 157 176 179 171 123
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 0 0
## [1] "Accuracy for test data 0.19810795770729"
## [1] "Accuracy for training data 0.193565262882553"
Decision_tree(0.01,20)
## [1] "Confusion matrix for DT performing on test data "
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 173 0 3 0 6 6 5 0 3 1
## 1 1 113 19 2 9 2 3 4 8 5
## 2 0 2 82 2 0 0 0 3 0 0
## 3 0 22 13 153 0 4 0 18 4 21
## 4 2 24 13 1 140 8 9 13 15 1
## 5 1 16 8 7 10 133 0 1 3 4
## 6 0 1 11 1 0 0 163 0 0 0
## 7 0 0 2 2 9 22 0 126 0 8
## 8 1 0 24 13 4 1 1 6 113 2
## 9 0 4 2 2 3 6 0 8 28 138
## [1] "Accuracy for test data 0.74234835837507"
## [1] "Accuracy for training data 0.76876798325922"
Decision_tree(0.01,40)
## [1] "Confusion matrix for DT performing on test data "
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 173 0 3 0 6 6 5 0 3 1
## 1 1 113 19 2 9 2 3 4 8 5
## 2 0 2 82 2 0 0 0 3 0 0
## 3 0 22 13 153 0 4 0 18 4 21
## 4 2 24 13 1 140 8 9 13 15 1
## 5 1 16 8 7 10 133 0 1 3 4
## 6 0 1 11 1 0 0 163 0 0 0
## 7 0 0 2 2 9 22 0 126 0 8
## 8 1 0 24 13 4 1 1 6 113 2
## 9 0 4 2 2 3 6 0 8 28 138
## [1] "Accuracy for test data 0.74234835837507"
## [1] "Accuracy for training data 0.76876798325922"
The decision tree learner with cp = 0.1 and minsplit = 20 gave quite bad results in terms of accuracy. When cp was decreased to 0.01, the learner made much better predictions; increasing the minsplit value (from 20 to 40) had no effect on the results. The best results were obtained with cp = 0.01. However, the classification tree fell behind penalized linear regression with lambda = 0.01 in terms of both test and training accuracy. This might stem from the features' characteristics.
# Random forest classifier on the training digits.
#
# `numfeat` is passed as `mtry`: the number of features sampled at each
# split. Prints the test-set confusion matrix and the test / training
# accuracies.
#
# NOTE(review): the original passed `type = "classification"`, which is
# not a randomForest() argument (the task is inferred from the factor
# response), and `predict(..., type = "class")`, which is not a
# documented predict.randomForest type (documented: "response", "prob",
# "vote") and was treated as "response". Both are corrected here with
# identical behavior.
Random_forest <- function(numfeat) {
  RF <- randomForest(digits_train[, 1:(ncol(digits_train) - 1)],
                     digits_train$V65, mtry = numfeat)
  # Test data
  prediction_test <- predict(RF, digits_test, type = "response")
  error_test <- confusionMatrix(prediction_test, digits_test$V65)
  # Training data (uses all trees, not OOB, hence accuracy of 1).
  train_prediction <- predict(RF, digits_train, type = "response")
  error_train <- confusionMatrix(as.factor(train_prediction), digits_train$V65)
  print("Confusion matrix for test data ")
  print(error_test$table)
  print(paste("Accuracy for test data", error_test$overall[1]))
  print(paste("Accuracy for training data", error_train$overall[1]))
}
# Evaluate the random forest at three mtry values; mtry = 20 gives the
# best test accuracy of the three.
Random_forest(20)
## [1] "Confusion matrix for test data "
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 177 0 1 0 0 0 2 0 0 0
## 1 0 181 1 0 3 0 1 0 4 0
## 2 0 0 175 0 0 0 0 0 0 0
## 3 0 0 0 169 0 0 0 0 0 6
## 4 1 0 0 0 177 1 0 0 0 0
## 5 0 0 0 1 1 176 0 0 1 3
## 6 0 0 0 0 0 2 176 0 0 0
## 7 0 0 0 3 0 0 0 166 0 0
## 8 0 0 0 7 0 1 2 2 168 2
## 9 0 1 0 3 0 2 0 11 1 169
## [1] "Accuracy for test data 0.964941569282137"
## [1] "Accuracy for training data 1"
Random_forest(30)
## [1] "Confusion matrix for test data "
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 176 0 1 0 0 0 1 0 0 0
## 1 0 179 1 1 4 0 1 0 4 1
## 2 0 0 174 0 0 0 0 0 0 0
## 3 0 1 0 165 0 0 0 0 0 5
## 4 1 0 0 0 176 1 0 0 0 0
## 5 1 0 0 1 1 175 0 2 1 4
## 6 0 0 0 0 0 2 176 0 0 0
## 7 0 0 1 3 0 0 0 167 0 0
## 8 0 1 0 10 0 1 3 2 166 2
## 9 0 1 0 3 0 3 0 8 3 168
## [1] "Accuracy for test data 0.958263772954925"
## [1] "Accuracy for training data 1"
Random_forest(40)
## [1] "Confusion matrix for test data "
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 174 0 1 0 0 0 1 0 0 0
## 1 0 180 1 2 7 0 2 0 3 1
## 2 0 0 173 1 0 0 0 0 0 0
## 3 0 1 0 164 0 1 0 0 1 6
## 4 2 0 0 0 172 1 0 0 0 1
## 5 2 0 0 1 1 175 0 2 1 4
## 6 0 0 0 0 0 2 175 0 0 0
## 7 0 0 1 2 1 0 0 166 0 0
## 8 0 0 1 9 0 1 3 2 165 2
## 9 0 1 0 4 0 2 0 9 4 166
## [1] "Accuracy for test data 0.951585976627713"
## [1] "Accuracy for training data 1"
Random forest gave the highest training accuracy (1) among the learners tried so far, and at the same time its prediction accuracy on the test set is very good. Therefore, random forest is the best option so far for predicting the target value in this data set.
# ---- Stochastic Gradient Boosting (caret + gbm) ----
# 5-fold cross-validation repeated twice for hyper-parameter tuning.
# TRUE/FALSE spelled out (T and F are reassignable bindings).
control <- trainControl(method = "repeatedcv", number = 5, repeats = 2,
                        allowParallel = TRUE)
# Grid over number of trees, tree depth and learning rate (shrinkage);
# minimum node size held fixed at 20.
tunning <- expand.grid(n.trees = c(100, 150, 200), interaction.depth = c(1:3),
                       shrinkage = c(0.01, 0.05, 0.1), n.minobsinnode = c(20))
stochastic <- train(V65 ~ ., data = digits_train, method = "gbm",
                    trControl = control, tuneGrid = tunning, verbose = FALSE)
print(stochastic)
## Stochastic Gradient Boosting
##
## 3823 samples
## 62 predictor
## 10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 3058, 3057, 3060, 3057, 3060, 3058, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees Accuracy Kappa
## 0.01 1 100 0.7751648 0.7501834
## 0.01 1 150 0.8142751 0.7936370
## 0.01 1 200 0.8448804 0.8276387
## 0.01 2 100 0.8660671 0.8511840
## 0.01 2 150 0.8873861 0.8748706
## 0.01 2 200 0.9045211 0.8939086
## 0.01 3 100 0.9025542 0.8917225
## 0.01 3 150 0.9215196 0.9127958
## 0.01 3 200 0.9360369 0.9289273
## 0.05 1 100 0.9136742 0.9040785
## 0.05 1 150 0.9356451 0.9284919
## 0.05 1 200 0.9475453 0.9417150
## 0.05 2 100 0.9518649 0.9465146
## 0.05 2 150 0.9629817 0.9588671
## 0.05 2 200 0.9683459 0.9648277
## 0.05 3 100 0.9607576 0.9563957
## 0.05 3 150 0.9676906 0.9640995
## 0.05 3 200 0.9718781 0.9687524
## 0.10 1 100 0.9457175 0.9396841
## 0.10 1 150 0.9563121 0.9514563
## 0.10 1 200 0.9611528 0.9568351
## 0.10 2 100 0.9640312 0.9600335
## 0.10 2 150 0.9705682 0.9672970
## 0.10 2 200 0.9733157 0.9703500
## 0.10 3 100 0.9718779 0.9687522
## 0.10 3 150 0.9746242 0.9718038
## 0.10 3 200 0.9759323 0.9732572
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 20
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
## 3, shrinkage = 0.1 and n.minobsinnode = 20.
# Evaluate the tuned GBM on the held-out test set and on the training
# data it was fitted on.
gbm_pred_test <- predict(stochastic, digits_test)
gbm_cm_test <- confusionMatrix(gbm_pred_test, digits_test$V65)
gbm_pred_train <- predict(stochastic, digits_train)
gbm_cm_train <- confusionMatrix(as.factor(gbm_pred_train), digits_train$V65)
print(gbm_cm_test$table)
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 176 0 0 0 0 0 2 0 0 0
## 1 0 178 1 1 3 0 2 0 4 0
## 2 0 0 174 1 0 0 0 0 0 0
## 3 0 1 1 171 0 0 0 0 0 1
## 4 1 1 0 0 175 1 0 0 0 0
## 5 1 0 0 1 0 177 0 3 1 3
## 6 0 0 0 0 0 2 175 0 0 0
## 7 0 0 1 3 1 0 0 164 0 0
## 8 0 0 0 2 1 0 2 2 160 2
## 9 0 2 0 4 1 2 0 10 9 174
print(paste("Accuracy for test data ", gbm_cm_test$overall[1]))
## [1] "Accuracy for test data 0.95937673900946"
print(paste("Accuracy for training data", gbm_cm_train$overall[1]))
## [1] "Accuracy for training data 1"
Stochastic gradient boosting is also a good option for modelling this data, like random forest. It gave high test and training accuracy with number of trees = 200, depth = 3 and shrinkage = 0.1. Its test accuracy (0.959) is slightly lower than that of the best random forest (0.965), but comparable. Neither underfitting nor overfitting was observed.