Optical Recognition of Handwritten Digits Data Set

Feature Information

All features are integers in the range 0..16. The target column is the class label 0..9.

library("caret")
library("dplyr")
library("glmnet")
library(Metrics)
require(rpart)
library(rattle)
library(RColorBrewer)
require(data.table,quietly = TRUE)
library(data.table)
library(randomForest)
library(gbm)
library(xgboost)
library(e1071)

Data Manipulation Part

# NOTE(review): hard-coded machine-specific working directory -- the optdigits
# CSV files are expected to live here.
setwd("C:/Users/asus/Desktop")

# Files have no header row, so read.table() names the columns V1..V65.
# V1..V64 are pixel-count features (integers 0..16); V65 is the digit label.
digits_train = read.table("optdigits.tra.csv",sep=",")
digits_train$V65 = as.factor(digits_train$V65)  # label as factor -> classification
digits_test = read.table("optdigits.tes.csv",sep=",")
digits_test$V65 = as.factor(digits_test$V65)  # same encoding for the test labels

digits_train <- digits_train[,-c(1,40)] # all variables belonging to these columns are equal to zero

digits_test <- digits_test[,-c(1,40)] # all variables belonging to these columns are equal to zero

Penalized Linear Regression

# Resampling specification for caret-style training:
# 5-fold cross-validation repeated twice, with parallel execution allowed.
control <- trainControl(method = "repeatedcv",number = 5, repeats = 2, allowParallel = T)

# Penalized (multinomial) logistic regression via glmnet.
#
# Fits an L1-penalized multinomial model on the training digits, then reports
# the test-set confusion matrix and the accuracy on both the test and the
# training data, evaluated at the requested penalty strength.
#
# Args:
#   lambda: penalty strength at which predictions are extracted (predict's `s`).
#
# Side effects: prints results; returns nothing useful.
# Depends on globals `digits_train` and `digits_test` created above.
PRA = function(lambda){

  # Fix: glmnet() has no `type.measure` or `trcontrol` arguments --
  # `type.measure` belongs to cv.glmnet() and `trControl` to caret::train().
  # Both bogus arguments were removed; the fit itself is unchanged.
  x_train = as.matrix(digits_train[,1:(ncol(digits_train)-1)])
  glmmod = glmnet(x_train, digits_train$V65, family = "multinomial")

  # Class predictions for the test and training sets at penalty s = lambda.
  x_test = as.matrix(digits_test[,1:(ncol(digits_test)-1)])
  prediction_test = predict(glmmod, x_test, type ="class", s = lambda)
  test_error= confusionMatrix(as.factor(prediction_test),digits_test$V65)

  train_prediction<- predict(glmmod, x_train, type ="class", s = lambda)
  error_train <- confusionMatrix(as.factor(train_prediction),digits_train$V65)

  print("Confusion matrix PLR performing on test data ")
  print(test_error$table)
  print(paste("Accuracy for test data", test_error$overall[1]))
  print(paste("Accuracy for training data", error_train$overall[1]))

}
PRA(0.1)
## [1] "Confusion matrix PLR performing on test data "
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 177   3  10   3   2  37   6   2  10  75
##          1   0 137   6  16   9   9   6   0  49  18
##          2   0  27 100  10   0   0   5   0  12  23
##          3   0   0  19 113   0   0   0   9  14   1
##          4   0   5   6   1 142   1   9   4   9   4
##          5   0   0   0   1   2  72   3   0   5   0
##          6   0   2  31  29   0   6 152   0  20  10
##          7   1   8   5  10  18  57   0 163  13  15
##          8   0   0   0   0   0   0   0   0  15   2
##          9   0   0   0   0   8   0   0   1  27  32
## [1] "Accuracy for test data 0.613800779076238"
## [1] "Accuracy for training data 0.636934344755428"
PRA(0.05)
## [1] "Confusion matrix PLR performing on test data "
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 178   1   1   0   0   1   0   0   0   9
##          1   0 148   1   4  17   2   6   0  21   8
##          2   0  23 141   9   0   0   1   1   9  14
##          3   0   0   7 160   0   0   0   2   9   9
##          4   0   0   2   0 152   0   1   4   1   6
##          5   0   1   2   4   2 167   2   7   8   1
##          6   0   2  10   0   0   1 169   0   4   0
##          7   0   2   2   5   7   2   0 157   3   8
##          8   0   2  10   1   0   1   1   1 106   1
##          9   0   3   1   0   3   8   1   7  13 124
## [1] "Accuracy for test data 0.835837506956038"
## [1] "Accuracy for training data 0.867120062777923"
PRA(0.01)
## [1] "Confusion matrix PLR performing on test data "
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 176   0   1   0   0   0   1   0   0   1
##          1   0 156   0   3   3   1   4   0  12   3
##          2   0  10 167   5   0   0   0   0   2   0
##          3   0   0   4 163   0   1   0   0   2   5
##          4   1   0   0   0 174   0   1   3   1   3
##          5   1   1   0   2   0 176   0   4   6   2
##          6   0   1   0   0   0   1 174   0   1   0
##          7   0   0   2   6   0   0   0 161   2   1
##          8   0   5   3   3   3   1   1   2 138   1
##          9   0   9   0   1   1   2   0   9  10 164
## [1] "Accuracy for test data 0.917640511964385"
## [1] "Accuracy for training data 0.948208213444939"

Penalized linear regression gave better results when the lambda value was tuned. PLR with lambda = 0.1 gave the worst results among the tried lambda values (0.1, 0.05, 0.01) in terms of model accuracy. The best result was achieved at lambda = 0.01, which also has high prediction accuracy (0.92). In addition, the test error (about 0.08) is higher than the training error (about 0.05), but the difference between them is relatively small. Therefore, the model does not tend to overfit, while still fitting the training data well. It is safe to say that penalized linear regression with lambda = 0.01 is a good choice.

Decision Tree

# Classification Tree


# Classification tree (rpart) on the training digits.
#
# Fits a tree with the given complexity parameter and minimum-split size,
# then prints the test-set confusion matrix and the accuracy on both the
# test and training data.
#
# Args:
#   cp:       complexity parameter controlling how aggressively the tree is pruned.
#   minsplit: minimum number of observations in a node before a split is attempted.
#
# Depends on globals `digits_train` and `digits_test`.
Decision_tree <- function(cp,minsplit){

  fitted_tree <- rpart(V65 ~ ., digits_train, method = "class",
                       cp = cp, minsplit = minsplit)

  # Hard class predictions for both data sets.
  pred_test <- predict(fitted_tree, digits_test, type = "class")
  pred_train <- predict(fitted_tree, digits_train, type = "class")

  error_test <- confusionMatrix(as.factor(pred_test), digits_test$V65)
  error_train <- confusionMatrix(as.factor(pred_train), digits_train$V65)

  print("Confusion matrix for DT performing on test data ")
  print(error_test$table)
  print(paste("Accuracy for test data", error_test$overall[1]))

  print(paste("Accuracy for training data", error_train$overall[1]))

  }
Decision_tree(0.1,20)
## [1] "Confusion matrix for DT performing on test data "
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 174   0   4   1   6  25   5   0   3  57
##          1   4 182 173 182 175 157 176 179 171 123
##          2   0   0   0   0   0   0   0   0   0   0
##          3   0   0   0   0   0   0   0   0   0   0
##          4   0   0   0   0   0   0   0   0   0   0
##          5   0   0   0   0   0   0   0   0   0   0
##          6   0   0   0   0   0   0   0   0   0   0
##          7   0   0   0   0   0   0   0   0   0   0
##          8   0   0   0   0   0   0   0   0   0   0
##          9   0   0   0   0   0   0   0   0   0   0
## [1] "Accuracy for test data 0.19810795770729"
## [1] "Accuracy for training data 0.193565262882553"
Decision_tree(0.01,20)
## [1] "Confusion matrix for DT performing on test data "
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 173   0   3   0   6   6   5   0   3   1
##          1   1 113  19   2   9   2   3   4   8   5
##          2   0   2  82   2   0   0   0   3   0   0
##          3   0  22  13 153   0   4   0  18   4  21
##          4   2  24  13   1 140   8   9  13  15   1
##          5   1  16   8   7  10 133   0   1   3   4
##          6   0   1  11   1   0   0 163   0   0   0
##          7   0   0   2   2   9  22   0 126   0   8
##          8   1   0  24  13   4   1   1   6 113   2
##          9   0   4   2   2   3   6   0   8  28 138
## [1] "Accuracy for test data 0.74234835837507"
## [1] "Accuracy for training data 0.76876798325922"
Decision_tree(0.01,40)
## [1] "Confusion matrix for DT performing on test data "
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 173   0   3   0   6   6   5   0   3   1
##          1   1 113  19   2   9   2   3   4   8   5
##          2   0   2  82   2   0   0   0   3   0   0
##          3   0  22  13 153   0   4   0  18   4  21
##          4   2  24  13   1 140   8   9  13  15   1
##          5   1  16   8   7  10 133   0   1   3   4
##          6   0   1  11   1   0   0 163   0   0   0
##          7   0   0   2   2   9  22   0 126   0   8
##          8   1   0  24  13   4   1   1   6 113   2
##          9   0   4   2   2   3   6   0   8  28 138
## [1] "Accuracy for test data 0.74234835837507"
## [1] "Accuracy for training data 0.76876798325922"

The decision tree learner with cp = 0.1 and minsplit = 20 gave quite bad results in terms of accuracy. When cp was decreased to 0.01, the learner made much better predictions, while changing the minsplit value (20 vs. 40) had no effect on the results. The best results were obtained with cp = 0.01. However, the classification tree fell behind penalized linear regression with lambda = 0.01 in terms of both test and training accuracy. This might stem from the characteristics of the features.

Random Forest

# Random forest classifier on the training digits.
#
# Fits a forest sampling `numfeat` candidate features at each split, then
# prints the test-set confusion matrix and the accuracy on both the test
# and training data.
#
# Args:
#   numfeat: number of features tried at each split (randomForest's `mtry`).
#
# Depends on globals `digits_train` and `digits_test`.
Random_forest <- function(numfeat){
  # Fix: randomForest() has no `type` argument -- classification mode is
  # inferred from the factor response, so the misleading
  # `type = "classification"` (silently swallowed by `...`) was removed.
  RF = randomForest(digits_train[,1:(ncol(digits_train)-1)],digits_train$V65, mtry = numfeat )


  # Test data
  prediction_test = predict(RF,digits_test, type = "class")
  error_test <- confusionMatrix(prediction_test,digits_test$V65)

  # Training data (overfitting check; forests typically fit it perfectly).
  train_prediction<- predict(RF,digits_train,type = "class")
  error_train <- confusionMatrix(as.factor(train_prediction),digits_train$V65)

  print("Confusion matrix for test data ")
  print(error_test$table)
  print(paste("Accuracy for test data", error_test$overall[1]))

  print(paste("Accuracy for training data", error_train$overall[1]))


}
Random_forest(20)
## [1] "Confusion matrix for test data "
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 177   0   1   0   0   0   2   0   0   0
##          1   0 181   1   0   3   0   1   0   4   0
##          2   0   0 175   0   0   0   0   0   0   0
##          3   0   0   0 169   0   0   0   0   0   6
##          4   1   0   0   0 177   1   0   0   0   0
##          5   0   0   0   1   1 176   0   0   1   3
##          6   0   0   0   0   0   2 176   0   0   0
##          7   0   0   0   3   0   0   0 166   0   0
##          8   0   0   0   7   0   1   2   2 168   2
##          9   0   1   0   3   0   2   0  11   1 169
## [1] "Accuracy for test data 0.964941569282137"
## [1] "Accuracy for training data 1"
Random_forest(30)
## [1] "Confusion matrix for test data "
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 176   0   1   0   0   0   1   0   0   0
##          1   0 179   1   1   4   0   1   0   4   1
##          2   0   0 174   0   0   0   0   0   0   0
##          3   0   1   0 165   0   0   0   0   0   5
##          4   1   0   0   0 176   1   0   0   0   0
##          5   1   0   0   1   1 175   0   2   1   4
##          6   0   0   0   0   0   2 176   0   0   0
##          7   0   0   1   3   0   0   0 167   0   0
##          8   0   1   0  10   0   1   3   2 166   2
##          9   0   1   0   3   0   3   0   8   3 168
## [1] "Accuracy for test data 0.958263772954925"
## [1] "Accuracy for training data 1"
Random_forest(40)
## [1] "Confusion matrix for test data "
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 174   0   1   0   0   0   1   0   0   0
##          1   0 180   1   2   7   0   2   0   3   1
##          2   0   0 173   1   0   0   0   0   0   0
##          3   0   1   0 164   0   1   0   0   1   6
##          4   2   0   0   0 172   1   0   0   0   1
##          5   2   0   0   1   1 175   0   2   1   4
##          6   0   0   0   0   0   2 175   0   0   0
##          7   0   0   1   2   1   0   0 166   0   0
##          8   0   0   1   9   0   1   3   2 165   2
##          9   0   1   0   4   0   2   0   9   4 166
## [1] "Accuracy for test data 0.951585976627713"
## [1] "Accuracy for training data 1"

Random forest gave the highest training accuracy (1) among the learners tried so far, and its test-set prediction accuracy is also very good (about 0.96 with mtry = 20). Therefore, random forest is the best option so far for predicting the target value in this data set.

Stochastic Gradient Boosting

# Resampling scheme: 5-fold cross-validation repeated twice, parallel-enabled.
control <- trainControl(
  method = "repeatedcv", number = 5, repeats = 2, allowParallel = TRUE
)

# Hyper-parameter grid for gbm: number of trees, tree depth, learning rate,
# and a fixed minimum node size of 20.
tuning_grid <- expand.grid(
  n.trees = c(100, 150, 200),
  interaction.depth = 1:3,
  shrinkage = c(0.01, 0.05, 0.1),
  n.minobsinnode = 20
)

# Train gbm across the full grid; caret picks the best setting by accuracy.
stochastic <- train(
  V65 ~ ., data = digits_train, method = "gbm",
  trControl = control, tuneGrid = tuning_grid, verbose = FALSE
)
print(stochastic)
## Stochastic Gradient Boosting 
## 
## 3823 samples
##   62 predictor
##   10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 3058, 3057, 3060, 3057, 3060, 3058, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  Accuracy   Kappa    
##   0.01       1                  100      0.7751648  0.7501834
##   0.01       1                  150      0.8142751  0.7936370
##   0.01       1                  200      0.8448804  0.8276387
##   0.01       2                  100      0.8660671  0.8511840
##   0.01       2                  150      0.8873861  0.8748706
##   0.01       2                  200      0.9045211  0.8939086
##   0.01       3                  100      0.9025542  0.8917225
##   0.01       3                  150      0.9215196  0.9127958
##   0.01       3                  200      0.9360369  0.9289273
##   0.05       1                  100      0.9136742  0.9040785
##   0.05       1                  150      0.9356451  0.9284919
##   0.05       1                  200      0.9475453  0.9417150
##   0.05       2                  100      0.9518649  0.9465146
##   0.05       2                  150      0.9629817  0.9588671
##   0.05       2                  200      0.9683459  0.9648277
##   0.05       3                  100      0.9607576  0.9563957
##   0.05       3                  150      0.9676906  0.9640995
##   0.05       3                  200      0.9718781  0.9687524
##   0.10       1                  100      0.9457175  0.9396841
##   0.10       1                  150      0.9563121  0.9514563
##   0.10       1                  200      0.9611528  0.9568351
##   0.10       2                  100      0.9640312  0.9600335
##   0.10       2                  150      0.9705682  0.9672970
##   0.10       2                  200      0.9733157  0.9703500
##   0.10       3                  100      0.9718779  0.9687522
##   0.10       3                  150      0.9746242  0.9718038
##   0.10       3                  200      0.9759323  0.9732572
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 20
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
##  3, shrinkage = 0.1 and n.minobsinnode = 20.
# Evaluate the tuned gbm model on the held-out test digits.
prediction <- predict(stochastic,digits_test)
error_test <- confusionMatrix(prediction,digits_test$V65)

# Same evaluation on the training digits (overfitting check).
train_prediction<- predict(stochastic,digits_train)
error_train <- confusionMatrix(as.factor(train_prediction),digits_train$V65)



# Test-set confusion matrix: rows = predicted digit, columns = reference digit.
print(error_test$table)
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 176   0   0   0   0   0   2   0   0   0
##          1   0 178   1   1   3   0   2   0   4   0
##          2   0   0 174   1   0   0   0   0   0   0
##          3   0   1   1 171   0   0   0   0   0   1
##          4   1   1   0   0 175   1   0   0   0   0
##          5   1   0   0   1   0 177   0   3   1   3
##          6   0   0   0   0   0   2 175   0   0   0
##          7   0   0   1   3   1   0   0 164   0   0
##          8   0   0   0   2   1   0   2   2 160   2
##          9   0   2   0   4   1   2   0  10   9 174
print(paste("Accuracy for test data ", error_test$overall[1]))
## [1] "Accuracy for test data  0.95937673900946"
print(paste("Accuracy for training data", error_train$overall[1]))
## [1] "Accuracy for training data 1"

Stochastic gradient boosting is also a good option for modeling this data, like random forest. It gave high test and training accuracy with number of trees = 200, depth = 3 and shrinkage = 0.1. However, its test accuracy (0.959) is slightly lower than that of the best random forest (0.965). Neither underfitting nor overfitting was observed.