Mushroom Data Set

This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended; this latter class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom.

Features Information

1. cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

  1. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

  2. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y

  3. bruises?: bruises=t,no=f

  4. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s

  5. gill-attachment: attached=a,descending=d,free=f,notched=n

  6. gill-spacing: close=c,crowded=w,distant=d

  7. gill-size: broad=b,narrow=n

  8. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y

  9. stalk-shape: enlarging=e,tapering=t

  10. stalk-root: bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?

  11. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

  12. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

  13. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y

  14. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y

  15. veil-type: partial=p,universal=u

  16. veil-color: brown=n,orange=o,white=w,yellow=y

  17. ring-number: none=n,one=o,two=t

  18. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z

  19. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y

  20. population: abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y

  21. habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d

This data set contains only categorical features. As part of the data manipulation, the data set is converted into dummy variables for the penalized (logistic) regression model; the original factor-encoded data set is used by the other learners.

# Load all dependencies up front with library(), which errors loudly if a
# package is missing; require() only returns FALSE and lets the script
# continue into confusing downstream errors.
library(caret)         # dummyVars, trainControl, train, confusionMatrix
library(dplyr)
library(glmnet)        # penalized (logistic) regression
library(Metrics)
library(rpart)         # classification trees
library(rattle)
library(RColorBrewer)
library(data.table)    # previously loaded twice (require() then library())
library(randomForest)
library(gbm)
library(xgboost)
library(e1071)

Data Manipulation Part

# Data preparation ----
# NOTE(review): setwd() ties the script to a single machine; prefer running
# from the data directory or making the path configurable.
setwd("C:/Users/asus/Desktop")

# The file has no header row, so read.table names the columns V1..V23;
# V1 is the class label (e = edible, p = poisonous).
mush <- read.table("agaricus-lepiota.data.csv", sep = ",")

# Every column is categorical: convert all of them to factors at once.
mush[] <- lapply(mush, as.factor)

# Drop column 17 — presumably a constant column (veil-type in this data
# set has a single level), which carries no information and would break
# full-rank dummy coding; TODO confirm against the raw file.
mush <- mush[, -17]

# convert categorical variables to dummy (one-hot, full-rank) variables
Dummies <- dummyVars("~.", data = mush[, 2:ncol(mush)], fullRank = TRUE)
mush_dummy <- data.frame(predict(Dummies, newdata = mush[, 2:ncol(mush)]))

# Re-attach the class label to the dummy-coded frame.
mush_dummy$pois <- mush$V1

# split whole data as training and test (~75% / 25%)
# Seed the RNG so the split — and every accuracy reported below — is
# reproducible from run to run.
set.seed(42)
idx <- sample(seq_len(2), size = nrow(mush_dummy), replace = TRUE,
              prob = c(.75, .25))

train_mush <- mush[idx == 1, ]
test_mush <- mush[idx == 2, ]

train_mush_dummy <- mush_dummy[idx == 1, ]
test_mush_dummy <- mush_dummy[idx == 2, ]

Penalized Linear Regression

# apply penalized linear regression


# for train set
# NOTE(review): this caret resampling spec is only ever passed to glmnet()
# below as a bogus `trcontrol` argument, which glmnet does not accept
# (trControl belongs to caret::train) — so it is effectively unused here.
control <- trainControl(method = "repeatedcv",number = 5, repeats = 2, allowParallel = T)


# Penalized logistic regression (glmnet, family = "binomial") on the
# dummy-coded mushroom data, evaluated at a single penalty value.
#
# lambda: penalty strength passed to predict() via `s` (the full
#         regularization path is fit once; predictions are extracted at s).
# Reads the globals train_mush_dummy / test_mush_dummy; called for its
# printed confusion matrix and accuracies.
PRA <- function(lambda) {

  # Fit the whole lambda path. The original call also passed
  # type.measure and trcontrol, which are cv.glmnet / caret::train
  # arguments — glmnet() silently ignored them, so they are dropped here.
  x_train <- as.matrix(train_mush_dummy[, 1:(ncol(train_mush_dummy) - 1)])
  glmmod <- glmnet(x_train, train_mush_dummy$pois, family = "binomial")

  # for test set — predict() with type = "class" returns character labels,
  # so as.factor() is required before confusionMatrix().
  x_test <- as.matrix(test_mush_dummy[, 1:(ncol(test_mush_dummy) - 1)])
  prediction_test <- predict(glmmod, x_test, type = "class", s = lambda)
  test_error <- confusionMatrix(as.factor(prediction_test), test_mush_dummy$pois)

  train_prediction <- predict(glmmod, x_train, type = "class", s = lambda)
  error_train <- confusionMatrix(as.factor(train_prediction), train_mush_dummy$pois)

  print("Confusion matrix PLR performing on test data ")
  print(test_error$table)
  print(paste("Accuracy for test data", test_error$overall[1]))

  print(paste("Accuracy for training data", error_train$overall[1]))

}
# Evaluate the penalized logistic regression at lambda = 0.01.
PRA(0.01)
## [1] "Confusion matrix PLR performing on test data "
##           Reference
## Prediction    e    p
##          e 1034    3
##          p    0  966
## [1] "Accuracy for test data 0.998502246630055"
## [1] "Accuracy for training data 0.999183140009802"
# Weaker penalty: lambda = 0.001.
PRA(0.001)
## [1] "Confusion matrix PLR performing on test data "
##           Reference
## Prediction    e    p
##          e 1034    0
##          p    0  969
## [1] "Accuracy for test data 1"
## [1] "Accuracy for training data 0.999509884005881"
# Intermediate penalty: lambda = 0.005.
PRA(0.005)
## [1] "Confusion matrix PLR performing on test data "
##           Reference
## Prediction    e    p
##          e 1034    0
##          p    0  969
## [1] "Accuracy for test data 1"
## [1] "Accuracy for training data 0.999346512007842"

Decision Tree

# Classification Tree


# Fit an rpart classification tree on the factor-encoded training data
# and report confusion matrices / accuracy on both test and training sets.
#
# cp:       complexity parameter for rpart (pruning strength)
# minsplit: minimum observations in a node before a split is attempted
# Reads the globals train_mush / test_mush; called for its printed output.
Decision_tree <- function(cp, minsplit) {

  class_tree <- rpart(V1 ~ ., train_mush, method = "class",
                      cp = cp, minsplit = minsplit)

  # predict.rpart with type = "class" already returns a factor, so the
  # as.factor() wrapper the original applied (only on the training side)
  # was redundant and is dropped for consistency.
  test_prediction <- predict(class_tree, test_mush, type = "class")
  error_test <- confusionMatrix(test_prediction, test_mush$V1)

  train_prediction <- predict(class_tree, train_mush, type = "class")
  error_train <- confusionMatrix(train_prediction, train_mush$V1)

  print("Confusion matrix for DT performing on test data ")
  print(error_test$table)
  print(paste("Accuracy for test data", error_test$overall[1]))

  print(paste("Accuracy for training data", error_train$overall[1]))

}
# Default-ish rpart settings.
Decision_tree(cp = 0.01, minsplit =20)
## [1] "Confusion matrix for DT performing on test data "
##           Reference
## Prediction    e    p
##          e 1034   11
##          p    0  958
## [1] "Accuracy for test data 0.994508237643535"
## [1] "Accuracy for training data 0.993955236072537"
# Smaller cp: allow a deeper, less-pruned tree.
Decision_tree(cp = 0.001, minsplit =20)
## [1] "Confusion matrix for DT performing on test data "
##           Reference
## Prediction    e    p
##          e 1034    3
##          p    0  966
## [1] "Accuracy for test data 0.998502246630055"
## [1] "Accuracy for training data 0.999183140009802"
# Smaller cp and smaller minsplit: the most flexible tree tried.
Decision_tree(cp = 0.001, minsplit = 10)
## [1] "Confusion matrix for DT performing on test data "
##           Reference
## Prediction    e    p
##          e 1034    3
##          p    0  966
## [1] "Accuracy for test data 0.998502246630055"
## [1] "Accuracy for training data 0.999183140009802"

Random Forest

#Random forest

# Fit a random forest on the factor-encoded training data and report
# confusion matrices / accuracy on both test and training sets.
#
# numfeat: mtry — number of candidate features tried at each split.
# Reads the globals train_mush / test_mush; called for its printed output.
# Since the response is a factor, randomForest() fits a classification
# forest automatically; the original `type = "classification"` argument is
# not a randomForest() formal and was silently absorbed by `...`, so it is
# dropped here.
Random_forest <- function(numfeat) {
  RF <- randomForest(train_mush[, 2:ncol(train_mush)], train_mush$V1,
                     mtry = numfeat)
  print(RF)

  # Test data — "response" is the documented predict.randomForest type for
  # predicted class labels ("class" is not among its documented values).
  prediction_test <- predict(RF, test_mush, type = "response")
  error_test <- confusionMatrix(prediction_test, test_mush$V1)

  train_prediction <- predict(RF, train_mush, type = "response")
  error_train <- confusionMatrix(train_prediction, train_mush$V1)

  print("Confusion matrix for RF performing on test data ")
  print(error_test$table)
  print(paste("Accuracy for test data", error_test$overall[1]))

  print(paste("Accuracy for training data", error_train$overall[1]))

}
# mtry = 5 (roughly sqrt of the 21 remaining features).
Random_forest(5)
## 
## Call:
##  randomForest(x = train_mush[, 2:ncol(train_mush)], y = train_mush$V1,      mtry = numfeat, type = "classification") 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##      e    p class.error
## e 3174    0           0
## p    0 2947           0
## [1] "Confusion matrix for RF performing on test data "
##           Reference
## Prediction    e    p
##          e 1034    0
##          p    0  969
## [1] "Accuracy for test data 1"
## [1] "Accuracy for training data 1"
# mtry = 10.
Random_forest(10)
## 
## Call:
##  randomForest(x = train_mush[, 2:ncol(train_mush)], y = train_mush$V1,      mtry = numfeat, type = "classification") 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 10
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##      e    p class.error
## e 3174    0           0
## p    0 2947           0
## [1] "Confusion matrix for RF performing on test data "
##           Reference
## Prediction    e    p
##          e 1034    0
##          p    0  969
## [1] "Accuracy for test data 1"
## [1] "Accuracy for training data 1"
# mtry = 15.
Random_forest(15)
## 
## Call:
##  randomForest(x = train_mush[, 2:ncol(train_mush)], y = train_mush$V1,      mtry = numfeat, type = "classification") 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 15
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##      e    p class.error
## e 3174    0           0
## p    0 2947           0
## [1] "Confusion matrix for RF performing on test data "
##           Reference
## Prediction    e    p
##          e 1034    0
##          p    0  969
## [1] "Accuracy for test data 1"
## [1] "Accuracy for training data 1"

Stochastic Gradient Boosting

# stochastic gradient boosting

# Tune a stochastic GBM over n.trees / depth / shrinkage with 5-fold CV
# repeated twice; n.minobsinnode is held constant at 20.
control <- trainControl(method = "repeatedcv", number = 5, repeats = 2,
                        allowParallel = TRUE)
tuning <- expand.grid(n.trees = c(100, 150, 200),
                      interaction.depth = 1:3,
                      shrinkage = c(0.01, 0.05, 0.1),
                      n.minobsinnode = 20)

# gbm prints verbose per-iteration progress; capture.output() is used purely
# to silence it (the captured text itself is discarded).
invisible(capture.output(
  stochastic <- train(V1 ~ ., data = train_mush, trControl = control,
                      method = "gbm", tuneGrid = tuning)
))
print(stochastic)
## Stochastic Gradient Boosting 
## 
## 6121 samples
##   21 predictor
##    2 classes: 'e', 'p' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 4897, 4896, 4897, 4897, 4897, 4896, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  Accuracy   Kappa    
##   0.01       1                  100      0.8866190  0.7743526
##   0.01       1                  150      0.8866190  0.7743526
##   0.01       1                  200      0.9536029  0.9072345
##   0.01       2                  100      0.9606296  0.9212096
##   0.01       2                  150      0.9589132  0.9177355
##   0.01       2                  200      0.9691231  0.9381253
##   0.01       3                  100      0.9691231  0.9381253
##   0.01       3                  150      0.9761485  0.9521950
##   0.01       3                  200      0.9834996  0.9669395
##   0.05       1                  100      0.9738608  0.9476384
##   0.05       1                  150      0.9843982  0.9687622
##   0.05       1                  200      0.9888091  0.9775890
##   0.05       2                  100      0.9921583  0.9842994
##   0.05       2                  150      0.9975495  0.9950923
##   0.05       2                  200      0.9983661  0.9967273
##   0.05       3                  100      0.9973858  0.9947618
##   0.05       3                  150      0.9991831  0.9983639
##   0.05       3                  200      0.9991831  0.9983639
##   0.10       1                  100      0.9897887  0.9795508
##   0.10       1                  150      0.9952623  0.9905127
##   0.10       1                  200      0.9965695  0.9931303
##   0.10       2                  100      0.9986112  0.9972183
##   0.10       2                  150      0.9991831  0.9983639
##   0.10       2                  200      0.9992648  0.9985275
##   0.10       3                  100      0.9991831  0.9983639
##   0.10       3                  150      0.9999183  0.9998364
##   0.10       3                  200      1.0000000  1.0000000
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 20
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
##  3, shrinkage = 0.1 and n.minobsinnode = 20.
# Evaluate the tuned GBM on held-out and training data.
prediction <- predict(stochastic, test_mush)
error_test <- confusionMatrix(as.factor(prediction), test_mush$V1)

train_prediction <- predict(stochastic, train_mush)
error_train <- confusionMatrix(as.factor(train_prediction), train_mush$V1)

# Test-set results — computed in the original but never displayed, even
# though the conclusion discusses test accuracy for every learner.
error_test$table
error_test$overall[1]

error_train$table
##           Reference
## Prediction    e    p
##          e 3174    0
##          p    0 2947
error_train$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##      1.0000000      1.0000000      0.9993975      1.0000000      0.5185427 
## AccuracyPValue  McnemarPValue 
##      0.0000000            NaN

There is almost no difference among the test accuracies obtained from the different learners: all of them achieved near-perfect accuracy on both the test and training sets. As a consequence, it is also impossible to see any effect of each learner's hyperparameters. Therefore, this data set is not a feasible benchmark for comparing learners' performance.