This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended; this latter class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom.
cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y
bruises?: bruises=t,no=f
odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s
gill-attachment: attached=a,descending=d,free=f,notched=n
gill-spacing: close=c,crowded=w,distant=d
gill-size: broad=b,narrow=n
gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y
stalk-shape: enlarging=e,tapering=t
stalk-root: bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?
stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
veil-type: partial=p,universal=u
veil-color: brown=n,orange=o,white=w,yellow=y
ring-number: none=n,one=o,two=t
ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z
spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y
population: abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y
habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d
This data set includes only categorical features. As part of the data manipulation, the data set is converted into dummy variables for the penalized (logistic) regression model. The original data set is used for the other learners.
# Load required packages.
# library() is used throughout (it errors on a missing package, whereas
# require() just returns FALSE and lets the script limp on); the original's
# duplicate data.table load is collapsed to a single call.
library("caret")
library("dplyr")
library("glmnet")
library(Metrics)
library(rpart)
library(rattle)
library(RColorBrewer)
library(data.table)
library(randomForest)
library(gbm)
library(xgboost)
library(e1071)
# NOTE(review): a hard-coded working directory makes the script non-portable;
# kept so the relative read.table() path below still resolves on the
# original machine.
setwd("C:/Users/asus/Desktop")
# The UCI agaricus-lepiota file has no header row, so header = FALSE
# (the read.table default) is correct here.
mush <- read.table("agaricus-lepiota.data.csv", sep = ",")
# All features are categorical: convert every column to a factor in one
# vectorized step (replaces the original 1:ncol(mush) index loop).
mush[] <- lapply(mush, as.factor)
# Drop column 17 (veil-type): it takes a single value in this data set, so it
# carries no information and would produce an all-zero dummy column.
mush <- mush[, -17]
# Encode the categorical predictors (columns 2..p; column 1 is the response)
# as dummy variables; fullRank = TRUE drops one level per factor to avoid
# perfect collinearity in the regression design matrix.
Dummies <- dummyVars("~.", data = mush[, 2:ncol(mush)], fullRank = TRUE)
mush_dummy <- data.frame(predict(Dummies, newdata = mush[, 2:ncol(mush)]))
# Re-attach the response (e = edible, p = poisonous) to the encoded frame.
mush_dummy$pois <- mush$V1
# Split the data ~75% training / ~25% test.
# set.seed() was missing in the original, so its printed results could never
# be reproduced exactly; fixing the seed makes the split deterministic.
set.seed(42)
idx <- sample(seq_len(2), size = nrow(mush_dummy), replace = TRUE,
              prob = c(0.75, 0.25))
train_mush <- mush[idx == 1, ]
test_mush <- mush[idx == 2, ]
train_mush_dummy <- mush_dummy[idx == 1, ]
test_mush_dummy <- mush_dummy[idx == 2, ]
# Apply penalized logistic regression.
# Resampling scheme: repeated 5-fold cross-validation (2 repeats), parallel
# where a backend is registered. TRUE is spelled out instead of T, since T
# is an ordinary variable that can be reassigned.
control <- trainControl(method = "repeatedcv", number = 5, repeats = 2,
                        allowParallel = TRUE)
# Penalized (regularized) logistic regression via glmnet.
#
# lambda: regularization strength at which predictions are extracted
#         (passed to predict() as `s`).
#
# Fits on the global dummy-encoded training data, then prints the test-set
# confusion matrix and the test/training accuracies at the given lambda.
# Invisibly returns both confusionMatrix objects for further inspection.
#
# Fix vs. original: `type.measure` (a cv.glmnet-only argument) and
# `trcontrol` (not a glmnet argument at all) were passed to glmnet() and
# silently ignored; both are removed here.
PRA <- function(lambda) {
  x_train <- as.matrix(train_mush_dummy[, 1:(ncol(train_mush_dummy) - 1)])
  x_test <- as.matrix(test_mush_dummy[, 1:(ncol(test_mush_dummy) - 1)])
  glmmod <- glmnet(x_train, train_mush_dummy$pois, family = "binomial")
  # Test-set performance at the requested lambda.
  prediction_test <- predict(glmmod, x_test, type = "class", s = lambda)
  test_error <- confusionMatrix(as.factor(prediction_test),
                                test_mush_dummy$pois)
  # Training-set performance, to gauge over-/under-fitting.
  train_prediction <- predict(glmmod, x_train, type = "class", s = lambda)
  error_train <- confusionMatrix(as.factor(train_prediction),
                                 train_mush_dummy$pois)
  print("Confusion matrix PLR performing on test data ")
  print(test_error$table)
  print(paste("Accuracy for test data", test_error$overall[1]))
  print(paste("Accuracy for training data", error_train$overall[1]))
  invisible(list(test = test_error, train = error_train))
}
# Compare three regularization strengths; smaller lambdas (weaker penalty)
# reach perfect test accuracy on this data set.
PRA(0.01)
## [1] "Confusion matrix PLR performing on test data "
## Reference
## Prediction e p
## e 1034 3
## p 0 966
## [1] "Accuracy for test data 0.998502246630055"
## [1] "Accuracy for training data 0.999183140009802"
PRA(0.001)
## [1] "Confusion matrix PLR performing on test data "
## Reference
## Prediction e p
## e 1034 0
## p 0 969
## [1] "Accuracy for test data 1"
## [1] "Accuracy for training data 0.999509884005881"
PRA(0.005)
## [1] "Confusion matrix PLR performing on test data "
## Reference
## Prediction e p
## e 1034 0
## p 0 969
## [1] "Accuracy for test data 1"
## [1] "Accuracy for training data 0.999346512007842"
# Classification Tree
# Fit a classification tree (rpart) on the global training set and print the
# test-set confusion matrix plus test/training accuracies.
#   cp:       complexity parameter controlling how aggressively to prune
#   minsplit: minimum number of observations required to attempt a split
Decision_tree <- function(cp, minsplit) {
  fitted_tree <- rpart(V1 ~ ., train_mush, method = "class",
                       cp = cp, minsplit = minsplit)
  # Held-out test performance.
  pred_test <- predict(fitted_tree, test_mush, type = "class")
  error_test <- confusionMatrix(pred_test, test_mush$V1)
  # Training performance, to gauge over-/under-fitting.
  pred_train <- predict(fitted_tree, train_mush, type = "class")
  error_train <- confusionMatrix(as.factor(pred_train), train_mush$V1)
  print("Confusion matrix for DT performing on test data ")
  print(error_test$table)
  print(paste("Accuracy for test data", error_test$overall[1]))
  print(paste("Accuracy for training data", error_train$overall[1]))
}
# Vary the pruning threshold (cp) and minimum split size; a smaller cp grows
# a deeper tree and lifts test accuracy slightly.
Decision_tree(cp = 0.01, minsplit =20)
## [1] "Confusion matrix for DT performing on test data "
## Reference
## Prediction e p
## e 1034 11
## p 0 958
## [1] "Accuracy for test data 0.994508237643535"
## [1] "Accuracy for training data 0.993955236072537"
Decision_tree(cp = 0.001, minsplit =20)
## [1] "Confusion matrix for DT performing on test data "
## Reference
## Prediction e p
## e 1034 3
## p 0 966
## [1] "Accuracy for test data 0.998502246630055"
## [1] "Accuracy for training data 0.999183140009802"
# Lowering minsplit alone makes no further difference here.
Decision_tree(cp = 0.001, minsplit = 10)
## [1] "Confusion matrix for DT performing on test data "
## Reference
## Prediction e p
## e 1034 3
## p 0 966
## [1] "Accuracy for test data 0.998502246630055"
## [1] "Accuracy for training data 0.999183140009802"
#Random forest
# Train a random forest on the global training set and print the fitted
# model, the test-set confusion matrix, and test/training accuracies.
#   numfeat: number of predictors sampled at each split (mtry)
#
# Fix vs. original: `type = "classification"` is not a randomForest()
# argument -- the task type is inferred from the factor response -- so the
# spurious argument is dropped. The redundant as.factor() around a factor
# prediction is also removed.
Random_forest <- function(numfeat) {
  RF <- randomForest(train_mush[, 2:ncol(train_mush)], train_mush$V1,
                     mtry = numfeat)
  print(RF)
  # Held-out test performance.
  prediction_test <- predict(RF, test_mush, type = "class")
  error_test <- confusionMatrix(prediction_test, test_mush$V1)
  # Training performance.
  train_prediction <- predict(RF, train_mush, type = "class")
  error_train <- confusionMatrix(train_prediction, train_mush$V1)
  print("Confusion matrix for RF performing on test data ")
  print(error_test$table)
  print(paste("Accuracy for test data", error_test$overall[1]))
  print(paste("Accuracy for training data", error_train$overall[1]))
}
# Vary mtry (features tried per split); the forest is perfect on this data
# regardless, so mtry cannot be compared meaningfully here.
Random_forest(5)
##
## Call:
## randomForest(x = train_mush[, 2:ncol(train_mush)], y = train_mush$V1, mtry = numfeat, type = "classification")
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## e p class.error
## e 3174 0 0
## p 0 2947 0
## [1] "Confusion matrix for RF performing on test data "
## Reference
## Prediction e p
## e 1034 0
## p 0 969
## [1] "Accuracy for test data 1"
## [1] "Accuracy for training data 1"
Random_forest(10)
##
## Call:
## randomForest(x = train_mush[, 2:ncol(train_mush)], y = train_mush$V1, mtry = numfeat, type = "classification")
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 10
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## e p class.error
## e 3174 0 0
## p 0 2947 0
## [1] "Confusion matrix for RF performing on test data "
## Reference
## Prediction e p
## e 1034 0
## p 0 969
## [1] "Accuracy for test data 1"
## [1] "Accuracy for training data 1"
Random_forest(15)
##
## Call:
## randomForest(x = train_mush[, 2:ncol(train_mush)], y = train_mush$V1, mtry = numfeat, type = "classification")
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 15
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## e p class.error
## e 3174 0 0
## p 0 2947 0
## [1] "Confusion matrix for RF performing on test data "
## Reference
## Prediction e p
## e 1034 0
## p 0 969
## [1] "Accuracy for test data 1"
## [1] "Accuracy for training data 1"
# Stochastic gradient boosting (gbm), tuned over a small grid via caret.
# TRUE is spelled out instead of T (T is a reassignable variable).
control <- trainControl(method = "repeatedcv", number = 5, repeats = 2,
                        allowParallel = TRUE)
# Grid: 27 combinations of tree count, depth, and learning rate; minimum
# node size held fixed at 20.
tunning <- expand.grid(n.trees = c(100, 150, 200),
                       interaction.depth = 1:3,
                       shrinkage = c(0.01, 0.05, 0.1),
                       n.minobsinnode = 20)
# gbm emits copious per-iteration progress; capture it so the report stays
# readable.
unwantedoutput <- capture.output(
  stochastic <- train(V1 ~ ., data = train_mush, trControl = control,
                      method = "gbm", tuneGrid = tunning)
)
# Cross-validated accuracy across the tuning grid; caret picks the
# best-performing combination as the final model.
print(stochastic)
## Stochastic Gradient Boosting
##
## 6121 samples
## 21 predictor
## 2 classes: 'e', 'p'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 4897, 4896, 4897, 4897, 4897, 4896, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees Accuracy Kappa
## 0.01 1 100 0.8866190 0.7743526
## 0.01 1 150 0.8866190 0.7743526
## 0.01 1 200 0.9536029 0.9072345
## 0.01 2 100 0.9606296 0.9212096
## 0.01 2 150 0.9589132 0.9177355
## 0.01 2 200 0.9691231 0.9381253
## 0.01 3 100 0.9691231 0.9381253
## 0.01 3 150 0.9761485 0.9521950
## 0.01 3 200 0.9834996 0.9669395
## 0.05 1 100 0.9738608 0.9476384
## 0.05 1 150 0.9843982 0.9687622
## 0.05 1 200 0.9888091 0.9775890
## 0.05 2 100 0.9921583 0.9842994
## 0.05 2 150 0.9975495 0.9950923
## 0.05 2 200 0.9983661 0.9967273
## 0.05 3 100 0.9973858 0.9947618
## 0.05 3 150 0.9991831 0.9983639
## 0.05 3 200 0.9991831 0.9983639
## 0.10 1 100 0.9897887 0.9795508
## 0.10 1 150 0.9952623 0.9905127
## 0.10 1 200 0.9965695 0.9931303
## 0.10 2 100 0.9986112 0.9972183
## 0.10 2 150 0.9991831 0.9983639
## 0.10 2 200 0.9992648 0.9985275
## 0.10 3 100 0.9991831 0.9983639
## 0.10 3 150 0.9999183 0.9998364
## 0.10 3 200 1.0000000 1.0000000
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 20
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
## 3, shrinkage = 0.1 and n.minobsinnode = 20.
# Evaluate the tuned boosting model on the held-out test data.
prediction <- predict(stochastic,test_mush)
error_test <- confusionMatrix(as.factor(prediction),test_mush$V1)
# And on the training data, to gauge over-/under-fitting.
train_prediction<- predict(stochastic,train_mush)
error_train <- confusionMatrix(as.factor(train_prediction),train_mush$V1)
# Training-set confusion matrix.
error_train$table
## Reference
## Prediction e p
## e 3174 0
## p 0 2947
# Overall training-set statistics (accuracy, kappa, confidence interval, ...).
error_train$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 1.0000000 1.0000000 0.9993975 1.0000000 0.5185427
## AccuracyPValue McnemarPValue
## 0.0000000 NaN
There is essentially no difference among the test accuracies obtained from the different learners: all of them achieved very high accuracy on both the test and training data. In addition, because every learner performs near-perfectly, it is not possible to compare the effect of a learner's hyperparameters. Therefore, this data set is not well suited for comparing the performance of different learners.