# Chapter 8 Lab: Decision Trees

#################################
# Fitting Classification Trees
#################################

library(tree)
library(ISLR)
attach(Carseats)
head(Carseats)
# Code Sales as a binary factor so that tree() fits a classification tree
High=as.factor(ifelse(Sales<=8,"No","Yes"))
Carseats=data.frame(Carseats,High)

# Build a basic tree
tree.carseats=tree(High~.-Sales,Carseats)
summary(tree.carseats)
plot(tree.carseats)
text(tree.carseats,pretty=0)
tree.carseats # print the tree structure

# Evaluate on the validation set
set.seed(2)
train=sample(1:nrow(Carseats),200)
Carseats.test=Carseats[-train,]
High.test=High[-train]
tree.carseats=tree(High~.-Sales,Carseats,subset=train)
tree.pred=predict(tree.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
(86+57)/200 # test set accuracy

# Will pruning improve the results?
# Search for the tree size with optimal cross-validated performance
set.seed(3)
cv.carseats=cv.tree(tree.carseats,FUN=prune.misclass)
names(cv.carseats)
cv.carseats

# Plot misclassification error vs. size and vs. complexity parameter k
par(mfrow=c(1,2))
plot(cv.carseats$size,cv.carseats$dev,type="b")
plot(cv.carseats$k,cv.carseats$dev,type="b")

# Prune the tree to 9 leaves
prune.carseats=prune.misclass(tree.carseats,best=9)
plot(prune.carseats)
text(prune.carseats,pretty=0)
tree.pred=predict(prune.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
(94+60)/200 # more interpretable results and better accuracy

# Increasing the size of the tree reduces the accuracy
prune.carseats=prune.misclass(tree.carseats,best=15)
plot(prune.carseats)
text(prune.carseats,pretty=0)
tree.pred=predict(prune.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
(86+62)/200

#################################
# Fitting Regression Trees
#################################

# Boston housing dataset
library(MASS)
set.seed(1)
train=sample(1:nrow(Boston),nrow(Boston)/2)
tree.boston=tree(medv~.,Boston,subset=train)
summary(tree.boston)
plot(tree.boston)
text(tree.boston,pretty=0)
cv.boston=cv.tree(tree.boston)
plot(cv.boston$size,cv.boston$dev,type="b")

# Prune and plot the tree of optimal size
prune.boston=prune.tree(tree.boston,best=5)
plot(prune.boston)
text(prune.boston,pretty=0)

# Predict with the unpruned tree on the validation set
yhat=predict(tree.boston,newdata=Boston[-train,])
boston.test=Boston[-train,"medv"]
plot(yhat,boston.test)
abline(0,1)
mean((yhat-boston.test)^2) # test MSE
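
# A small addition (not part of the original lab): as a sketch, the pruned
# five-node tree fit above can be scored on the same held-out observations,
# so its test MSE can be compared with that of the unpruned tree.
yhat.prune=predict(prune.boston,newdata=Boston[-train,])
mean((yhat.prune-boston.test)^2)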
#################################
# Bagging and Random Forests
#################################

# Bagging (i.e., a random forest with m = p; here m = 13)
library(randomForest)
set.seed(1)
bag.boston=randomForest(medv~.,data=Boston,subset=train,mtry=13,importance=TRUE)
bag.boston

# Prediction of the bagged model on the test set
yhat.bag=predict(bag.boston,newdata=Boston[-train,])
plot(yhat.bag,boston.test)
abline(0,1)
mean((yhat.bag-boston.test)^2) # smaller than the test MSE of the single tree

# Change the number of trees
bag.boston=randomForest(medv~.,data=Boston,subset=train,mtry=13,ntree=25)
yhat.bag=predict(bag.boston,newdata=Boston[-train,])
mean((yhat.bag-boston.test)^2)

# Random forest: use a smaller value of mtry
set.seed(1)
rf.boston=randomForest(medv~.,data=Boston,subset=train,mtry=6,importance=TRUE)
yhat.rf=predict(rf.boston,newdata=Boston[-train,])
mean((yhat.rf-boston.test)^2) # improvement over bagging

# Two importance measures: mean decrease in prediction accuracy (%IncMSE)
# and total decrease in node impurity over all trees (IncNodePurity)
importance(rf.boston)
varImpPlot(rf.boston)

#################################
# Boosting
#################################

library(gbm)
set.seed(1)
boost.boston=gbm(medv~.,data=Boston[train,],distribution="gaussian",n.trees=5000,interaction.depth=4)
summary(boost.boston)

# Partial dependence plots:
# marginal effect of the selected variable, averaged over the other variables
par(mfrow=c(1,2))
plot(boost.boston,i="rm")
plot(boost.boston,i="lstat")

# Predict on the test set
yhat.boost=predict(boost.boston,newdata=Boston[-train,],n.trees=5000)
mean((yhat.boost-boston.test)^2) # even better performance

# Change the shrinkage parameter lambda
boost.boston=gbm(medv~.,data=Boston[train,],distribution="gaussian",n.trees=5000,interaction.depth=4,shrinkage=0.2,verbose=F)
yhat.boost=predict(boost.boston,newdata=Boston[-train,],n.trees=5000)
mean((yhat.boost-boston.test)^2)
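
# A possible extension (not in the original lab): rather than fixing n.trees
# at 5000, gbm can estimate the optimal number of boosting iterations by
# cross-validation. A minimal sketch, assuming 5-fold CV is acceptable here:
set.seed(1)
boost.cv=gbm(medv~.,data=Boston[train,],distribution="gaussian",
             n.trees=5000,interaction.depth=4,shrinkage=0.2,cv.folds=5,verbose=F)
best.iter=gbm.perf(boost.cv,method="cv") # CV estimate of the best iteration
yhat.cv=predict(boost.cv,newdata=Boston[-train,],n.trees=best.iter)
mean((yhat.cv-boston.test)^2)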