## Chapter 11: Tree-based Classification and Regression
## Note
## Sec 11.1: The Uses of Tree-based Methods
## ss 11.1.1: Problems for which tree-based regression may be used
## When are tree-based methods appropriate?
## Sec 11.2: Detecting Email Spam -- an Example
## Footnote Code
## Obtain 500-row sample; repeat the first plot (of crl.tot)
library(DAAG)
spam.sample <- spam7[sample(seq(1,4601), 500, replace=FALSE), ]
boxplot(split(spam.sample$crl.tot, spam.sample$yesno))

library(rpart)
spam.rpart <- rpart(formula = yesno ~ crl.tot + dollar + bang +
                      money + n000 + make, method="class", data=spam7)
plot(spam.rpart) # Draw tree
text(spam.rpart) # Add labeling
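## A minimal follow-up sketch (not part of the original code): resubstitution
## predictions from the fitted tree, cross-tabulated against the observed classes.
table(actual = spam7$yesno,
      predicted = predict(spam.rpart, type="class"))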

## ss 11.2.1: Choosing the number of splits
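## Hedged sketch (not from the text): the number of splits is governed mainly by
## the complexity parameter cp; the value 0.05 below is chosen for illustration only.
spam.coarse <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make,
                     method="class", data=spam7, cp=0.05)
max(spam.coarse$cptable[, "nsplit"])  # fewer splits than with the default cp=0.01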
## Sec 11.3: Terminology and Methodology
## Footnote Code
## Code to plot tree
Criterion <- factor(paste("Leaf", 1:5))
Node <- c(1,2,3,4,5)
demo.df <- data.frame(Criterion = Criterion, Node = Node)
demo.rpart <- rpart(Node ~ Criterion, data = demo.df,
                    control = list(minsplit = 2, minbucket = 1))
plot(demo.rpart, uniform=TRUE)
text(demo.rpart)

## ss 11.3.1: Choosing the split -- regression trees
## ss 11.3.2: Within and between sums of squares
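## A minimal sketch (illustrative data, not from the text) of the within/between
## sums of squares calculation for one candidate split: the reduction in the
## residual sum of squares equals the "between" component.
y <- c(18, 21, 22, 27, 30, 34)            # illustrative response values
g <- factor(c("L","L","L","R","R","R"))   # candidate split into two groups
ss.total   <- sum((y - mean(y))^2)
ss.within  <- sum(tapply(y, g, function(z) sum((z - mean(z))^2)))
ss.between <- ss.total - ss.within        # the gain from making this split
c(total=ss.total, within=ss.within, between=ss.between)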
## ss 11.3.3: Choosing the split -- classification trees
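## A hedged sketch (made-up class counts, for illustration only) of the Gini
## impurity criterion: a split is assessed by the decrease from the parent's
## impurity to the size-weighted impurity of the two children.
gini <- function(counts) { p <- counts/sum(counts); 1 - sum(p^2) }
left   <- c(yes=63, no=7)     # class counts in the left child (illustrative)
right  <- c(yes=3, no=27)     # class counts in the right child (illustrative)
parent <- left + right
wt <- c(sum(left), sum(right))/sum(parent)
c(before = gini(parent),
  after  = wt[1]*gini(left) + wt[2]*gini(right))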
## ss 11.3.4: Tree-based regression versus loess regression smoothing
## loess fit to Mileage vs Weight: data frame car.test.frame (rpart)
with(car.test.frame, scatter.smooth(Mileage ~ Weight))

par(xpd=TRUE)
car.tree <- rpart(Mileage ~ Weight, data=car.test.frame,
                  control = list(minsplit = 10, minbucket = 5,
                                 cp = 0.0001), method="anova")
plot(car.tree, uniform = TRUE)
text(car.tree, digits = 3, use.n = TRUE)

car.tree <- rpart(Mileage ~ Weight, data = car.test.frame)
plot(car.tree, uniform = FALSE)
text(car.tree, digits = 3, use.n = TRUE)
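## Hedged sketch (not part of the original code): overlay the tree's step-function
## predictions on the loess-smoothed scatterplot, for a direct visual comparison.
with(car.test.frame, scatter.smooth(Mileage ~ Weight))
wt <- with(car.test.frame, seq(min(Weight), max(Weight), length.out=200))
lines(wt, predict(car.tree, newdata=data.frame(Weight=wt)), col="red")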

## Sec 11.4: Predictive Accuracy, and the Cost-complexity Tradeoff
## ss 11.4.1: Cross-validation
## ss 11.4.2: The cost-complexity parameter
## ss 11.4.3: Prediction error versus tree size
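## Hedged sketch (not from the text): the cptable of a fitted rpart object records,
## for each candidate cp, the cross-validated relative error (xerror) and its
## standard error (xstd); in rpart's scaling the cost-complexity measure is
## roughly R_cp(T) = R(T) + cp * nsplit * R(root). One simple choice of tree size
## is the row with the smallest xerror.
cpt <- spam.rpart$cptable
cpt[which.min(cpt[, "xerror"]), ]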
## Sec 11.5: Data for female heart attack patients
summary(mifem) # data frame mifem (DAAG)
##   outcome        age           yronset      premi      smstat     diabetes
##  live:974   Min.   :35.0   Min.   :85.0    y :311    c     :390    y :248
##  dead:321   1st Qu.:57.0   1st Qu.:87.0    n :928    x     :280    n :978
##             Median :63.0   Median :89.0    nk: 56    n     :522    nk: 69
##             Mean   :60.9   Mean   :88.8              nk    :103
##             3rd Qu.:66.0   3rd Qu.:91.0
##             Max.   :69.0   Max.   :93.0
##   highbp     hichol     angina      stroke
##   y :813     y :452     y :472     y   : 153
##   n :406     n :655     n :724     n   :1063
##   nk: 76     nk:188     nk: 99     nk  :  79
mifem.rpart <- rpart(outcome ~ ., method="class",
                     data = mifem, cp = 0.0025)
plotcp(mifem.rpart) # Cross-validated error vs cp

printcp(mifem.rpart) # Tabular version of the same information
##
## Classification tree:
## rpart(formula = outcome ~ ., data = mifem, method = "class",
## cp = 0.0025)
##
## Variables actually used in tree construction:
## [1] age angina diabetes hichol premi smstat stroke yronset
##
## Root node error: 321/1295 = 0.25
##
## n= 1295
##
## CP nsplit rel error xerror xstd
## 1 0.2025 0 1.00 1.00 0.048
## 2 0.0056 1 0.80 0.80 0.045
## 3 0.0047 13 0.72 0.85 0.046
## 4 0.0031 17 0.70 0.84 0.046
## 5 0.0025 18 0.69 0.85 0.046
mifemb.rpart <- prune(mifem.rpart, cp=0.03)
plot(mifemb.rpart)
par(xpd=TRUE)   # May be needed so that the labels added below are not clipped
text(mifemb.rpart, use.n=TRUE, digits=3)

## ss 11.5.1: The one-standard-deviation rule
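## Hedged sketch (not from the text) of the one-standard-error rule applied to the
## cptable: take the smallest tree whose cross-validated error lies within one SE
## of the minimum, and prune back to its CP value. Here this selects the
## single-split tree, consistent with the prune at cp=0.03 above.
cpt <- mifem.rpart$cptable
cutoff <- min(cpt[, "xerror"]) + cpt[which.min(cpt[, "xerror"]), "xstd"]
cpOneSE <- cpt[which(cpt[, "xerror"] <= cutoff)[1], "CP"]
mifem1SE.rpart <- prune(mifem.rpart, cp=cpOneSE)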
## ss 11.5.2: Printed information on each split
print(mifemb.rpart)
## n= 1295
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1295 321 live (0.7521 0.2479)
## 2) angina=y,n 1196 239 live (0.8002 0.1998) *
## 3) angina=nk 99 17 dead (0.1717 0.8283) *
## Sec 11.6: Detecting Email Spam -- the Optimal Tree
spam7a.rpart <- rpart(formula = yesno ~ crl.tot + dollar +
                        bang + money + n000 + make,
                      method="class", data = spam7, cp = 0.001)
printcp(spam7a.rpart)
##
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 +
## make, data = spam7, method = "class", cp = 0.001)
##
## Variables actually used in tree construction:
## [1] bang crl.tot dollar money n000
##
## Root node error: 1813/4601 = 0.39
##
## n= 4601
##
## CP nsplit rel error xerror xstd
## 1 0.4766 0 1.00 1.00 0.018
## 2 0.0756 1 0.52 0.56 0.015
## 3 0.0116 3 0.37 0.39 0.013
## 4 0.0105 4 0.36 0.38 0.013
## 5 0.0063 5 0.35 0.37 0.013
## 6 0.0055 10 0.32 0.36 0.013
## 7 0.0044 11 0.31 0.35 0.013
## 8 0.0039 12 0.31 0.34 0.013
## 9 0.0028 16 0.29 0.34 0.013
## 10 0.0022 17 0.29 0.33 0.013
## 11 0.0019 18 0.29 0.33 0.013
## 12 0.0017 20 0.28 0.34 0.013
## 13 0.0010 25 0.27 0.33 0.013
## Footnote Code
## Use prune.rpart() with cp = 0.003 (0.00276 < 0.003 < 0.00386),
## to prune back to nsplit=16.
spam7b.rpart <- prune(spam7a.rpart, cp=0.003)
plot(spam7b.rpart, uniform=TRUE)
text(spam7b.rpart, cex=0.75)

par(xpd=FALSE)
## How does the one standard error rule affect accuracy estimates?
acctree.mat <- matrix(0, nrow=100, ncol=6)
for(i in 1:100) acctree.mat[i,] <- compareTreecalcs(data=spam7,
                                                    fun="rpart")
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp
## How is the standard error calculated?
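## A rough check (assumption: a simple binomial approximation, which need not be
## exactly what rpart computes internally): for the root node of the spam tree,
## with error rate p = 1813/4601, the SE of the relative error is approximately
## sqrt(p*(1-p)/n)/p, close to the xstd of 0.018 printed for the root above.
p <- 1813/4601; n <- 4601
sqrt(p*(1-p)/n)/p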
## When are tree-based methods appropriate?
## Sec 11.7: The {randomForest} Package
library(randomForest)
spam7.rf <- randomForest(yesno ~ ., data=spam7, importance=TRUE)
print(spam7.rf)
##
## Call:
## randomForest(formula = yesno ~ ., data = spam7, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 11.82%
## Confusion matrix:
## n y class.error
## n 2646 142 0.05093
## y 402 1411 0.22173
tuneRF(x=spam7[, -7], y=spam7$yesno, trace=FALSE)
## -0.1642 0.05
## -0.04104 0.05
##       mtry OOBError
## 1.OOB    1   0.1356
## 2.OOB    2   0.1165
## 4.OOB    4   0.1213

importance(spam7.rf)
##              n      y MeanDecreaseAccuracy MeanDecreaseGini
## crl.tot  54.02  55.94                73.76           244.04
## dollar   56.67  54.88                74.41           422.63
## bang    103.14 100.62               122.83           596.54
## money    33.15  52.19                52.63           207.90
## n000     60.16  14.60                62.98           121.01
## make     13.40  20.67                24.67            41.81
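## Hedged addition (not in the original code): randomForest's varImpPlot() gives
## a graphical display of the same importance measures.
varImpPlot(spam7.rf)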
## Comparison between {rpart()} and {randomForest()}
## Footnote Code
## Accuracy comparisons
acctree.mat <- matrix(0, nrow=100, ncol=8)
colnames(acctree.mat) <- c("rpSEcvI", "rpcvI", "rpSEtest", "rptest",
                           "n.SErule", "nre.min.12", "rfcvI", "rftest")
for(i in 1:100) acctree.mat[i,] <-
  compareTreecalcs(data=spam7, fun=c("rpart", "randomForest"))
acctree.df <- data.frame(acctree.mat)
lims <- range(acctree.mat[, c(4,7,8)], na.rm=TRUE)  # common axis limits for both panels
plot(rfcvI ~ rftest, data=acctree.df, xlim=lims, ylim=lims); abline(0,1)  # Panel A

plot(rptest ~ rftest, data=acctree.df, xlim=lims, ylim=lims); abline(0,1)  # Panel B

## Efficient computation
## Differences between {rpart()} and {randomForest()}
## Sec 11.8: Additional Notes on Tree-Based Methods
## The combining of tree-based methods with other approaches
## Models with a complex error structure
## Pruning as variable selection
## Other types of tree
## Summary of pluses and minuses of tree-based methods
## Sec 11.9: Further Reading and Extensions
library(MASS)
sapply(biopsy, function(x) sum(is.na(x)))
##    ID    V1    V2    V3    V4    V5    V6    V7    V8    V9 class
##     0     0     0     0     0     0    16     0     0     0     0
biops <- na.omit(biopsy[,-1]) ## Column 1 is ID
## Examine list element names in randomForest object
names(randomForest(class ~ ., data=biops))
## [1] "call" "type" "predicted"
## [4] "err.rate" "confusion" "votes"
## [7] "oob.times" "classes" "importance"
## [10] "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest"
## [16] "y" "test" "inbag"
## [19] "terms"
## Repeated runs, note variation in OOB accuracy.
for(i in 1:10) {
  biops.rf <- randomForest(class ~ ., data=biops)
  OOBerr <- mean(biops.rf$err.rate[,"OOB"])
  print(paste(i, ": ", round(OOBerr, 4), sep=""))
  print(round(biops.rf$confusion, 4))
}
## [1] "1: 0.0275"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 6 233 0.0251
## [1] "2: 0.0271"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 6 233 0.0251
## [1] "3: 0.0277"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 6 233 0.0251
## [1] "4: 0.0299"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 7 232 0.0293
## [1] "5: 0.0298"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 7 232 0.0293
## [1] "6: 0.0294"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 8 231 0.0335
## [1] "7: 0.0297"
## benign malignant class.error
## benign 432 12 0.0270
## malignant 7 232 0.0293
## [1] "8: 0.0283"
## benign malignant class.error
## benign 431 13 0.0293
## malignant 7 232 0.0293
## [1] "9: 0.0302"
## benign malignant class.error
## benign 431 13 0.0293
## malignant 7 232 0.0293
## [1] "10: 0.03"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 7 232 0.0293
## Repeated train/test splits: OOB accuracy vs test set accuracy.
for(i in 1:10){
  trRows <- sample(1:dim(biops)[1], size=round(dim(biops)[1]/2))
  biops.rf <- randomForest(class ~ ., data=biops[trRows, ],
                           xtest=biops[-trRows, -10],
                           ytest=biops[-trRows, 10])
  oobErr <- mean(biops.rf$err.rate[,"OOB"])
  testErr <- mean(biops.rf$test$err.rate[,"Test"])
  print(round(c(oobErr, testErr), 4))
}
## [1] 0.0318 0.0260
## [1] 0.0233 0.0493
## [1] 0.0283 0.0441
## [1] 0.0341 0.0301
## [1] 0.0471 0.0249
## [1] 0.0426 0.0331
## [1] 0.0267 0.0316
## [1] 0.0318 0.0307
## [1] 0.0418 0.0210
## [1] 0.0351 0.0276
randomForest(class ~ ., data=biops, xtest=biops[,-10],
             ytest=biops[,10])
##
## Call:
## randomForest(formula = class ~ ., data = biops, xtest = biops[, -10], ytest = biops[, 10])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 2.64%
## Confusion matrix:
## benign malignant class.error
## benign 432 12 0.02703
## malignant 6 233 0.02510
## Test set error rate: 0%
## Confusion matrix:
## benign malignant class.error
## benign 444 0 0
## malignant 0 239 0
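## Hedged sketch (not part of the original code): the 0% "test" error above arises
## because the test set is the training data, and predictions for supplied data use
## all trees. The OOB estimate avoids this, as predict() without newdata shows.
biops.rf <- randomForest(class ~ ., data=biops)
resub <- predict(biops.rf, newdata=biops)  # every row predicted by all trees
mean(resub != biops$class)                 # close to 0
oob <- predict(biops.rf)                   # out-of-bag predictions
mean(oob != biops$class)                   # close to the OOB error rate above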
## Run model on total data
randomForest(as.factor(type) ~ ., data=Pima.tr)
##
## Call:
## randomForest(formula = as.factor(type) ~ ., data = Pima.tr)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 28.5%
## Confusion matrix:
## No Yes class.error
## No 110 22 0.1667
## Yes 35 33 0.5147
rowsamp <- sample(dim(Pima.tr)[1], replace=TRUE)
randomForest(as.factor(type) ~ ., data=Pima.tr[rowsamp, ])
##
## Call:
## randomForest(formula = as.factor(type) ~ ., data = Pima.tr[rowsamp, ])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 12%
## Confusion matrix:
## No Yes class.error
## No 114 11 0.0880
## Yes 13 62 0.1733
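## Hedged note (not from the text): the apparent drop in the OOB error rate arises
## because sampling rows with replacement duplicates observations, so "out-of-bag"
## cases often have exact copies in the bag; roughly 1 - 1/e of the sampled rows
## are duplicates.
mean(duplicated(rowsamp))   # typically about 0.37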