## Chapter 11: Tree-based Classification and Regression
## Note
## Sec 11.1: The Uses of Tree-based Methods
## ss 11.1.1: Problems for which tree-based regression may be used
## When are tree-based methods appropriate?
## Sec 11.2: Detecting Email Spam -- an Example
## Footnote Code
## Obtain 500-row sample; repeat the first plot (of crl.tot)
library(DAAG)
spam.sample <- spam7[sample(seq(1,4601), 500, replace=FALSE), ]
boxplot(split(spam.sample$crl.tot, spam.sample$yesno))

library(rpart)
spam.rpart <- rpart(formula = yesno ~ crl.tot + dollar + bang +
                      money + n000 + make, method="class", data=spam7)
plot(spam.rpart) # Draw tree
text(spam.rpart) # Add labeling
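## A minimal follow-up sketch (not part of the original code): resubstitution
## predictions from the fitted tree, cross-tabulated against the observed classes.
table(actual = spam7$yesno,
      predicted = predict(spam.rpart, type="class"))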

## ss 11.2.1: Choosing the number of splits
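## Hedged sketch (not from the text): the number of splits is governed mainly by
## the complexity parameter cp; the value 0.05 below is chosen for illustration only.
spam.coarse <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make,
                     method="class", data=spam7, cp=0.05)
max(spam.coarse$cptable[, "nsplit"])  # fewer splits than with the default cp=0.01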
## Sec 11.3: Terminology and Methodology
## Footnote Code
## Code to plot tree
Criterion <- factor(paste("Leaf", 1:5))
Node <- c(1,2,3,4,5)
demo.df <- data.frame(Criterion = Criterion, Node = Node)
demo.rpart <- rpart(Node ~ Criterion, data = demo.df,
                    control = list(minsplit = 2, minbucket = 1))
plot(demo.rpart, uniform=TRUE)
text(demo.rpart)

## ss 11.3.1: Choosing the split -- regression trees
## ss 11.3.2: Within and between sums of squares
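## A minimal sketch (illustrative data, not from the text) of the within/between
## sums of squares calculation for one candidate split: the reduction in the
## residual sum of squares equals the "between" component.
y <- c(18, 21, 22, 27, 30, 34)            # illustrative response values
g <- factor(c("L","L","L","R","R","R"))   # candidate split into two groups
ss.total   <- sum((y - mean(y))^2)
ss.within  <- sum(tapply(y, g, function(z) sum((z - mean(z))^2)))
ss.between <- ss.total - ss.within        # the gain from making this split
c(total=ss.total, within=ss.within, between=ss.between)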
## ss 11.3.3: Choosing the split -- classification trees
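## A hedged sketch (made-up class counts, for illustration only) of the Gini
## impurity criterion: a split is assessed by the decrease from the parent's
## impurity to the size-weighted impurity of the two children.
gini <- function(counts) { p <- counts/sum(counts); 1 - sum(p^2) }
left   <- c(yes=63, no=7)     # class counts in the left child (illustrative)
right  <- c(yes=3, no=27)     # class counts in the right child (illustrative)
parent <- left + right
wt <- c(sum(left), sum(right))/sum(parent)
c(before = gini(parent),
  after  = wt[1]*gini(left) + wt[2]*gini(right))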
## ss 11.3.4: Tree-based regression versus loess regression smoothing
## loess fit to Mileage vs Weight: data frame car.test.frame (rpart)
with(car.test.frame, scatter.smooth(Mileage ~ Weight))

par(xpd=TRUE)
car.tree <- rpart(Mileage ~ Weight, data=car.test.frame,
                  control = list(minsplit = 10, minbucket = 5,
                                 cp = 0.0001), method="anova")
plot(car.tree, uniform = TRUE)
text(car.tree, digits = 3, use.n = TRUE)

car.tree <- rpart(Mileage ~ Weight, data = car.test.frame)
plot(car.tree, uniform = FALSE)
text(car.tree, digits = 3, use.n = TRUE)
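## Hedged sketch (not part of the original code): overlay the tree's step-function
## predictions on the loess-smoothed scatterplot, for a direct visual comparison.
with(car.test.frame, scatter.smooth(Mileage ~ Weight))
wt <- with(car.test.frame, seq(min(Weight), max(Weight), length.out=200))
lines(wt, predict(car.tree, newdata=data.frame(Weight=wt)), col="red")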

## Sec 11.4: Predictive Accuracy, and the Cost-complexity Tradeoff
## ss 11.4.1: Cross-validation
## ss 11.4.2: The cost-complexity parameter
## ss 11.4.3: Prediction error versus tree size
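## Hedged sketch (not from the text): the cptable of a fitted rpart object records,
## for each candidate cp, the cross-validated relative error (xerror) and its
## standard error (xstd); in rpart's scaling the cost-complexity measure is
## roughly R_cp(T) = R(T) + cp * nsplit * R(root). One simple choice of tree size
## is the row with the smallest xerror.
cpt <- spam.rpart$cptable
cpt[which.min(cpt[, "xerror"]), ]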
## Sec 11.5: Data for female heart attack patients
summary(mifem) # data frame mifem (DAAG)
##   outcome        age           yronset      premi      smstat     diabetes
##  live:974   Min.   :35.0   Min.   :85.0    y :311    c     :390    y :248
##  dead:321   1st Qu.:57.0   1st Qu.:87.0    n :928    x     :280    n :978
##             Median :63.0   Median :89.0    nk: 56    n     :522    nk: 69
##             Mean   :60.9   Mean   :88.8              nk    :103
##             3rd Qu.:66.0   3rd Qu.:91.0
##             Max.   :69.0   Max.   :93.0
##   highbp     hichol     angina      stroke
##   y :813     y :452     y :472     y   : 153
##   n :406     n :655     n :724     n   :1063
##   nk: 76     nk:188     nk: 99     nk  :  79
mifem.rpart <- rpart(outcome ~ ., method="class",
                     data = mifem, cp = 0.0025)
plotcp(mifem.rpart) # Cross-validated error vs cp

printcp(mifem.rpart) # Tabular version of the same information
##
## Classification tree:
## rpart(formula = outcome ~ ., data = mifem, method = "class",
## cp = 0.0025)
##
## Variables actually used in tree construction:
## [1] age angina diabetes hichol premi smstat stroke yronset
##
## Root node error: 321/1295 = 0.25
##
## n= 1295
##
## CP nsplit rel error xerror xstd
## 1 0.2025 0 1.00 1.00 0.048
## 2 0.0056 1 0.80 0.80 0.045
## 3 0.0047 13 0.72 0.85 0.046
## 4 0.0031 17 0.70 0.84 0.046
## 5 0.0025 18 0.69 0.85 0.046
mifemb.rpart <- prune(mifem.rpart, cp=0.03)
plot(mifemb.rpart)
par(xpd=TRUE)   # May be needed so that the labels added below are not clipped
text(mifemb.rpart, use.n=TRUE, digits=3)

## ss 11.5.1: The one-standard-deviation rule
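## Hedged sketch (not from the text) of the one-standard-error rule applied to the
## cptable: take the smallest tree whose cross-validated error lies within one SE
## of the minimum, and prune back to its CP value. Here this selects the
## single-split tree, consistent with the prune at cp=0.03 above.
cpt <- mifem.rpart$cptable
cutoff <- min(cpt[, "xerror"]) + cpt[which.min(cpt[, "xerror"]), "xstd"]
cpOneSE <- cpt[which(cpt[, "xerror"] <= cutoff)[1], "CP"]
mifem1SE.rpart <- prune(mifem.rpart, cp=cpOneSE)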
## ss 11.5.2: Printed information on each split
print(mifemb.rpart)
## n= 1295
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1295 321 live (0.7521 0.2479)
## 2) angina=y,n 1196 239 live (0.8002 0.1998) *
## 3) angina=nk 99 17 dead (0.1717 0.8283) *
## Sec 11.6: Detecting Email Spam -- the Optimal Tree
spam7a.rpart <- rpart(formula = yesno ~ crl.tot + dollar +
                        bang + money + n000 + make,
                      method="class", data = spam7, cp = 0.001)
printcp(spam7a.rpart)
##
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 +
## make, data = spam7, method = "class", cp = 0.001)
##
## Variables actually used in tree construction:
## [1] bang crl.tot dollar money n000
##
## Root node error: 1813/4601 = 0.39
##
## n= 4601
##
## CP nsplit rel error xerror xstd
## 1 0.4766 0 1.00 1.00 0.018
## 2 0.0756 1 0.52 0.56 0.015
## 3 0.0116 3 0.37 0.39 0.013
## 4 0.0105 4 0.36 0.38 0.013
## 5 0.0063 5 0.35 0.37 0.013
## 6 0.0055 10 0.32 0.36 0.013
## 7 0.0044 11 0.31 0.35 0.013
## 8 0.0039 12 0.31 0.34 0.013
## 9 0.0028 16 0.29 0.34 0.013
## 10 0.0022 17 0.29 0.33 0.013
## 11 0.0019 18 0.29 0.33 0.013
## 12 0.0017 20 0.28 0.34 0.013
## 13 0.0010 25 0.27 0.33 0.013
## Footnote Code
## Use prune.rpart() with cp = 0.003 (0.00276 < 0.003 < 0.00386),
## to prune back to nsplit=16.
spam7b.rpart <- prune(spam7a.rpart, cp=0.003)
plot(spam7b.rpart, uniform=TRUE)
text(spam7b.rpart, cex=0.75)

par(xpd=FALSE)
## How does the one standard error rule affect accuracy estimates?
acctree.mat <- matrix(0, nrow=100, ncol=6)
for(i in 1:100) acctree.mat[i,] <- compareTreecalcs(data=spam7,
                                                    fun="rpart")
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp
## How is the standard error calculated?
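## A rough check (assumption: a simple binomial approximation, which need not be
## exactly what rpart computes internally): for the root node of the spam tree,
## with error rate p = 1813/4601, the SE of the relative error is approximately
## sqrt(p*(1-p)/n)/p, close to the xstd of 0.018 printed for the root above.
p <- 1813/4601; n <- 4601
sqrt(p*(1-p)/n)/p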
## When are tree-based methods appropriate?
## Sec 11.7: The {randomForest} Package
library(randomForest)
spam7.rf <- randomForest(yesno ~ ., data=spam7, importance=TRUE)
print(spam7.rf)
##
## Call:
## randomForest(formula = yesno ~ ., data = spam7, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 11.82%
## Confusion matrix:
## n y class.error
## n 2646 142 0.05093
## y 402 1411 0.22173
tuneRF(x=spam7[, -7], y=spam7$yesno, trace=FALSE)
## -0.1642 0.05
## -0.04104 0.05
##       mtry OOBError
## 1.OOB    1   0.1356
## 2.OOB    2   0.1165
## 4.OOB    4   0.1213

importance(spam7.rf)
##              n      y MeanDecreaseAccuracy MeanDecreaseGini
## crl.tot  54.02  55.94                73.76           244.04
## dollar   56.67  54.88                74.41           422.63
## bang    103.14 100.62               122.83           596.54
## money    33.15  52.19                52.63           207.90
## n000     60.16  14.60                62.98           121.01
## make     13.40  20.67                24.67            41.81
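## Hedged addition (not in the original code): randomForest's varImpPlot() gives
## a graphical display of the same importance measures.
varImpPlot(spam7.rf)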
## Comparison between {rpart()} and {randomForest()}
## Footnote Code
## Accuracy comparisons
acctree.mat <- matrix(0, nrow=100, ncol=8)
colnames(acctree.mat) <- c("rpSEcvI", "rpcvI", "rpSEtest", "rptest",
                           "n.SErule", "nre.min.12", "rfcvI", "rftest")
for(i in 1:100) acctree.mat[i,] <-
  compareTreecalcs(data=spam7, fun=c("rpart", "randomForest"))
acctree.df <- data.frame(acctree.mat)
lims <- range(acctree.mat[, c(4,7,8)], na.rm=TRUE)  # common axis limits for both panels
plot(rfcvI ~ rftest, data=acctree.df, xlim=lims, ylim=lims); abline(0,1)  # Panel A

plot(rptest ~ rftest, data=acctree.df, xlim=lims, ylim=lims); abline(0,1)  # Panel B

## Efficient computation
## Differences between {rpart()} and {randomForest()}
## Sec 11.8: Additional Notes on Tree-Based Methods
## The combining of tree-based methods with other approaches
## Models with a complex error structure
## Pruning as variable selection
## Other types of tree
## Summary of pluses and minuses of tree-based methods
## Sec 11.9: Further Reading and Extensions
library(MASS)
sapply(biopsy, function(x) sum(is.na(x)))
##    ID    V1    V2    V3    V4    V5    V6    V7    V8    V9 class
##     0     0     0     0     0     0    16     0     0     0     0
biops <- na.omit(biopsy[,-1]) ## Column 1 is ID
## Examine list element names in randomForest object
names(randomForest(class ~ ., data=biops))
## [1] "call" "type" "predicted"
## [4] "err.rate" "confusion" "votes"
## [7] "oob.times" "classes" "importance"
## [10] "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest"
## [16] "y" "test" "inbag"
## [19] "terms"
## Repeated runs, note variation in OOB accuracy.
for(i in 1:10) {
  biops.rf <- randomForest(class ~ ., data=biops)
  OOBerr <- mean(biops.rf$err.rate[,"OOB"])
  print(paste(i, ": ", round(OOBerr, 4), sep=""))
  print(round(biops.rf$confusion, 4))
}
## [1] "1: 0.0275"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 6 233 0.0251
## [1] "2: 0.0271"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 6 233 0.0251
## [1] "3: 0.0277"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 6 233 0.0251
## [1] "4: 0.0299"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 7 232 0.0293
## [1] "5: 0.0298"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 7 232 0.0293
## [1] "6: 0.0294"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 8 231 0.0335
## [1] "7: 0.0297"
## benign malignant class.error
## benign 432 12 0.0270
## malignant 7 232 0.0293
## [1] "8: 0.0283"
## benign malignant class.error
## benign 431 13 0.0293
## malignant 7 232 0.0293
## [1] "9: 0.0302"
## benign malignant class.error
## benign 431 13 0.0293
## malignant 7 232 0.0293
## [1] "10: 0.03"
## benign malignant class.error
## benign 433 11 0.0248
## malignant 7 232 0.0293
## Repeated train/test splits: OOB accuracy vs test set accuracy.
for(i in 1:10){
  trRows <- sample(1:dim(biops)[1], size=round(dim(biops)[1]/2))
  biops.rf <- randomForest(class ~ ., data=biops[trRows, ],
                           xtest=biops[-trRows, -10],
                           ytest=biops[-trRows, 10])
  oobErr <- mean(biops.rf$err.rate[,"OOB"])
  testErr <- mean(biops.rf$test$err.rate[,"Test"])
  print(round(c(oobErr, testErr), 4))
}
## [1] 0.0318 0.0260
## [1] 0.0233 0.0493
## [1] 0.0283 0.0441
## [1] 0.0341 0.0301
## [1] 0.0471 0.0249
## [1] 0.0426 0.0331
## [1] 0.0267 0.0316
## [1] 0.0318 0.0307
## [1] 0.0418 0.0210
## [1] 0.0351 0.0276
randomForest(class ~ ., data=biops, xtest=biops[,-10],
             ytest=biops[,10])
##
## Call:
## randomForest(formula = class ~ ., data = biops, xtest = biops[, -10], ytest = biops[, 10])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 2.64%
## Confusion matrix:
## benign malignant class.error
## benign 432 12 0.02703
## malignant 6 233 0.02510
## Test set error rate: 0%
## Confusion matrix:
## benign malignant class.error
## benign 444 0 0
## malignant 0 239 0
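## Hedged sketch (not part of the original code): the 0% "test" error above arises
## because the test set is the training data, and predictions for supplied data use
## all trees. The OOB estimate avoids this, as predict() without newdata shows.
biops.rf <- randomForest(class ~ ., data=biops)
resub <- predict(biops.rf, newdata=biops)  # every row predicted by all trees
mean(resub != biops$class)                 # close to 0
oob <- predict(biops.rf)                   # out-of-bag predictions
mean(oob != biops$class)                   # close to the OOB error rate above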
## Run model on total data
randomForest(as.factor(type) ~ ., data=Pima.tr)
##
## Call:
## randomForest(formula = as.factor(type) ~ ., data = Pima.tr)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 28.5%
## Confusion matrix:
## No Yes class.error
## No 110 22 0.1667
## Yes 35 33 0.5147
rowsamp <- sample(dim(Pima.tr)[1], replace=TRUE)
randomForest(as.factor(type) ~ ., data=Pima.tr[rowsamp, ])
##
## Call:
## randomForest(formula = as.factor(type) ~ ., data = Pima.tr[rowsamp, ])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 12%
## Confusion matrix:
## No Yes class.error
## No 114 11 0.0880
## Yes 13 62 0.1733
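## Hedged note (not from the text): the apparent drop in the OOB error rate arises
## because sampling rows with replacement duplicates observations, so "out-of-bag"
## cases often have exact copies in the bag; roughly 1 - 1/e of the sampled rows
## are duplicates.
mean(duplicated(rowsamp))   # typically about 0.37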