## Chapter 11: Tree-based Classification and Regression 
##                                Note 

## Sec 11.1: The Uses of Tree-based Methods 
## ss 11.1.1: Problems for which tree-based regression may be used 
##              When are tree-based methods appropriate? 

## Sec 11.2: Detecting Email Spam -- an Example 
## Footnote Code
## Obtain 500-row sample; repeat the first plot (of crl.tot) 
library(DAAG)
## Loading required package: lattice
## Draw 500 distinct rows of spam7 at random. The number of rows is taken
## from the data frame itself rather than hard-coded, so the sample stays
## valid if spam7 ever has a different number of rows.
spam.sample <- spam7[sample.int(nrow(spam7), 500, replace = FALSE), ]
## Side-by-side boxplots of crl.tot, one box per level of yesno
boxplot(split(spam.sample$crl.tot, spam.sample$yesno))

## plot of chunk unnamed-chunk-1

library(rpart) 
## Classification tree for yesno, using six predictors from spam7
spam.rpart <- rpart(
  yesno ~ crl.tot + dollar + bang + money + n000 + make,
  data = spam7,
  method = "class"
)
plot(spam.rpart)   # draw the tree skeleton
text(spam.rpart)   # then label the splits and leaves

## plot of chunk unnamed-chunk-1

## ss 11.2.1: Choosing the number of splits 

## Sec 11.3: Terminology and Methodology 
## Footnote Code
## Code to plot tree 
## Toy data: five rows, each with its own factor level and its own response
Node <- as.numeric(1:5)
Criterion <- factor(paste("Leaf", 1:5))
demo.df <- data.frame(Criterion, Node)

## minsplit and minbucket are lowered explicitly for this five-row data set
demo.rpart <- rpart(
  Node ~ Criterion,
  data = demo.df,
  control = list(minsplit = 2, minbucket = 1)
)
plot(demo.rpart, uniform = TRUE)
text(demo.rpart)

## plot of chunk unnamed-chunk-1

## ss 11.3.1: Choosing the split -- regression trees 
## ss 11.3.2: Within and between sums of squares 
## ss 11.3.3: Choosing the split -- classification trees 
## ss 11.3.4: Tree-based regression versus loess regression smoothing 
## Scatterplot of Mileage against Weight with a loess curve overlaid;
## the data frame car.test.frame comes from rpart
with(car.test.frame, scatter.smooth(x = Weight, y = Mileage))

## plot of chunk unnamed-chunk-1

## Set the xpd graphics parameter for this and all later plots
par(xpd = TRUE)

## Regression tree for Mileage on Weight with explicit control settings
## (minsplit = 10, minbucket = 5, cp = 0.0001)
car.tree <- rpart(
  Mileage ~ Weight,
  data = car.test.frame,
  method = "anova",
  control = list(minsplit = 10, minbucket = 5, cp = 0.0001)
)
plot(car.tree, uniform = TRUE)
text(car.tree, use.n = TRUE, digits = 3)

## plot of chunk unnamed-chunk-1

## Refit the same model without overriding any control settings
## (this replaces the earlier car.tree object)
car.tree <- rpart(
  Mileage ~ Weight,
  data = car.test.frame
)
plot(car.tree, uniform = FALSE)
text(car.tree, use.n = TRUE, digits = 3)

## plot of chunk unnamed-chunk-1

## Sec 11.4: Predictive Accuracy, and the Cost-complexity Tradeoff 
## ss 11.4.1: Cross-validation 
## ss 11.4.2: The cost-complexity parameter 
## ss 11.4.3: Prediction error versus tree size 

## Sec 11.5: Data for female heart attack patients 
## Per-column summary of mifem; the printed result is reproduced below
summary(mifem)     # data frame mifem (DAAG) 
##  outcome         age          yronset     premi    smstat   diabetes
##  live:974   Min.   :35.0   Min.   :85.0   y :311   c :390   y :248  
##  dead:321   1st Qu.:57.0   1st Qu.:87.0   n :928   x :280   n :978  
##             Median :63.0   Median :89.0   nk: 56   n :522   nk: 69  
##             Mean   :60.9   Mean   :88.8            nk:103           
##             3rd Qu.:66.0   3rd Qu.:91.0                             
##             Max.   :69.0   Max.   :93.0                             
##  highbp   hichol   angina   stroke   
##  y :813   y :452   y :472   y : 153  
##  n :406   n :655   n :724   n :1063  
##  nk: 76   nk:188   nk: 99   nk:  79  
##                                      
##                                      
## 
## Classification tree for outcome on every other column of mifem,
## grown with cp = 0.0025
mifem.rpart <- rpart(
  outcome ~ .,
  data = mifem,
  method = "class",
  cp = 0.0025
)

## Plot of cross-validated error against cp
plotcp(mifem.rpart)

## plot of chunk unnamed-chunk-1

## Print the cp table for mifem.rpart (columns CP, nsplit, rel error,
## xerror, xstd); the printed result is reproduced below
printcp(mifem.rpart) # Tabular version of the same information 
## 
## Classification tree:
## rpart(formula = outcome ~ ., data = mifem, method = "class", 
##     cp = 0.0025)
## 
## Variables actually used in tree construction:
## [1] age      angina   diabetes hichol   premi    smstat   stroke   yronset 
## 
## Root node error: 321/1295 = 0.25
## 
## n= 1295 
## 
##       CP nsplit rel error xerror  xstd
## 1 0.2025      0      1.00   1.00 0.048
## 2 0.0056      1      0.80   0.80 0.045
## 3 0.0047     13      0.72   0.85 0.046
## 4 0.0031     17      0.70   0.84 0.046
## 5 0.0025     18      0.69   0.85 0.046
## Prune the full tree at cp = 0.03. This lies between the first two CP
## values in the printcp() table (0.2025 and 0.0056).
mifemb.rpart <- prune(mifem.rpart, cp = 0.03)

## Draw the pruned tree first; text() then adds labels to that plot.
plot(mifemb.rpart)   # May be needed so that labels appear
## use.n is spelled TRUE rather than T, because T is an ordinary variable
## that can be reassigned.
text(mifemb.rpart, use.n = TRUE, digits = 3)

## plot of chunk unnamed-chunk-1

## ss 11.5.1: The one-standard-deviation rule 
## ss 11.5.2: Printed Information on Each Split 
## Print the pruned tree node by node (split, n, loss, yval, yprob);
## the printed result is reproduced below
print(mifemb.rpart) 
## n= 1295 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 1295 321 live (0.7521 0.2479)  
##   2) angina=y,n 1196 239 live (0.8002 0.1998) *
##   3) angina=nk 99  17 dead (0.1717 0.8283) *
## Sec 11.6: Detecting Email Spam -- the Optimal Tree 
## Classification tree for yesno on the six spam7 predictors,
## grown with cp = 0.001
spam7a.rpart <- rpart(
  yesno ~ crl.tot + dollar + bang + money + n000 + make,
  data = spam7,
  method = "class",
  cp = 0.001
)

## Print the cp table for the fitted tree
printcp(spam7a.rpart)
## 
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 + 
##     make, data = spam7, method = "class", cp = 0.001)
## 
## Variables actually used in tree construction:
## [1] bang    crl.tot dollar  money   n000   
## 
## Root node error: 1813/4601 = 0.39
## 
## n= 4601 
## 
##        CP nsplit rel error xerror  xstd
## 1  0.4766      0      1.00   1.00 0.018
## 2  0.0756      1      0.52   0.56 0.015
## 3  0.0116      3      0.37   0.39 0.013
## 4  0.0105      4      0.36   0.38 0.013
## 5  0.0063      5      0.35   0.37 0.013
## 6  0.0055     10      0.32   0.36 0.013
## 7  0.0044     11      0.31   0.35 0.013
## 8  0.0039     12      0.31   0.34 0.013
## 9  0.0028     16      0.29   0.34 0.013
## 10 0.0022     17      0.29   0.33 0.013
## 11 0.0019     18      0.29   0.33 0.013
## 12 0.0017     20      0.28   0.34 0.013
## 13 0.0010     25      0.27   0.33 0.013
## Footnote Code
## Use prune.rpart() with cp = 0.003 (0.00276 < 0.003 < 0.00386),   
## to prune back to nsplit=16. 
## Prune the cp = 0.001 tree back at cp = 0.003, then draw and label it
spam7b.rpart <- prune(tree = spam7a.rpart, cp = 0.003)
plot(spam7b.rpart, uniform = TRUE)
text(spam7b.rpart, cex = 0.75)   # smaller text for the labels