``````## Chapter 11: Tree-based Classification and Regression
##                                Note

## Sec 11.1: The Uses of Tree-based Methods
## ss 11.1.1: Problems for which tree-based regression may be used
##              When are tree-based methods appropriate?

## Sec 11.2: Detecting Email Spam -- an Example
## Footnote Code
## Obtain 500-row sample; repeat the first plot (of crl.tot)
library(DAAG)``````
``## Loading required package: lattice``
``````## Draw a random 500-row sample (without replacement) from spam7;
## seq_len(nrow(spam7)) avoids hard-coding the row count (4601)
spam.sample <- spam7[sample(seq_len(nrow(spam7)), 500, replace=FALSE), ]
## Boxplots of total length of capital-letter runs, split by spam status
## (the original "\$" was a LaTeX escape artifact; R needs a plain "$")
boxplot(split(spam.sample$crl.tot, spam.sample$yesno)) ``````

``````library(rpart)
## Classification tree for spam status using the six retained predictors
spam.rpart <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make,
                    data = spam7, method = "class")
plot(spam.rpart)     # Draw tree

``````## ss 11.2.1: Choosing the number of splits

## Sec 11.3: Terminology and Methodology
## Footnote Code
## Code to plot tree
## Toy data: five distinct leaves, one numeric node value per leaf
Criterion <- factor(paste("Leaf", seq(1, 5)))
Node <- c(1, 2, 3, 4, 5)
demo.df <- data.frame(Criterion, Node)
## minsplit/minbucket are relaxed so every row becomes its own leaf
demo.rpart <- rpart(Node ~ Criterion, data = demo.df,
                    control = list(minsplit = 2, minbucket = 1))
plot(demo.rpart, uniform = TRUE)   # equal vertical spacing between levels
text(demo.rpart) ``````

``````## ss 11.3.1: Choosing the split -- regression trees
## ss 11.3.2: Within and between sums of squares
## ss 11.3.3: Choosing the split -- classification trees
## ss 11.3.4: Tree-based regression versus loess regression smoothing
## loess fit to Mileage vs Weight: data frame car.test.frame (rpart)
## scatter.smooth() draws the scatterplot and adds a loess curve
with(car.test.frame, scatter.smooth(Mileage ~ Weight)) ``````

``````## Allow node labels to print outside the plot region; save the old
## par() settings and restore them afterwards so the xpd change does
## not leak into later plots
oldpar <- par(xpd = TRUE)
## Deliberately over-fitted tree: small minsplit/minbucket and tiny cp
car.tree <- rpart(Mileage ~ Weight, data = car.test.frame,
                  control = list(minsplit = 10, minbucket = 5,
                                 cp = 0.0001), method = "anova")
plot(car.tree, uniform = TRUE)
text(car.tree, digits = 3, use.n = TRUE)  # use.n: show n of obs per node
par(oldpar) ``````

``````## Refit using rpart's default control settings; with uniform = FALSE
## branch depth is proportional to the improvement at each split
car.tree <- rpart(formula = Mileage ~ Weight, data = car.test.frame)
plot(x = car.tree, uniform = FALSE)
text(car.tree, use.n = TRUE, digits = 3) ``````

``````## Sec 11.4: Predictive Accuracy, and the Cost-complexity Tradeoff
## ss 11.4.1: Cross-validation
## ss 11.4.2: The cost-complexity parameter
## ss 11.4.3: Prediction error versus tree size

## Sec 11.5: Data for female heart attack patients
## Column summaries: factor level counts and numeric five-number summaries
summary(mifem)     # data frame mifem (DAAG) ``````
``````##  outcome         age          yronset     premi    smstat   diabetes
##  live:974   Min.   :35.0   Min.   :85.0   y :311   c :390   y :248
##  dead:321   1st Qu.:57.0   1st Qu.:87.0   n :928   x :280   n :978
##             Median :63.0   Median :89.0   nk: 56   n :522   nk: 69
##             Mean   :60.9   Mean   :88.8            nk:103
##             3rd Qu.:66.0   3rd Qu.:91.0
##             Max.   :69.0   Max.   :93.0
##  highbp   hichol   angina   stroke
##  y :813   y :452   y :472   y : 153
##  n :406   n :655   n :724   n :1063
##  nk: 76   nk:188   nk: 99   nk:  79
##
##
## ``````
``````mifem.rpart <- rpart(outcome ~ ., method="class",
data = mifem, cp = 0.0025)

plotcp(mifem.rpart)  # Cross-validated error vs cp ``````

``printcp(mifem.rpart) # Tabular version of the cp/xerror information plotted by plotcp() ``
``````##
## Classification tree:
## rpart(formula = outcome ~ ., data = mifem, method = "class",
##     cp = 0.0025)
##
## Variables actually used in tree construction:
## [1] age      angina   diabetes hichol   premi    smstat   stroke   yronset
##
## Root node error: 321/1295 = 0.25
##
## n= 1295
##
##       CP nsplit rel error xerror  xstd
## 1 0.2025      0      1.00   1.00 0.048
## 2 0.0056      1      0.80   0.80 0.045
## 3 0.0047     13      0.72   0.85 0.046
## 4 0.0031     17      0.70   0.84 0.046
## 5 0.0025     18      0.69   0.85 0.046``````
``````## Prune back with cp = 0.03: this exceeds the CP of every split
## beyond the first (see the printcp() table), leaving a one-split tree
mifemb.rpart <- prune(mifem.rpart, cp=0.03)

plot(mifemb.rpart)   # May be needed so that labels appear
## use TRUE, not the reassignable shorthand T
text(mifemb.rpart, use.n=TRUE, digits=3) ``````

``````## ss 11.5.1: The one-standard-deviation rule
## ss 11.5.2: Printed Information on Each Split
## Show each node's split criterion, n, loss, fitted class and class probs
print(mifemb.rpart) ``````
``````## n= 1295
##
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
##
## 1) root 1295 321 live (0.7521 0.2479)
##   2) angina=y,n 1196 239 live (0.8002 0.1998) *
##   3) angina=nk 99  17 dead (0.1717 0.8283) *``````
``````## Sec 11.6: Detecting Email Spam -- the Optimal Tree
## Grow a large tree (cp = 0.001); printcp() below guides the choice
## of where to prune it back
spam7a.rpart <- rpart(formula = yesno ~ crl.tot + dollar +
bang + money + n000 + make,
method="class", data = spam7, cp = 0.001)

printcp(spam7a.rpart) ``````
``````##
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 +
##     make, data = spam7, method = "class", cp = 0.001)
##
## Variables actually used in tree construction:
## [1] bang    crl.tot dollar  money   n000
##
## Root node error: 1813/4601 = 0.39
##
## n= 4601
##
##        CP nsplit rel error xerror  xstd
## 1  0.4766      0      1.00   1.00 0.018
## 2  0.0756      1      0.52   0.56 0.015
## 3  0.0116      3      0.37   0.39 0.013
## 4  0.0105      4      0.36   0.38 0.013
## 5  0.0063      5      0.35   0.37 0.013
## 6  0.0055     10      0.32   0.36 0.013
## 7  0.0044     11      0.31   0.35 0.013
## 8  0.0039     12      0.31   0.34 0.013
## 9  0.0028     16      0.29   0.34 0.013
## 10 0.0022     17      0.29   0.33 0.013
## 11 0.0019     18      0.29   0.33 0.013
## 12 0.0017     20      0.28   0.34 0.013
## 13 0.0010     25      0.27   0.33 0.013``````
``````## Footnote Code
## Use prune.rpart() with cp = 0.003 (0.00276 < 0.003 < 0.00386),
## to prune back to nsplit=16.
spam7b.rpart <- prune(spam7a.rpart, cp=0.003)
plot(spam7b.rpart, uniform=TRUE)
text(spam7b.rpart, cex=0.75) ``````