Chapter 11: Tree-based Classification and Regression
Packages required: “DAAG”, “lattice”, “rpart”, “lme4”, “MASS”, “randomForest”, “grid”

The script that follows is designed to be executed as it stands. It sets up
functions that create Ch 11 graphs, then runs those functions. If executed by
clicking on the RStudio “Compile Notebook …” command, it will be processed
through R Markdown, generating a document that includes code, output and
graphs generated by executing the functions. If some needed packages are
missing, warning messages will appear. See further:
http://rmarkdown.rstudio.com/r_notebook_format.html

g11.1 <-
function(device="")
{
  if(device!="")hardcopy(width=4.8, height=4, device=device)
  oldpar <- par(mfrow=c(2,6),mar=c(4.6,2.6,1.6,1.1),mgp=c(1,.5,0),
                oma=c(0,3,0,0))
  on.exit(par(oldpar))
  if(!require(DAAG))return("Package 'DAAG' must be installed.")   
  nam <- c("crl.tot", "dollar", "bang", "money", "n000", "make")
  nr <- sample(1:dim(spam7)[1],500)
  yesno<-spam7$yesno[nr]
  spam7 <- spam7[nr,nam]
  nam2 <- names(spam7)
  nam2[2] <- "$"
  nam2[1] <- "Total runs"
  nam2[5] <- "000"
  spam7.2 <- spam7
  spam7.2[,1]<-log(spam7.2[,1]+0.5)
  spam7.2[,2:6]<-log(spam7.2[,2:6]+0.5)
  for (namtxt in nam){
    boxplot(split(spam7[,namtxt],yesno),cex=0.65,axes=F,boxwex=0.5)
    box()
    par(mgp=c(1,.75,0))
    axis(2)
    par(mgp=c(1,.25,0))
    axis(1,at=1:2,labels=c("n","y"))
    i <- match(namtxt,nam)
    mtext(side=1,line=1.75,nam2[i],adj=0.5)
    if(i==1)mtext(side=1,line=2.75,"of capitals")
  }
  xval <-c(.1,.2,.5,1,2,5,10,20,50,100,200,500,1000,2000)
  for (namtxt in nam){
    boxplot(split(spam7.2[,namtxt],yesno),cex=0.65,axes=F,boxwex=0.5)
    box()
    ranx <- range(spam7[,namtxt])
    yloc<-xval[xval>=min(ranx)&xval<max(ranx)]
    par(mgp=c(1,.75,0))
    axis(2,at=log(yloc+0.5),labels=paste(round(yloc,1)))
    par(mgp=c(1,.25,0))
    axis(1,at=1:2,labels=c("n","y"))
    i <- match(namtxt,nam)
    if(i==1){mtext(side=2,line=0.75,"(Logarithmic scales)",outer=T,at=0.25)
             mtext(side=1,line=2.75,"of capitals")}
    mtext(side=1,line=1.75,nam2[i],adj=0.5)
  }

  figtxt <- paste("Scatterplot matrix for selected variables, in 200 out",
                  "\nof 4601 rows in the SPAM data base.  Mail was all",
                  "\nreceived by one individual.  The figure on the left",
                  "\nis for untransformed data, while that on the right is",
                  "\nfor log transformed data.")
  cat(figtxt, "\n")
  if(device!="")dev.off()
}

g11.2 <-
function(device=""){
    if(device!="")hardcopy(width=3.0, height=2.75, device=device)
    nam <- c("crl.tot", "dollar", "bang", "money", "n000", "make")
    if(!require(rpart))return("Package 'rpart' must be installed.")      
    oldpar<-par(mar=c(1.1,1.1,0.6,0.6),mgp=c(1.5,.5,0),oma=c(0,0,0,0))
    on.exit(par(oldpar))
    if(!exists("spam7.rpart"))
      spam7.rpart<-rpart(formula = yesno ~ crl.tot + dollar + bang
                         + money + n000 + make, data=spam7)
    plot(spam7.rpart)
    par(xpd=TRUE)
    text(spam7.rpart)
    par(xpd=FALSE)
    if(device!="")dev.off()
  }

g11.3 <-
function(device="")
{
  if(device!="")hardcopy(width=2, height=2, device=device)
  if(!require(rpart))return("Package 'rpart' must be installed.")    
  nam <- c("crl.tot", "dollar", "bang", "money", "n000", "make")
  Criterion <- factor(paste("Leaf", 1:5))
  Node <- c(1,2,3,4,5)
  tree.df <- data.frame(Criterion = Criterion, Node = Node)
  nobs <- dim(tree.df)[[1]]
  u.tree <- rpart(Node ~ Criterion, data = tree.df,
                  control = list(minsplit = 2, minbucket = 1))
  oldpar <- par(mar = c(1.1, 1.1, 0.6, 1.1))
  on.exit(par(oldpar))
  plot(u.tree, uniform=T)
  par(xpd=T)
  text(u.tree)
  par(xpd=F)
  tx <- paste("Tree labelling.  For generating this illustrative tree,",
              "\nthe only term used to determine the split is a variable",
              "\nwith the name `Criterion'.  More generally, different",
              "\nvariable or factor names may appear at different nodes.",
              "\nTerminal nodes (`leaves') are numbered 1, ..., 5")
  cat(tx, "\n")
  if(device!="")dev.off()
  invisible()
}

g11.4 <-
function(device="")
{
  if(device!="")hardcopy(width=2, height=2, device=device)
  nam <- c("crl.tot", "dollar", "bang", "money", "n000", "make")
  fac <- factor(rep(paste("Leaf", 1:2), 3))
  x <- rep(1:3, rep(2, 3))
  Node <- 1:6
  assign("tree.df", data.frame(fac = fac, x = x, Node = Node))
  nobs <- dim(tree.df)[[1]]
  u.tree <- rpart(Node ~ fac + x, data = tree.df,
                  control = list(minsplit = 2, minbucket = 1, cp = 1e-009))
  oldpar <- par(mar = c(1.1, 1.1, 1.1, 1.1), mgp = c(1, 0.5, 0),
                oma=c(0.5,0.5,0,0))
  on.exit(par(oldpar))
  plot(u.tree, uniform=T)
  par(xpd=TRUE)
  text(u.tree)
  par(xpd=FALSE)
  figtxt <- paste(
                  "Here an illustrative tree is generated from",
                  "\none explanatory factor (`fac') and one explanatory",
                  "\nvariable (`x').  For factors, the levels that lead",
                  "\nto a branch to the left are given.  For variables,",
                  "\nthe range of values is given that leads to taking",
                  "\nthe left branch.", sep = "")
  cat(figtxt,"\n")
  if(device!="")dev.off()
  invisible()
}

g11.5 <-
function(cuts = c(1845, 2567.5, 2747.5, 3087.5, 3855), device="")
{
  if(!require(MASS))return("Package 'MASS' must be installed.")    
  if(device!="")hardcopy(width=2.75, height=2.75, device=device)
  titl <- paste("Mileage versus Weight, for cars described in US April 1990",
                "\nConsumer Reports.", sep = "")
  oldpar <- par(mar = c(4.1,4.1,1.6,1.1), mgp = c(2.5, 0.75, 0))
  par(mex = 1, cex = 1)
  on.exit(par(oldpar))
  assign("car.df",car.test.frame)
  attach(car.df)
  u.lo <- loess(Mileage~Weight, data = car.df, span = 2/3)
  plot(Mileage~Weight, data=car.df, xlab = "Weight", 
       ylab = "Miles per gallon", sub = "")
  xy <- loess.smooth(Weight, Mileage)
  ord<-order(xy$x)
  lines(xy$x[ord],xy$y[ord])
  new.df <- data.frame(Weight = cuts)
  hat <- predict(u.lo, newdata = new.df)
  new.df$hat <- hat
  print(summary(u.lo))
  detach(car.df)
  cat(titl,"\n")
  if(device!="")dev.off()
  invisible(new.df)
}

g11.6 <-
function(device="")
{
  if(device!="")hardcopy(width=5.0, height=2.4, device=device, pointsize=12)
  on.exit(par(oldpar))
  tx <- paste("Tree-based model for predicting Mileage given Weight,",
              "\nfor cars described in US April 1990 Consumer Reports.",
              "\nFor present illustrative purposes, split criteria have",
              "\nbeen changed from the RPART default, to increase the",
              "\nnumber of splits.  Notice that for this plot we have",
              "\nused uniform vertical spacing.")
  oldpar <- par(mar = c(1.6, 1.1, 1.6, 1.1), mgp = c(1, 0.5, 0),
                oma=c(0,1,0,0), xpd=TRUE)
  par(fig=c(0, 0.575, 0,1))
  on.exit(par(oldpar))
  names(car.test.frame)[6] <- "Wt"
  car.tree <- rpart(Mileage~Wt, data=car.test.frame, control = 
                    list(minsplit = 10, minbucket = 5, cp = 0.0001))
  plot(car.tree, uniform = T)
  par(xpd=TRUE)
  text(car.tree, digits = 3, use.n = T, cex=0.55)
  par(xpd=FALSE)
  x0 <- par()$usr[1]+diff(par()$usr[1:2])*0.05
  y0 <- par()$usr[4]+diff(par()$usr[3:4])*0.06
  text(x0,y0, "A", cex=0.85, xpd=TRUE)
  par(fig=c(0.55, 1, 0,1), new=TRUE)
  car.tree2 <- rpart(Mileage ~ Wt, data = car.test.frame)
  plot(car.tree2, uniform = F)
  par(xpd=TRUE)
  text(car.tree2, digits = 3, use.n = T, cex=0.55)
  par(xpd=FALSE)
  x0 <- par()$usr[1]+diff(par()$usr[1:2])*0.1
  y0 <- par()$usr[4]+diff(par()$usr[3:4])*0.06
  text(x0,y0, "B", cex=0.85, xpd=TRUE)
  if(device!="")dev.off()
  invisible(list(car.tree, car.tree2))
}

g11.7 <-
function(device="", seed=7)
{
  if(device!="")hardcopy(width=4.5, height=2.5, device=device)
  if(!is.null(seed))set.seed(seed)
  titl <- paste("R plot designed to assist in choosing tree size for the",
                "\ncar weight & mileage data.")
  oldpar <- par(mar = c(4.1, 4.1, 3.6, 2.6), mgp = c(2.75, 0.75, 0),
                mfrow=c(1,2), pty="s")
  on.exit(par(oldpar))
  car.rpart1 <- rpart(Price~., data=car.test.frame, cp=0.0001)
  car.rpart2 <- rpart(Price~., data=car.test.frame, cp=0.0001)
  car.rpart3 <- rpart(Price~., data=car.test.frame, cp=0.0001)
  pr1 <- printcp(car.rpart1)
  pr2 <- printcp(car.rpart2)
  pr3 <- printcp(car.rpart3)
  cp <- pr1[,"CP"]
  cp0 <- sqrt(cp*c(Inf,cp[-length(cp)]))
  nsplit <- pr1[,"nsplit"]
  plot(nsplit,pr1[,"rel error"], xlab="No. of splits",
       ylab="Relative error", type="b")
  axis(3,at=nsplit, labels=paste(round(cp0,3)))
  mtext(side=3, line=2, "Complexity parameter")
  mtext(side =3, line=2.5,"A", adj=-0.2)
  plot(nsplit,pr1[,"xerror"], xlab="No. of splits", 
       ylab="Xval relative error", type="b")
  lines(nsplit,pr2[,"xerror"],lty=2, type="b")
  lines(nsplit,pr2[,"xerror"],lty=3, type="b")
  lines(nsplit,pr3[,"xerror"],lty=3, type="b")
  axis(3,at=nsplit,labels=paste(round(cp0, 3)))
  mtext(side=3,line=2, "Complexity parameter")
  mtext(side =3, line=2.5,"B", adj=-0.2)
  cat(titl, "\n")
  if(device!="")dev.off() else par(mfrow=c(1,1))
  invisible()
}

g11.8 <-
function(device=""){
    if(device!="")hardcopy(width=2.5, height=1.75, device=device)
    oldpar<-par(mar=c(4.1,4.1,3.1,3.1),mgp=c(2,.5,0), oma=c(0,0,0,0), pty="s",
                las=0, tcl=-0.25)
    on.exit(par(oldpar))
    if(!require(DAAG))return("Package 'DAAG' must be installed.")     
    if(!exists("mifem.rpart"))
      mifem.rpart <- rpart(outcome ~ ., data=mifem, cp=0.0025)
    plotcp(mifem.rpart)
    if(device!="")dev.off()
  }

g11.9 <-
function(device=""){
    if(device!="")hardcopy(width=1.5, height=1, device=device, pointsize=9)
    if(!exists("mifemb.rpart"))
      mifem.rpart <- rpart(outcome ~ ., method="class",  
                           data = mifem, cp = 0.0025) 
    if(!exists("spam7b.rpart"))
      mifemb.rpart <- prune(mifem.rpart, cp=0.006)
    oldpar<-par(mar=c(1.6,1.6,1.1,1.6),mgp=c(1.1,.5,0),oma=c(0,0,0,0))
    on.exit(par(oldpar))
    plot(mifemb.rpart)
    par(xpd=TRUE)
    text(mifemb.rpart, use.n=T, digits=3, cex=0.65)
    par(xpd=FALSE)
    if(device!="")dev.off()
  }

g11.10 <-
function(device=""){
    if(device!="")hardcopy(width=4.5, height=3.75, device=device)
    if(!exists("spam7a.rpart"))
      spam7a.rpart <- rpart(yesno ~ crl.tot+dollar+bang+money+
                            n000+make, data=spam7, cp=0.001)
    if(!exists("spam7b.rpart"))spam7b.rpart <-
      prune(spam7a.rpart, cp=0.003)
    oldpar<-par(mar=c(1.1,1.1,0.6,1.1),mgp=c(1.5,.5,0),oma=c(0,0,0,0),  
                cex=0.8)
    on.exit(par(oldpar))
    plot(spam7b.rpart,uniform=T)
    text(spam7b.rpart,cex=0.9,digits=3)
    if(device!="")dev.off()
  }

g11.11 <-
function(acc.df=acctree.df, device=""){
    if(device!="")hardcopy(width=4, height=2, device=device)
    ## for(i in 1:100){cvmat2[i,] <- compare.acc.trees(); cat(i,"\n")}    
    oldpar<-par(mar=c(4.1,4.1, 1.6, 1.6), mgp=c(2.5, 0.75,0), pty="s",
                mfrow=c(1,2)
                )
    on.exit(par(oldpar))
    if(!require(randomForest))return("Package 'randomForest' must be installed.")      
acctree.mat <- matrix(0, nrow=100, ncol=8)
colnames(acctree.mat) <- c("rpSEcvI", "rpcvI", "rpSEtest", "rptest",
                           "n.SErule", "nre.min.12", "rfcvI", "rftest")
for(i in 1:100)acctree.mat[i,] <- 
   compareTreecalcs(data=spam7, fun=c("rpart", "randomForest"))
acctree.df <- data.frame(acctree.mat)
    lims <- range(acctree.mat[, c(4,7,8)], na.rm=TRUE)
    plot(rfcvI ~ rftest, data=acc.df, xlab="Error rate - subset II",
         ylab="OOB Error - fit to subset I", xlim=lims, ylim=lims)
    abline(0,1)
    mtext(side=3, line=0.25, "A", adj=0)
    plot(rptest ~ rftest, data=acc.df, xlab="Error rate - subset II",
         ylab="rpart Error rate, subset II",
         xlim=lims, ylim=lims)
    abline(0,1)
    mtext(side=3, line=0.25, "B", adj=0)
    if(device!="")dev.off()
  }

pkgs <- c("DAAG", "lattice", "rpart", "lme4", "MASS", "randomForest", "grid")
z <- sapply(pkgs, require, character.only=TRUE, warn.conflicts=FALSE, quietly=TRUE)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
if(any(!z)){
  notAvail <- paste(names(z)[!z], collapse=", ")
  warning(paste("The following packages should be installed:", notAvail))
} 

g11.1()

plot of chunk unnamed-chunk-1

## Scatterplot matrix for selected variables, in 200 out 
## of 4601 rows in the SPAM data base.  Mail was all 
## received by one individual.  The figure on the left 
## is for untransformed data, while that on the right is 
## for log transformed data.
g11.2()

plot of chunk unnamed-chunk-1

g11.3()

plot of chunk unnamed-chunk-1

## Tree labelling.  For generating this illustrative tree, 
## the only term used to determine the split is a variable 
## with the name `Criterion'.  More generally, different 
## variable or factor names may appear at different nodes. 
## Terminal nodes (`leaves') are numbered 1, ..., 5
g11.4()

plot of chunk unnamed-chunk-1

## Here an illustrative tree is generated from
## one explanatory factor (`fac') and one explanatory
## variable (`x').  For factors, the levels that lead
## to a branch to the left are given.  For variables,
## the range of values is given that leads to taking
## the left branch.
g11.5()

plot of chunk unnamed-chunk-1

## Call:
## loess(formula = Mileage ~ Weight, data = car.df, span = 2/3)
## 
## Number of Observations: 60 
## Equivalent Number of Parameters: 5.29 
## Residual Standard Error: 2.42 
## Trace of smoother matrix: 5.81 
## 
## Control settings:
##   normalize:  TRUE 
##   span       :  0.6667 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
## Mileage versus Weight, for cars described in US April 1990
## Consumer Reports.
g11.6()

plot of chunk unnamed-chunk-1

g11.7()
## 
## Regression tree:
## rpart(formula = Price ~ ., data = car.test.frame, cp = 1e-04)
## 
## Variables actually used in tree construction:
## [1] Country Type    Weight 
## 
## Root node error: 9.8e+08/60 = 1.6e+07
## 
## n= 60 
## 
##       CP nsplit rel error xerror  xstd
## 1 0.4321      0      1.00   1.02 0.208
## 2 0.1576      1      0.57   0.77 0.138
## 3 0.0975      2      0.41   0.61 0.127
## 4 0.0241      3      0.31   0.56 0.098
## 5 0.0001      4      0.29   0.54 0.105
## 
## Regression tree:
## rpart(formula = Price ~ ., data = car.test.frame, cp = 1e-04)
## 
## Variables actually used in tree construction:
## [1] Country Type    Weight 
## 
## Root node error: 9.8e+08/60 = 1.6e+07
## 
## n= 60 
## 
##       CP nsplit rel error xerror xstd
## 1 0.4321      0      1.00   1.02 0.21
## 2 0.1576      1      0.57   0.79 0.17
## 3 0.0975      2      0.41   0.70 0.14
## 4 0.0241      3      0.31   0.69 0.14
## 5 0.0001      4      0.29   0.69 0.14
## 
## Regression tree:
## rpart(formula = Price ~ ., data = car.test.frame, cp = 1e-04)
## 
## Variables actually used in tree construction:
## [1] Country Type    Weight 
## 
## Root node error: 9.8e+08/60 = 1.6e+07
## 
## n= 60 
## 
##       CP nsplit rel error xerror xstd
## 1 0.4321      0      1.00   1.04 0.21
## 2 0.1576      1      0.57   0.85 0.17
## 3 0.0975      2      0.41   0.65 0.14
## 4 0.0241      3      0.31   0.65 0.13
## 5 0.0001      4      0.29   0.66 0.14

plot of chunk unnamed-chunk-1

## R plot designed to assist in choosing tree size for the 
## car weight & mileage data.
g11.8()

plot of chunk unnamed-chunk-1

g11.9()

plot of chunk unnamed-chunk-1

g11.10()

plot of chunk unnamed-chunk-1

g11.11()
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp

plot of chunk unnamed-chunk-1