Chapter 11: Tree-based Classification and Regression
Packages required: “DAAG”, “lattice”, “rpart”, “lme4”, “MASS”, “randomForest”, “grid”
The script that follows is designed to be executed as it stands. It sets up
functions that create Ch 11 graphs, then runs those functions. If executed by
clicking on the RStudio “Compile Notebook …” command, it will be processed
through R Markdown, generating a document that includes code, output and
graphs generated by executing the functions. If some needed packages are
missing, warning messages will appear. See further:
http://rmarkdown.rstudio.com/r_notebook_format.html
g11.1 <-
function(device="")
{
if(device!="")hardcopy(width=4.8, height=4, device=device)
oldpar <- par(mfrow=c(2,6),mar=c(4.6,2.6,1.6,1.1),mgp=c(1,.5,0),
oma=c(0,3,0,0))
on.exit(par(oldpar))
if(!require(DAAG))return("Package 'DAAG' must be installed.")
nam <- c("crl.tot", "dollar", "bang", "money", "n000", "make")
nr <- sample(1:dim(spam7)[1],500)
yesno<-spam7$yesno[nr]
spam7 <- spam7[nr,nam]
nam2 <- names(spam7)
nam2[2] <- "$"
nam2[1] <- "Total runs"
nam2[5] <- "000"
spam7.2 <- spam7
spam7.2[,1]<-log(spam7.2[,1]+0.5)
spam7.2[,2:6]<-log(spam7.2[,2:6]+0.5)
for (namtxt in nam){
boxplot(split(spam7[,namtxt],yesno),cex=0.65,axes=F,boxwex=0.5)
box()
par(mgp=c(1,.75,0))
axis(2)
par(mgp=c(1,.25,0))
axis(1,at=1:2,labels=c("n","y"))
i <- match(namtxt,nam)
mtext(side=1,line=1.75,nam2[i],adj=0.5)
if(i==1)mtext(side=1,line=2.75,"of capitals")
}
xval <-c(.1,.2,.5,1,2,5,10,20,50,100,200,500,1000,2000)
for (namtxt in nam){
boxplot(split(spam7.2[,namtxt],yesno),cex=0.65,axes=F,boxwex=0.5)
box()
ranx <- range(spam7[,namtxt])
yloc<-xval[xval>=min(ranx)&xval<max(ranx)]
par(mgp=c(1,.75,0))
axis(2,at=log(yloc+0.5),labels=paste(round(yloc,1)))
par(mgp=c(1,.25,0))
axis(1,at=1:2,labels=c("n","y"))
i <- match(namtxt,nam)
if(i==1){mtext(side=2,line=0.75,"(Logarithmic scales)",outer=T,at=0.25)
mtext(side=1,line=2.75,"of capitals")}
mtext(side=1,line=1.75,nam2[i],adj=0.5)
}
figtxt <- paste("Scatterplot matrix for selected variables, in 200 out",
"\nof 4601 rows in the SPAM data base. Mail was all",
"\nreceived by one individual. The figure on the left",
"\nis for untransformed data, while that on the right is",
"\nfor log transformed data.")
cat(figtxt, "\n")
if(device!="")dev.off()
}
g11.2 <-
function(device=""){
if(device!="")hardcopy(width=3.0, height=2.75, device=device)
nam <- c("crl.tot", "dollar", "bang", "money", "n000", "make")
if(!require(rpart))return("Package 'rpart' must be installed.")
oldpar<-par(mar=c(1.1,1.1,0.6,0.6),mgp=c(1.5,.5,0),oma=c(0,0,0,0))
on.exit(par(oldpar))
if(!exists("spam7.rpart"))
spam7.rpart<-rpart(formula = yesno ~ crl.tot + dollar + bang
+ money + n000 + make, data=spam7)
plot(spam7.rpart)
par(xpd=TRUE)
text(spam7.rpart)
par(xpd=FALSE)
if(device!="")dev.off()
}
g11.3 <-
function(device="")
{
if(device!="")hardcopy(width=2, height=2, device=device)
if(!require(rpart))return("Package 'rpart' must be installed.")
nam <- c("crl.tot", "dollar", "bang", "money", "n000", "make")
Criterion <- factor(paste("Leaf", 1:5))
Node <- c(1,2,3,4,5)
tree.df <- data.frame(Criterion = Criterion, Node = Node)
nobs <- dim(tree.df)[[1]]
u.tree <- rpart(Node ~ Criterion, data = tree.df,
control = list(minsplit = 2, minbucket = 1))
oldpar <- par(mar = c(1.1, 1.1, 0.6, 1.1))
on.exit(par(oldpar))
plot(u.tree, uniform=T)
par(xpd=T)
text(u.tree)
par(xpd=F)
tx <- paste("Tree labelling. For generating this illustrative tree,",
"\nthe only term used to determine the split is a variable",
"\nwith the name `Criterion'. More generally, different",
"\nvariable or factor names may appear at different nodes.",
"\nTerminal nodes (`leaves') are numbered 1, ..., 5")
cat(tx, "\n")
if(device!="")dev.off()
invisible()
}
g11.4 <-
function(device="")
{
if(device!="")hardcopy(width=2, height=2, device=device)
nam <- c("crl.tot", "dollar", "bang", "money", "n000", "make")
fac <- factor(rep(paste("Leaf", 1:2), 3))
x <- rep(1:3, rep(2, 3))
Node <- 1:6
assign("tree.df", data.frame(fac = fac, x = x, Node = Node))
nobs <- dim(tree.df)[[1]]
u.tree <- rpart(Node ~ fac + x, data = tree.df,
control = list(minsplit = 2, minbucket = 1, cp = 1e-009))
oldpar <- par(mar = c(1.1, 1.1, 1.1, 1.1), mgp = c(1, 0.5, 0),
oma=c(0.5,0.5,0,0))
on.exit(par(oldpar))
plot(u.tree, uniform=T)
par(xpd=TRUE)
text(u.tree)
par(xpd=FALSE)
figtxt <- paste(
"Here an illustrative tree is generated from",
"\none explanatory factor (`fac') and one explanatory",
"\nvariable (`x'). For factors, the levels that lead",
"\nto a branch to the left are given. For variables,",
"\nthe range of values is given that leads to taking",
"\nthe left branch.", sep = "")
cat(figtxt,"\n")
if(device!="")dev.off()
invisible()
}
g11.5 <-
function(cuts = c(1845, 2567.5, 2747.5, 3087.5, 3855), device="")
{
if(!require(MASS))return("Package 'MASS' must be installed.")
if(device!="")hardcopy(width=2.75, height=2.75, device=device)
titl <- paste("Mileage versus Weight, for cars described in US April 1990",
"\nConsumer Reports.", sep = "")
oldpar <- par(mar = c(4.1,4.1,1.6,1.1), mgp = c(2.5, 0.75, 0))
par(mex = 1, cex = 1)
on.exit(par(oldpar))
assign("car.df",car.test.frame)
attach(car.df)
u.lo <- loess(Mileage~Weight, data = car.df, span = 2/3)
plot(Mileage~Weight, data=car.df, xlab = "Weight",
ylab = "Miles per gallon", sub = "")
xy <- loess.smooth(Weight, Mileage)
ord<-order(xy$x)
lines(xy$x[ord],xy$y[ord])
new.df <- data.frame(Weight = cuts)
hat <- predict(u.lo, newdata = new.df)
new.df$hat <- hat
print(summary(u.lo))
detach(car.df)
cat(titl,"\n")
if(device!="")dev.off()
invisible(new.df)
}
g11.6 <-
function(device="")
{
if(device!="")hardcopy(width=5.0, height=2.4, device=device, pointsize=12)
on.exit(par(oldpar))
tx <- paste("Tree-based model for predicting Mileage given Weight,",
"\nfor cars described in US April 1990 Consumer Reports.",
"\nFor present illustrative purposes, split criteria have",
"\nbeen changed from the RPART default, to increase the",
"\nnumber of splits. Notice that for this plot we have",
"\nused uniform vertical spacing.")
oldpar <- par(mar = c(1.6, 1.1, 1.6, 1.1), mgp = c(1, 0.5, 0),
oma=c(0,1,0,0), xpd=TRUE)
par(fig=c(0, 0.575, 0,1))
on.exit(par(oldpar))
names(car.test.frame)[6] <- "Wt"
car.tree <- rpart(Mileage~Wt, data=car.test.frame, control =
list(minsplit = 10, minbucket = 5, cp = 0.0001))
plot(car.tree, uniform = T)
par(xpd=TRUE)
text(car.tree, digits = 3, use.n = T, cex=0.55)
par(xpd=FALSE)
x0 <- par()$usr[1]+diff(par()$usr[1:2])*0.05
y0 <- par()$usr[4]+diff(par()$usr[3:4])*0.06
text(x0,y0, "A", cex=0.85, xpd=TRUE)
par(fig=c(0.55, 1, 0,1), new=TRUE)
car.tree2 <- rpart(Mileage ~ Wt, data = car.test.frame)
plot(car.tree2, uniform = F)
par(xpd=TRUE)
text(car.tree2, digits = 3, use.n = T, cex=0.55)
par(xpd=FALSE)
x0 <- par()$usr[1]+diff(par()$usr[1:2])*0.1
y0 <- par()$usr[4]+diff(par()$usr[3:4])*0.06
text(x0,y0, "B", cex=0.85, xpd=TRUE)
if(device!="")dev.off()
invisible(list(car.tree, car.tree2))
}
g11.7 <-
function(device="", seed=7)
{
if(device!="")hardcopy(width=4.5, height=2.5, device=device)
if(!is.null(seed))set.seed(seed)
titl <- paste("R plot designed to assist in choosing tree size for the",
"\ncar weight & mileage data.")
oldpar <- par(mar = c(4.1, 4.1, 3.6, 2.6), mgp = c(2.75, 0.75, 0),
mfrow=c(1,2), pty="s")
on.exit(par(oldpar))
car.rpart1 <- rpart(Price~., data=car.test.frame, cp=0.0001)
car.rpart2 <- rpart(Price~., data=car.test.frame, cp=0.0001)
car.rpart3 <- rpart(Price~., data=car.test.frame, cp=0.0001)
pr1 <- printcp(car.rpart1)
pr2 <- printcp(car.rpart2)
pr3 <- printcp(car.rpart3)
cp <- pr1[,"CP"]
cp0 <- sqrt(cp*c(Inf,cp[-length(cp)]))
nsplit <- pr1[,"nsplit"]
plot(nsplit,pr1[,"rel error"], xlab="No. of splits",
ylab="Relative error", type="b")
axis(3,at=nsplit, labels=paste(round(cp0,3)))
mtext(side=3, line=2, "Complexity parameter")
mtext(side =3, line=2.5,"A", adj=-0.2)
plot(nsplit,pr1[,"xerror"], xlab="No. of splits",
ylab="Xval relative error", type="b")
lines(nsplit,pr2[,"xerror"],lty=2, type="b")
lines(nsplit,pr2[,"xerror"],lty=3, type="b")
lines(nsplit,pr3[,"xerror"],lty=3, type="b")
axis(3,at=nsplit,labels=paste(round(cp0, 3)))
mtext(side=3,line=2, "Complexity parameter")
mtext(side =3, line=2.5,"B", adj=-0.2)
cat(titl, "\n")
if(device!="")dev.off() else par(mfrow=c(1,1))
invisible()
}
g11.8 <-
function(device=""){
if(device!="")hardcopy(width=2.5, height=1.75, device=device)
oldpar<-par(mar=c(4.1,4.1,3.1,3.1),mgp=c(2,.5,0), oma=c(0,0,0,0), pty="s",
las=0, tcl=-0.25)
on.exit(par(oldpar))
if(!require(DAAG))return("Package 'DAAG' must be installed.")
if(!exists("mifem.rpart"))
mifem.rpart <- rpart(outcome ~ ., data=mifem, cp=0.0025)
plotcp(mifem.rpart)
if(device!="")dev.off()
}
g11.9 <-
function(device=""){
if(device!="")hardcopy(width=1.5, height=1, device=device, pointsize=9)
if(!exists("mifemb.rpart"))
mifem.rpart <- rpart(outcome ~ ., method="class",
data = mifem, cp = 0.0025)
if(!exists("spam7b.rpart"))
mifemb.rpart <- prune(mifem.rpart, cp=0.006)
oldpar<-par(mar=c(1.6,1.6,1.1,1.6),mgp=c(1.1,.5,0),oma=c(0,0,0,0))
on.exit(par(oldpar))
plot(mifemb.rpart)
par(xpd=TRUE)
text(mifemb.rpart, use.n=T, digits=3, cex=0.65)
par(xpd=FALSE)
if(device!="")dev.off()
}
g11.10 <-
function(device=""){
if(device!="")hardcopy(width=4.5, height=3.75, device=device)
if(!exists("spam7a.rpart"))
spam7a.rpart <- rpart(yesno ~ crl.tot+dollar+bang+money+
n000+make, data=spam7, cp=0.001)
if(!exists("spam7b.rpart"))spam7b.rpart <-
prune(spam7a.rpart, cp=0.003)
oldpar<-par(mar=c(1.1,1.1,0.6,1.1),mgp=c(1.5,.5,0),oma=c(0,0,0,0),
cex=0.8)
on.exit(par(oldpar))
plot(spam7b.rpart,uniform=T)
text(spam7b.rpart,cex=0.9,digits=3)
if(device!="")dev.off()
}
g11.11 <-
function(acc.df=acctree.df, device=""){
if(device!="")hardcopy(width=4, height=2, device=device)
## for(i in 1:100){cvmat2[i,] <- compare.acc.trees(); cat(i,"\n")}
oldpar<-par(mar=c(4.1,4.1, 1.6, 1.6), mgp=c(2.5, 0.75,0), pty="s",
mfrow=c(1,2)
)
on.exit(par(oldpar))
if(!require(randomForest))return("Package 'randomForest' must be installed.")
acctree.mat <- matrix(0, nrow=100, ncol=8)
colnames(acctree.mat) <- c("rpSEcvI", "rpcvI", "rpSEtest", "rptest",
"n.SErule", "nre.min.12", "rfcvI", "rftest")
for(i in 1:100)acctree.mat[i,] <-
compareTreecalcs(data=spam7, fun=c("rpart", "randomForest"))
acctree.df <- data.frame(acctree.mat)
lims <- range(acctree.mat[, c(4,7,8)], na.rm=TRUE)
plot(rfcvI ~ rftest, data=acc.df, xlab="Error rate - subset II",
ylab="OOB Error - fit to subset I", xlim=lims, ylim=lims)
abline(0,1)
mtext(side=3, line=0.25, "A", adj=0)
plot(rptest ~ rftest, data=acc.df, xlab="Error rate - subset II",
ylab="rpart Error rate, subset II",
xlim=lims, ylim=lims)
abline(0,1)
mtext(side=3, line=0.25, "B", adj=0)
if(device!="")dev.off()
}
pkgs <- c("DAAG", "lattice", "rpart", "lme4", "MASS", "randomForest", "grid")
z <- sapply(pkgs, require, character.only=TRUE, warn.conflicts=FALSE, quietly=TRUE)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
if(any(!z)){
notAvail <- paste(names(z)[!z], collapse=", ")
warning(paste("The following packages should be installed:", notAvail))
}
g11.1()
## Scatterplot matrix for selected variables, in 200 out
## of 4601 rows in the SPAM data base. Mail was all
## received by one individual. The figure on the left
## is for untransformed data, while that on the right is
## for log transformed data.
g11.2()
g11.3()
## Tree labelling. For generating this illustrative tree,
## the only term used to determine the split is a variable
## with the name `Criterion'. More generally, different
## variable or factor names may appear at different nodes.
## Terminal nodes (`leaves') are numbered 1, ..., 5
g11.4()
## Here an illustrative tree is generated from
## one explanatory factor (`fac') and one explanatory
## variable (`x'). For factors, the levels that lead
## to a branch to the left are given. For variables,
## the range of values is given that leads to taking
## the left branch.
g11.5()
## Call:
## loess(formula = Mileage ~ Weight, data = car.df, span = 2/3)
##
## Number of Observations: 60
## Equivalent Number of Parameters: 5.29
## Residual Standard Error: 2.42
## Trace of smoother matrix: 5.81
##
## Control settings:
## normalize: TRUE
## span : 0.6667
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## Mileage versus Weight, for cars described in US April 1990
## Consumer Reports.
g11.6()
g11.7()
##
## Regression tree:
## rpart(formula = Price ~ ., data = car.test.frame, cp = 1e-04)
##
## Variables actually used in tree construction:
## [1] Country Type Weight
##
## Root node error: 9.8e+08/60 = 1.6e+07
##
## n= 60
##
## CP nsplit rel error xerror xstd
## 1 0.4321 0 1.00 1.02 0.208
## 2 0.1576 1 0.57 0.77 0.138
## 3 0.0975 2 0.41 0.61 0.127
## 4 0.0241 3 0.31 0.56 0.098
## 5 0.0001 4 0.29 0.54 0.105
##
## Regression tree:
## rpart(formula = Price ~ ., data = car.test.frame, cp = 1e-04)
##
## Variables actually used in tree construction:
## [1] Country Type Weight
##
## Root node error: 9.8e+08/60 = 1.6e+07
##
## n= 60
##
## CP nsplit rel error xerror xstd
## 1 0.4321 0 1.00 1.02 0.21
## 2 0.1576 1 0.57 0.79 0.17
## 3 0.0975 2 0.41 0.70 0.14
## 4 0.0241 3 0.31 0.69 0.14
## 5 0.0001 4 0.29 0.69 0.14
##
## Regression tree:
## rpart(formula = Price ~ ., data = car.test.frame, cp = 1e-04)
##
## Variables actually used in tree construction:
## [1] Country Type Weight
##
## Root node error: 9.8e+08/60 = 1.6e+07
##
## n= 60
##
## CP nsplit rel error xerror xstd
## 1 0.4321 0 1.00 1.04 0.21
## 2 0.1576 1 0.57 0.85 0.17
## 3 0.0975 2 0.41 0.65 0.14
## 4 0.0241 3 0.31 0.65 0.13
## 5 0.0001 4 0.29 0.66 0.14
## R plot designed to assist in choosing tree size for the
## car weight & mileage data.
g11.8()
g11.9()
g11.10()
g11.11()
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp
## Minimum cv error was for largest tree
## Perhaps try a smaller value of cp