Chapter 13: Regression on Principal Component or Discriminant Scores
Packages required: “DAAG”, “lattice”, “grid”, “MASS”, “randomForest”, “splines”
The script that follows is designed to be executed as it stands. It sets up
functions that create Ch 13 graphs, then runs those functions. If executed by
clicking on the RStudio “Compile Notebook …” command, it will be processed
through R Markdown, generating a document that includes code, output and
graphs generated by executing the functions. If some needed packages are
missing, warning messages will appear. See further:
http://rmarkdown.rstudio.com/r_notebook_format.html
g13.1 <-
function (device = "")
{
oldpal <- palette(gray(c(0, 0.6)))
if (device != "")
hardcopy(width = 2.75, height = 2.75, device = device)
oldpar <- par(mar = par()$mar + c(-0.5, 0, -2, 0))
on.exit(par(oldpar))
require(DAAG)
not.na <- apply(socsupport[, 9:19], 1, function(x) !any(is.na(x)))
not.na[36] <- FALSE
ss.pcp1 <- princomp(socsupport[not.na, 9:19], cor = TRUE)$scores[,
1]
attach(socsupport)
plot(BDI[not.na] ~ ss.pcp1, col = as.numeric(gender[not.na]),
pch = as.numeric(gender[not.na]), xlab = "First principal component",
ylab = "BDI")
topleft <- par()$usr[c(1, 4)]
legend(topleft[1], topleft[2], col = 1:2, pch = 1:2, legend = levels(gender),
cex = 0.7)
detach(socsupport)
if (device != "")
dev.off()
palette(oldpal)
invisible()
}
g13.2 <-
function (device = "")
{
if (device != "")
hardcopy(width = 5, height = 1.8, trellis = TRUE, color = TRUE,
device = device, pointsize = c(6, 3))
dsetnames <- c("nsw-ctl", "nsw-trt", "psid3")
colrs <- c("gray45", "gray45", "black")
lty <- c("solid", "21", "solid")
lwd <- c(1.5, 1.5, 1)
trellis.par.set(axis.components = list(left = list(pad1 = 0.4),
bottom = list(pad1 = 0.5)))
denplot <- function(yvar = "re75", offset = 100, ylim = c(0,
0.52), from = NULL, at = c(0.1, 0.2, 0.3, 0.4, 0.5),
labels = paste(at), bw = "nrd0", ylab = "Density", takelog = TRUE,
col.axis = "black") {
nzre <- unlist(lapply(list(subset(nswdemo, trt == 0),
subset(nswdemo, trt == 1), psid3), function(x) {
z <- x[, yvar]
z[z > 0]
}))
num <- unlist(lapply(list(subset(nswdemo, trt == 0),
subset(nswdemo, trt == 1), psid3), function(x) {
z <- x[, yvar]
sum(z > 0)
}))
xy <- data.frame(nzre = nzre, fac = factor(rep(dsetnames,
num), levels = dsetnames))
if (takelog) {
y <- log(xy$nzre + offset)
xlab <- paste("log(", yvar, "+", offset, ")", sep = "")
}
else {
y <- xy$nzre
xlab <- yvar
}
densityplot(~y, groups = fac, data = xy, bw = bw, from = from,
aspect=1.5,
scales = list(y = list(at = at, labels = labels,
col = col.axis), tck = 0.25), plot.points = FALSE,
par.settings = simpleTheme(col = colrs, lty = lty,
lwd = lwd), ylim = ylim, ylab = ylab, xlab = xlab)
}
print(denplot(), position = c(0.02, 0, 0.32, 0.985))
print(denplot(ylab = " ", yvar = "re78"), position = c(0.24,
0, 0.54, 0.985), newpage = FALSE)
print(denplot(ylab = " ", yvar = "age", takelog = FALSE,
ylim = c(0, 0.08), from = 16, at = c(0.02, 0.04, 0.06,
0.08), labels = c(".02", ".04", ".06", ".08")), position = c(0.46,
0, 0.76, 0.985), newpage = FALSE)
print(denplot(ylab = " ", yvar = "educ", takelog = FALSE,
ylim = c(0, 0.275), bw = 0.5, at = c(0.1, 0.2, 0.3, 0.4)),
position = c(0.68, 0, 0.98, 0.985), newpage = FALSE)
if(!require(grid))return("Package 'grid' must be installed.")
pushViewport(viewport(y = unit(1, "npc"), gp = gpar(cex = 0.65),
width = 0.5, height = unit(1.5, "lines"), just = c("centre",
"top")))
draw.key(key = list(text = list(dsetnames), lines = list(lty = lty,
lwd = lwd, col = colrs), columns = 3, between = 1), draw = TRUE)
popViewport(0)
if (device != "")
dev.off()
}
g13.3 <-
function (device = "", control = psid1, bw = TRUE, plotvalues="Density", ratio.number=TRUE)
{
overlapDensity <-
function (x0, x1, ratio = c(0.05, 20), ratio.number = FALSE,
plotvalues = c("Density", "Numbers"), gpnames = c("Control", "Treatment"),
cutoffs = c(lower = TRUE, upper = TRUE), bw = FALSE,
xlab = "Score", ylab = NULL,
col = 1:2, lty = 1:2, lwd = c(1, 1))
{
if (is.null(plotvalues))
plotvalues <- ""
if (length(plotvalues) > 1)
plotvalues <- plotvalues[1]
if (plotvalues %in% c("Numbers", "Density"))
plotit <- TRUE else plotit <- FALSE
ran <- range(c(x0, x1))
if (all(cutoffs)) {
d0 <- density(x0, from = ran[1], to = ran[2])
d1 <- density(x1, from = ran[1], to = ran[2])
}
else if (cutoffs[1]) {
d0 <- density(x0, from = ran[1])
d1 <- density(x1, from = ran[1])
}
else if (cutoffs[2]) {
d0 <- density(x0, to = ran[2])
d1 <- density(x1, to = ran[2])
}
else {
d0 <- density(x0)
d1 <- density(x1)
}
n0 <- length(x0)
n1 <- length(x1)
f0 <- d0$y
f1 <- d1$y
if (plotvalues %in% "Number") {
pf0 <- d0$y * n0
pf1 <- d1$y * n1
}
else {
pf0 <- d0$y
pf1 <- d1$y
}
if (plotit) {
xlim <- range(c(d0$x, d1$x), na.rm = TRUE)
if (plotvalues == "Number" & is.null(ylab))
ylab <- "Density x total frequency"
else if (plotvalues == "Density" & is.null(ylab))
ylab <- "Density"
ylim <- range(c(0, pf0, pf1))
ylim[2] <- ylim[2] + 0.1 * diff(ylim)
plot(d1$x, pf1, xlim = xlim, xlab = xlab, xaxt = "n",
bty = "n", yaxs = "i", ylim = ylim, ylab = ylab,
main = "", type = "n")
axis(1)
box(bty = "L")
lines(d0$x, pf0, col = col[1], lty = lty[1], lwd = lwd[1])
if (bw & lty[2] > 1)
lines(d1$x, pf1, col = col[2], lty = 1)
lines(d1$x, f1, col = col[2], lty = lty[2], lwd = lwd[2])
xpos <- par()$usr[2]
ypos <- par()$usr[4]
legend(xpos, ypos, lty = lty, lwd = lwd, col = col, cex = 0.8,
legend = gpnames, bty = "n", xjust = 1)
}
if (is.null(ratio))
return()
d0 <- density(x0, from = xlim[1], to = xlim[2])
d1 <- density(x1, from = xlim[1], to = xlim[2])
x01 <- d0$x
f0 <- d0$y
f1 <- d1$y
if (ratio.number) {
cf0 <- f0 * n0
cf1 <- f1 * n1
}
else {
cf0 <- f0
cf1 <- f1
}
cf0[cf0 < 0] <- 0
cf1[cf1 < 0] <- 0
fmin <- pmin(cf0, cf1)
fmax <- max(fmin)
subs <- match(fmax, fmin)
xmid <- x01[subs]
flow <- ratio[1]
fhi <- ratio[2]
lochoose <- x01 < xmid & (cf0 <= flow * cf1 | cf1 <= cf0 *
flow)
if (any(lochoose))
xlim[1] <- max(x01[lochoose])
else xlim[1] <- min(x01)
hichoose <- x01 > xmid & (cf0 >= fhi * cf1 | cf1 >= cf0 *
fhi)
if (any(hichoose))
xlim[2] <- min(x01[hichoose])
else xlim[2] <- max(x01)
if (plotit) {
if(ratio.number)midlab <- "ratio: number" else midlab <- "ratio: p.d."
axis(3, at = xlim, labels = paste(signif(xlim, 4)), mgp = c(3,
0.5, 0), col = "gray45", line = 1, cex.axis = 0.8)
xlim3 <- c(xlim[1], mean(xlim), xlim[2])
axis(3, at = xlim3, line = 1, labels = c(paste("1:", round(1/ratio[1]),
sep = ""), midlab, paste(round(ratio[2]), ":1", sep = "")),
mgp = c(-3, -1, 0), col = "gray", cex.axis = 0.7,
lty = 0)
}
xlim
}
if (device != "")
hardcopy(width = 4, height = 2.4, device = device, trellis = FALSE,
pointsize = c(8, 5))
oldpar <- par(mar = c(4.1, 3.6, 4.6, 1.1), mgp = c(2.25,
0.5, 0), mfrow = c(1, 2))
on.exit(par(oldpar))
if(!require(DAAG))return("Package 'DAAG' must be installed.")
if(!require(randomForest))return("Package 'randomForest' must be installed.")
nsw <- rbind(control, subset(nswdemo, trt == 1))
trt <- nsw$trt
nsw$trt <- factor(trt, labels = c("Control (psid1)", "Treatment"))
nsw$fac74 <- factor(nsw$re74 > 0, exclude = NULL)
table(nsw$fac74)
levels(nsw$fac74) <- c("0", "gt0", "<NA>")
set.seed(41)
nsw.rf <- randomForest(trt ~ ., data = nsw[, -c(7, 8, 10)],
sampsize = c(297, 297))
if(!require(MASS))return("Package 'MASS' must be installed.")
if(!require(splines))return("Package 'splines' must be installed.")
nsw.lda <- lda(trt ~ ns(age, 2) + ns(educ, 2) + black + hisp +
fac74 + ns(log(re75 + 100), 3), CV = TRUE, prior = c(0.5,
0.5), data = nsw)
logit <- function(p, offset = 0.001) log((p + offset)/(1 +
offset - p))
pred.rf <- logit(predict(nsw.rf, type = "prob")[, 2])
xlim <- overlapDensity(pred.rf[trt == 0], pred.rf[trt ==
1], ratio = c(1/20, 50), col = c("black", "gray45"),
lty = c(2, 1), lwd = c(1, 2), ratio.number = ratio.number,
plotvalues = plotvalues)
print("rf cutoff at -1.5")
print(table(nsw$trt, pred.rf > -1.5))
mtext(side = 3, line = 3, adj = -0.05, "A: Scores from randomForest()")
pred.lda <- logit(nsw.lda$posterior[, 2])
xlim <- overlapDensity(pred.lda[trt == 0], pred.lda[trt ==
1], ratio = c(1/20, 50), ylab = "", col = c("black",
"gray45"), lty = c(2, 1), lwd = c(1, 2), ratio.number = ratio.number,
plotvalues = plotvalues)
print("lda cutoff at -4")
print(table(nsw$trt, pred.lda > -4))
mtext(side = 3, line = 3, adj = -0.05, "B: Scores from lda()")
if (device != "")
dev.off()
}
g13.4 <-
function (device = "", hat = pred.rf, bw = TRUE, offset = 30)
{
if(!require(splines))return("Package 'splines' must be installed.")
if (device != "")
hardcopy(width = 5.25, height = 5, device = device, trellis = TRUE,
pointsize = c(8, 4))
if(!require(DAAG))return("Package 'DAAG' must be installed.")
if(!require(randomForest))return("Package 'randomForest' must be installed.")
nsw <- rbind(psid1, subset(nswdemo, trt == 1))
nsw$fac74 <- factor(nsw$re74 > 0, exclude = NULL)
table(nsw$fac74)
levels(nsw$fac74) <- c("0", "gt0", "<NA>")
set.seed(41)
nsw.rf <- randomForest(as.factor(trt) ~ ., data = nsw[, -c(7:8,
10)], sampsize = c(297, 297))
logit <- function(p, offset = 0.001) log((p + offset)/(1 +
offset - p))
trt <- nsw$trt
lprob.rf <- logit(predict(nsw.rf, type = "prob")[, 2])
nsw.lda <- lda(trt ~ ns(age, 2) + ns(educ, 2) + black + hisp +
fac74 + ns(log(re75 + 100), 2), prior = c(0.5, 0.5),
data = nsw)
lprob.lda <- logit(predict(nsw.lda)$posterior[, 2])
print(table(nsw$trt[lprob.lda > -4]))
nswa <- nsw[lprob.rf > -1.5, ]
nswb <- nsw[lprob.lda > -4, ]
print(table(nswa$trt))
nswa.rf <- randomForest(as.factor(trt) ~ ., data = nswa[,
-c(7:8, 10)])
lproba.rf <- logit(predict(nswa.rf, type = "prob")[, 2])
nswb.lda <- lda(trt ~ ns(age, 2) + ns(educ, 2) + black +
hisp + fac74 + ns(log(re75 + 100), 3), prior = c(0.5,
0.5), data = nswb)
lprobb.lda <- logit(predict(nswb.lda)$posterior[, 2])
gph.key <- list(space = "top", columns = 2, points = list(col = c("black",
"gray"), pch = c(1, 3), cex = 0.3), lines = list(lty = c("21",
"solid"), lwd = c(1.5, 1.5), col = c("black", "gray")),
text = list(c("psid1 controls", "experimental treatment")),
padding.text = 1)
gph0 <- xyplot(1 ~ 1, panel = function(x, y, ...) {
}, xlab = "", ylab = "", scales = list(draw = F), key = gph.key,
par.settings = list(axis.line = list(col = "transparent")))
gph1 <- xyplot(age + educ + log(re75 + offset) ~ lproba.rf,
groups = trt, layout = c(3, 1), data = nswa, type = c("p",
"smooth"), span = 0.4, aspect = 1, par.settings = simpleTheme(lwd = c(1.5,
1.5), col = c("black", "gray"), alpha.points = c(0.5,
1), pch = c(1, 3), cex = 0.3, lty = c("21", "solid")),
scales = list(y = list(relation = "free"), tck = 0.5),
xlab = "", par.strip.text = list(cex = 0.9), main = textGrob("A: Random forest scores (filtered data)",
x = unit(0.075, "npc"), y = unit(0.25, "npc"), just = "left",
gp = gpar(cex = 1)))
gph2 <- xyplot(age + educ + log(re75 + offset) ~ lprobb.lda,
groups = trt, layout = c(3, 1), data = nswb, type = c("p",
"smooth"), span = 0.4, aspect = 1, par.settings = simpleTheme(lwd = c(1.5,
1.5), col = c("black", "gray"), pch = c(1, 3), cex = 0.3,
alpha.points = c(0.5, 1), lty = c("21", "solid")),
scales = list(y = list(relation = "free"), tck = 0.5),
xlab = "Score", par.strip.text = list(cex = 0.9), main = textGrob("B: Linear discriminant analysis scores (filtered data)",
x = unit(0.075, "npc"), y = unit(0, "npc"), just = "left",
gp = gpar(cex = 1)))
print(gph0, position = c(0, 0.475, 1, 1))
print(gph1, position = c(0, 0.45, 1, 0.925), newpage = FALSE)
print(gph2, position = c(0, 0, 1, 0.475), newpage = FALSE)
if (device != "")
dev.off()
invisible()
}
g13.5 <-
function (device = "", bw = TRUE)
{
logit <- function(p, offset=0.001)log((p+offset)/(1+offset-p))
if(!require(splines))return("Package 'splines' must be installed.")
if(!require(lattice))return("Package 'lattice' must be installed.")
if (device != "")
hardcopy(width = 5.5, height = 2.75, device = device,
trellis = TRUE, pointsize = c(8, 4))
if(!require(DAAG))return("Package 'DAAG' must be installed.")
if(!require(randomForest))return("Package 'randomForest' must be installed.")
nsw <- rbind(psid1, subset(nswdemo, trt == 1))
nsw$trt <- factor(nsw$trt, labels = c("Control (psid1)",
"Treated (experimental)"))
nsw$fac74 <- factor(nsw$re74 > 0, exclude = NULL)
table(nsw$fac74)
levels(nsw$fac74) <- c("0", "gt0", "<NA>")
set.seed(41)
nsw.rf <- randomForest(trt ~ ., data = nsw[, -c(7:8, 10)],
sampsize = c(297, 297))
lprob.rf <- logit(predict(nsw.rf, type = "prob")[, 2])
nswa <- nsw[lprob.rf > -1.5, ]
nswa.rf <- randomForest(trt ~ ., data = nswa[, -c(7:8, 10)],
proximity = TRUE)
lproba.rf <- predict(nswa.rf, type = "prob")[, 2]
dmat <- 1 - nswa.rf$proximity
dmat[dmat == 0] <- 0.001
pts <- cmdscale(dmat)
rf.mds <- isoMDS(dmat, pts)
print(summary(rf.mds))
isoScores <- rf.mds$points
cutpts <- c(0, round(quantile(lproba.rf, c(1/3, 2/3)), 2),
1)
cutp <- cut(lproba.rf, breaks = cutpts, include.lowest = TRUE)
gph1 <- xyplot(isoScores[, 2] ~ isoScores[, 1] | cutp, groups = nswa$trt,
aspect = 1, scale = list(tck = 0.6), xlab = "Co-ordinate 1",
ylab = "Co-ordinate 2", auto.key = list(columns = 2),
par.settings = simpleTheme(col = c("black", "gray"),
pch = c(1, 3)))
print(table(cutp))
print(update(gph1, layout = c(3, 1)))
if (device != "")
dev.off()
invisible(list(cmd = pts, iso = rf.mds$points, trt = nswa$trt,
pred = lproba.rf))
}
pkgs <- c("DAAG", "lattice", "grid", "MASS", "randomForest", "splines")
z <- sapply(pkgs, require, character.only=TRUE, warn.conflicts=FALSE, quietly=TRUE)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
if(any(!z)){
notAvail <- paste(names(z)[!z], collapse=", ")
warning(paste("The following packages should be installed:", notAvail))
}
g13.1()
g13.2()
g13.3()
## [1] "rf cutoff at -1.5"
##
## FALSE TRUE
## Control (psid1) 2175 315
## Treatment 8 289
## [1] "lda cutoff at -4"
##
## FALSE TRUE
## Control (psid1) 2161 329
## Treatment 16 281
g13.4()
##
## 0 1
## 340 272
##
## 0 1
## 315 289
## Warning: variables are collinear
g13.5()
## initial value 51.466491
## iter 5 value 30.996301
## iter 10 value 29.282968
## iter 15 value 27.753379
## iter 20 value 27.100097
## final value 27.035117
## converged
## Length Class Mode
## points 1208 -none- numeric
## stress 1 -none- numeric
## cutp
## [0,0.11] (0.11,0.83] (0.83,1]
## 202 201 201