#' --- #' title: "Code for 'Data Analysis And Graphics Using R', 3rd edn, CUP, 2010" #' author: "John Maindonald and John Braun" #' date: "Oct 3, 2014" #' --- #' Chapter 13: Regression on Principal Component or Discriminant Scores #' Packages required: "DAAG", "lattice", "grid", "MASS", "randomForest", "splines" #' #' The script that follows is designed to be executed as it stands. It sets up #' functions that create Ch 13 graphs, then runs those functions. If executed by #' clicking on the RStudio "Compile Notebook ..." command, it will be processed #' through R Markdown, generating a document that includes code, output and #' graphs generated by executing the functions. If some needed packages are #' missing, warning messages will appear. See further: #' http://rmarkdown.rstudio.com/r_notebook_format.html g13.1 <- function (device = "") { oldpal <- palette(gray(c(0, 0.6))) if (device != "") hardcopy(width = 2.75, height = 2.75, device = device) oldpar <- par(mar = par()$mar + c(-0.5, 0, -2, 0)) on.exit(par(oldpar)) require(DAAG) not.na <- apply(socsupport[, 9:19], 1, function(x) !any(is.na(x))) not.na[36] <- FALSE ss.pcp1 <- princomp(socsupport[not.na, 9:19], cor = TRUE)$scores[, 1] attach(socsupport) plot(BDI[not.na] ~ ss.pcp1, col = as.numeric(gender[not.na]), pch = as.numeric(gender[not.na]), xlab = "First principal component", ylab = "BDI") topleft <- par()$usr[c(1, 4)] legend(topleft[1], topleft[2], col = 1:2, pch = 1:2, legend = levels(gender), cex = 0.7) detach(socsupport) if (device != "") dev.off() palette(oldpal) invisible() } g13.2 <- function (device = "") { if (device != "") hardcopy(width = 5, height = 1.8, trellis = TRUE, color = TRUE, device = device, pointsize = c(6, 3)) dsetnames <- c("nsw-ctl", "nsw-trt", "psid3") colrs <- c("gray45", "gray45", "black") lty <- c("solid", "21", "solid") lwd <- c(1.5, 1.5, 1) trellis.par.set(axis.components = list(left = list(pad1 = 0.4), bottom = list(pad1 = 0.5))) denplot <- function(yvar = "re75", offset = 100, ylim = c(0, 0.52), from = NULL, at = c(0.1, 0.2, 0.3, 0.4, 0.5), labels = paste(at), bw = "nrd0", ylab = "Density", takelog = TRUE, col.axis = "black") { nzre <- unlist(lapply(list(subset(nswdemo, trt == 0), subset(nswdemo, trt == 1), psid3), function(x) { z <- x[, yvar] z[z > 0] })) num <- unlist(lapply(list(subset(nswdemo, trt == 0), subset(nswdemo, trt == 1), psid3), function(x) { z <- x[, yvar] sum(z > 0) })) xy <- data.frame(nzre = nzre, fac = factor(rep(dsetnames, num), levels = dsetnames)) if (takelog) { y <- log(xy$nzre + offset) xlab <- paste("log(", yvar, "+", offset, ")", sep = "") } else { y <- xy$nzre xlab <- yvar } densityplot(~y, groups = fac, data = xy, bw = bw, from = from, aspect=1.5, scales = list(y = list(at = at, labels = labels, col = col.axis), tck = 0.25), plot.points = FALSE, par.settings = simpleTheme(col = colrs, lty = lty, lwd = lwd), ylim = ylim, ylab = ylab, xlab = xlab) } print(denplot(), position = c(0.02, 0, 0.32, 0.985)) print(denplot(ylab = " ", yvar = "re78"), position = c(0.24, 0, 0.54, 0.985), newpage = FALSE) print(denplot(ylab = " ", yvar = "age", takelog = FALSE, ylim = c(0, 0.08), from = 16, at = c(0.02, 0.04, 0.06, 0.08), labels = c(".02", ".04", ".06", ".08")), position = c(0.46, 0, 0.76, 0.985), newpage = FALSE) print(denplot(ylab = " ", yvar = "educ", takelog = FALSE, ylim = c(0, 0.275), bw = 0.5, at = c(0.1, 0.2, 0.3, 0.4)), position = c(0.68, 0, 0.98, 0.985), newpage = FALSE) if(!require(grid))return("Package 'grid' must be installed.") pushViewport(viewport(y = unit(1, "npc"), gp = gpar(cex = 0.65), width = 0.5, height = unit(1.5, "lines"), just = c("centre", "top"))) draw.key(key = list(text = list(dsetnames), lines = list(lty = lty, lwd = lwd, col = colrs), columns = 3, between = 1), draw = TRUE) popViewport(0) if (device != "") dev.off() } g13.3 <- function (device = "", control = psid1, bw = TRUE, plotvalues="Density", ratio.number=TRUE) { overlapDensity <- function (x0, x1, ratio = c(0.05, 20), ratio.number = FALSE, plotvalues = c("Density", "Numbers"), gpnames = c("Control", "Treatment"), cutoffs = c(lower = TRUE, upper = TRUE), bw = FALSE, xlab = "Score", ylab = NULL, col = 1:2, lty = 1:2, lwd = c(1, 1)) { if (is.null(plotvalues)) plotvalues <- "" if (length(plotvalues) > 1) plotvalues <- plotvalues[1] if (plotvalues %in% c("Numbers", "Density")) plotit <- TRUE else plotit <- FALSE ran <- range(c(x0, x1)) if (all(cutoffs)) { d0 <- density(x0, from = ran[1], to = ran[2]) d1 <- density(x1, from = ran[1], to = ran[2]) } else if (cutoffs[1]) { d0 <- density(x0, from = ran[1]) d1 <- density(x1, from = ran[1]) } else if (cutoffs[2]) { d0 <- density(x0, to = ran[2]) d1 <- density(x1, to = ran[2]) } else { d0 <- density(x0) d1 <- density(x1) } n0 <- length(x0) n1 <- length(x1) f0 <- d0$y f1 <- d1$y if (plotvalues %in% "Number") { pf0 <- d0$y * n0 pf1 <- d1$y * n1 } else { pf0 <- d0$y pf1 <- d1$y } if (plotit) { xlim <- range(c(d0$x, d1$x), na.rm = TRUE) if (plotvalues == "Number" & is.null(ylab)) ylab <- "Density x total frequency" else if (plotvalues == "Density" & is.null(ylab)) ylab <- "Density" ylim <- range(c(0, pf0, pf1)) ylim[2] <- ylim[2] + 0.1 * diff(ylim) plot(d1$x, pf1, xlim = xlim, xlab = xlab, xaxt = "n", bty = "n", yaxs = "i", ylim = ylim, ylab = ylab, main = "", type = "n") axis(1) box(bty = "L") lines(d0$x, pf0, col = col[1], lty = lty[1], lwd = lwd[1]) if (bw & lty[2] > 1) lines(d1$x, pf1, col = col[2], lty = 1) lines(d1$x, f1, col = col[2], lty = lty[2], lwd = lwd[2]) xpos <- par()$usr[2] ypos <- par()$usr[4] legend(xpos, ypos, lty = lty, lwd = lwd, col = col, cex = 0.8, legend = gpnames, bty = "n", xjust = 1) } if (is.null(ratio)) return() d0 <- density(x0, from = xlim[1], to = xlim[2]) d1 <- density(x1, from = xlim[1], to = xlim[2]) x01 <- d0$x f0 <- d0$y f1 <- d1$y if (ratio.number) { cf0 <- f0 * n0 cf1 <- f1 * n1 } else { cf0 <- f0 cf1 <- f1 } cf0[cf0 < 0] <- 0 cf1[cf1 < 0] <- 0 fmin <- pmin(cf0, cf1) fmax <- max(fmin) subs <- match(fmax, fmin) xmid <- x01[subs] flow <- ratio[1] fhi <- ratio[2] lochoose <- x01 < xmid & (cf0 <= flow * cf1 | cf1 <= cf0 * flow) if (any(lochoose)) xlim[1] <- max(x01[lochoose]) else xlim[1] <- min(x01) hichoose <- x01 > xmid & (cf0 >= fhi * cf1 | cf1 >= cf0 * fhi) if (any(hichoose)) xlim[2] <- min(x01[hichoose]) else xlim[2] <- max(x01) if (plotit) { if(ratio.number)midlab <- "ratio: number" else midlab <- "ratio: p.d." axis(3, at = xlim, labels = paste(signif(xlim, 4)), mgp = c(3, 0.5, 0), col = "gray45", line = 1, cex.axis = 0.8) xlim3 <- c(xlim[1], mean(xlim), xlim[2]) axis(3, at = xlim3, line = 1, labels = c(paste("1:", round(1/ratio[1]), sep = ""), midlab, paste(round(ratio[2]), ":1", sep = "")), mgp = c(-3, -1, 0), col = "gray", cex.axis = 0.7, lty = 0) } xlim } if (device != "") hardcopy(width = 4, height = 2.4, device = device, trellis = FALSE, pointsize = c(8, 5)) oldpar <- par(mar = c(4.1, 3.6, 4.6, 1.1), mgp = c(2.25, 0.5, 0), mfrow = c(1, 2)) on.exit(par(oldpar)) if(!require(DAAG))return("Package 'DAAG' must be installed.") if(!require(randomForest))return("Package 'randomForest' must be installed.") nsw <- rbind(control, subset(nswdemo, trt == 1)) trt <- nsw$trt nsw$trt <- factor(trt, labels = c("Control (psid1)", "Treatment")) nsw$fac74 <- factor(nsw$re74 > 0, exclude = NULL) table(nsw$fac74) levels(nsw$fac74) <- c("0", "gt0", "") set.seed(41) nsw.rf <- randomForest(trt ~ ., data = nsw[, -c(7, 8, 10)], sampsize = c(297, 297)) if(!require(MASS))return("Package 'MASS' must be installed.") if(!require(splines))return("Package 'splines' must be installed.") nsw.lda <- lda(trt ~ ns(age, 2) + ns(educ, 2) + black + hisp + fac74 + ns(log(re75 + 100), 3), CV = TRUE, prior = c(0.5, 0.5), data = nsw) logit <- function(p, offset = 0.001) log((p + offset)/(1 + offset - p)) pred.rf <- logit(predict(nsw.rf, type = "prob")[, 2]) xlim <- overlapDensity(pred.rf[trt == 0], pred.rf[trt == 1], ratio = c(1/20, 50), col = c("black", "gray45"), lty = c(2, 1), lwd = c(1, 2), ratio.number = ratio.number, plotvalues = plotvalues) print("rf cutoff at -1.5") print(table(nsw$trt, pred.rf > -1.5)) mtext(side = 3, line = 3, adj = -0.05, "A: Scores from randomForest()") pred.lda <- logit(nsw.lda$posterior[, 2]) xlim <- overlapDensity(pred.lda[trt == 0], pred.lda[trt == 1], ratio = c(1/20, 50), ylab = "", col = c("black", "gray45"), lty = c(2, 1), lwd = c(1, 2), ratio.number = ratio.number, plotvalues = plotvalues) print("lda cutoff at -4") print(table(nsw$trt, pred.lda > -4)) mtext(side = 3, line = 3, adj = -0.05, "B: Scores from lda()") if (device != "") dev.off() } g13.4 <- function (device = "", hat = pred.rf, bw = TRUE, offset = 30) { if(!require(splines))return("Package 'splines' must be installed.") if (device != "") hardcopy(width = 5.25, height = 5, device = device, trellis = TRUE, pointsize = c(8, 4)) if(!require(DAAG))return("Package 'DAAG' must be installed.") if(!require(randomForest))return("Package 'randomForest' must be installed.") nsw <- rbind(psid1, subset(nswdemo, trt == 1)) nsw$fac74 <- factor(nsw$re74 > 0, exclude = NULL) table(nsw$fac74) levels(nsw$fac74) <- c("0", "gt0", "") set.seed(41) nsw.rf <- randomForest(as.factor(trt) ~ ., data = nsw[, -c(7:8, 10)], sampsize = c(297, 297)) logit <- function(p, offset = 0.001) log((p + offset)/(1 + offset - p)) trt <- nsw$trt lprob.rf <- logit(predict(nsw.rf, type = "prob")[, 2]) nsw.lda <- lda(trt ~ ns(age, 2) + ns(educ, 2) + black + hisp + fac74 + ns(log(re75 + 100), 2), prior = c(0.5, 0.5), data = nsw) lprob.lda <- logit(predict(nsw.lda)$posterior[, 2]) print(table(nsw$trt[lprob.lda > -4])) nswa <- nsw[lprob.rf > -1.5, ] nswb <- nsw[lprob.lda > -4, ] print(table(nswa$trt)) nswa.rf <- randomForest(as.factor(trt) ~ ., data = nswa[, -c(7:8, 10)]) lproba.rf <- logit(predict(nswa.rf, type = "prob")[, 2]) nswb.lda <- lda(trt ~ ns(age, 2) + ns(educ, 2) + black + hisp + fac74 + ns(log(re75 + 100), 3), prior = c(0.5, 0.5), data = nswb) lprobb.lda <- logit(predict(nswb.lda)$posterior[, 2]) gph.key <- list(space = "top", columns = 2, points = list(col = c("black", "gray"), pch = c(1, 3), cex = 0.3), lines = list(lty = c("21", "solid"), lwd = c(1.5, 1.5), col = c("black", "gray")), text = list(c("psid1 controls", "experimental treatment")), padding.text = 1) gph0 <- xyplot(1 ~ 1, panel = function(x, y, ...) { }, xlab = "", ylab = "", scales = list(draw = F), key = gph.key, par.settings = list(axis.line = list(col = "transparent"))) gph1 <- xyplot(age + educ + log(re75 + offset) ~ lproba.rf, groups = trt, layout = c(3, 1), data = nswa, type = c("p", "smooth"), span = 0.4, aspect = 1, par.settings = simpleTheme(lwd = c(1.5, 1.5), col = c("black", "gray"), alpha.points = c(0.5, 1), pch = c(1, 3), cex = 0.3, lty = c("21", "solid")), scales = list(y = list(relation = "free"), tck = 0.5), xlab = "", par.strip.text = list(cex = 0.9), main = textGrob("A: Random forest scores (filtered data)", x = unit(0.075, "npc"), y = unit(0.25, "npc"), just = "left", gp = gpar(cex = 1))) gph2 <- xyplot(age + educ + log(re75 + offset) ~ lprobb.lda, groups = trt, layout = c(3, 1), data = nswb, type = c("p", "smooth"), span = 0.4, aspect = 1, par.settings = simpleTheme(lwd = c(1.5, 1.5), col = c("black", "gray"), pch = c(1, 3), cex = 0.3, alpha.points = c(0.5, 1), lty = c("21", "solid")), scales = list(y = list(relation = "free"), tck = 0.5), xlab = "Score", par.strip.text = list(cex = 0.9), main = textGrob("B: Linear discriminant analysis scores (filtered data)", x = unit(0.075, "npc"), y = unit(0, "npc"), just = "left", gp = gpar(cex = 1))) print(gph0, position = c(0, 0.475, 1, 1)) print(gph1, position = c(0, 0.45, 1, 0.925), newpage = FALSE) print(gph2, position = c(0, 0, 1, 0.475), newpage = FALSE) if (device != "") dev.off() invisible() } g13.5 <- function (device = "", bw = TRUE) { logit <- function(p, offset=0.001)log((p+offset)/(1+offset-p)) if(!require(splines))return("Package 'splines' must be installed.") if(!require(lattice))return("Package 'lattice' must be installed.") if (device != "") hardcopy(width = 5.5, height = 2.75, device = device, trellis = TRUE, pointsize = c(8, 4)) if(!require(DAAG))return("Package 'DAAG' must be installed.") if(!require(randomForest))return("Package 'randomForest' must be installed.") nsw <- rbind(psid1, subset(nswdemo, trt == 1)) nsw$trt <- factor(nsw$trt, labels = c("Control (psid1)", "Treated (experimental)")) nsw$fac74 <- factor(nsw$re74 > 0, exclude = NULL) table(nsw$fac74) levels(nsw$fac74) <- c("0", "gt0", "") set.seed(41) nsw.rf <- randomForest(trt ~ ., data = nsw[, -c(7:8, 10)], sampsize = c(297, 297)) lprob.rf <- logit(predict(nsw.rf, type = "prob")[, 2]) nswa <- nsw[lprob.rf > -1.5, ] nswa.rf <- randomForest(trt ~ ., data = nswa[, -c(7:8, 10)], proximity = TRUE) lproba.rf <- predict(nswa.rf, type = "prob")[, 2] dmat <- 1 - nswa.rf$proximity dmat[dmat == 0] <- 0.001 pts <- cmdscale(dmat) rf.mds <- isoMDS(dmat, pts) print(summary(rf.mds)) isoScores <- rf.mds$points cutpts <- c(0, round(quantile(lproba.rf, c(1/3, 2/3)), 2), 1) cutp <- cut(lproba.rf, breaks = cutpts, include.lowest = TRUE) gph1 <- xyplot(isoScores[, 2] ~ isoScores[, 1] | cutp, groups = nswa$trt, aspect = 1, scale = list(tck = 0.6), xlab = "Co-ordinate 1", ylab = "Co-ordinate 2", auto.key = list(columns = 2), par.settings = simpleTheme(col = c("black", "gray"), pch = c(1, 3))) print(table(cutp)) print(update(gph1, layout = c(3, 1))) if (device != "") dev.off() invisible(list(cmd = pts, iso = rf.mds$points, trt = nswa$trt, pred = lproba.rf)) } pkgs <- c("DAAG", "lattice", "grid", "MASS", "randomForest", "splines") z <- sapply(pkgs, require, character.only=TRUE, warn.conflicts=FALSE, quietly=TRUE) if(any(!z)){ notAvail <- paste(names(z)[!z], collapse=", ") warning(paste("The following packages should be installed:", notAvail)) } g13.1() g13.2() g13.3() #+ fig.width=7, fig.height=8 g13.4() #+ g13.5()