Chapter 13: Regression on Principal Component or Discriminant Scores
Packages required: “DAAG”, “lattice”, “grid”, “MASS”, “randomForest”, “splines”

The script that follows is designed to be executed as it stands. It sets up
functions that create Ch 13 graphs, then runs those functions. If executed by
clicking on the RStudio “Compile Notebook …” command, it will be processed
through R Markdown, generating a document that includes code, output and
graphs generated by executing the functions. If some needed packages are
missing, warning messages will appear. See further:
http://rmarkdown.rstudio.com/r_notebook_format.html

g13.1 <- 
function (device = "") 
{
    oldpal <- palette(gray(c(0, 0.6)))
    if (device != "") 
        hardcopy(width = 2.75, height = 2.75, device = device)
    oldpar <- par(mar = par()$mar + c(-0.5, 0, -2, 0))
    on.exit(par(oldpar))
    require(DAAG)
    not.na <- apply(socsupport[, 9:19], 1, function(x) !any(is.na(x)))
    not.na[36] <- FALSE
    ss.pcp1 <- princomp(socsupport[not.na, 9:19], cor = TRUE)$scores[, 
        1]
    attach(socsupport)
    plot(BDI[not.na] ~ ss.pcp1, col = as.numeric(gender[not.na]), 
        pch = as.numeric(gender[not.na]), xlab = "First principal component", 
        ylab = "BDI")
    topleft <- par()$usr[c(1, 4)]
    legend(topleft[1], topleft[2], col = 1:2, pch = 1:2, legend = levels(gender), 
        cex = 0.7)
    detach(socsupport)
    if (device != "") 
        dev.off()
    palette(oldpal)
    invisible()
}

g13.2 <-
function (device = "") 
{
    if (device != "") 
        hardcopy(width = 5, height = 1.8, trellis = TRUE, color = TRUE, 
            device = device, pointsize = c(6, 3))
    dsetnames <- c("nsw-ctl", "nsw-trt", "psid3")
    colrs <- c("gray45", "gray45", "black")
    lty <- c("solid", "21", "solid")
    lwd <- c(1.5, 1.5, 1)
    trellis.par.set(axis.components = list(left = list(pad1 = 0.4), 
        bottom = list(pad1 = 0.5)))
    denplot <- function(yvar = "re75", offset = 100, ylim = c(0, 
        0.52), from = NULL, at = c(0.1, 0.2, 0.3, 0.4, 0.5), 
        labels = paste(at), bw = "nrd0", ylab = "Density", takelog = TRUE, 
        col.axis = "black") {
        nzre <- unlist(lapply(list(subset(nswdemo, trt == 0), 
            subset(nswdemo, trt == 1), psid3), function(x) {
            z <- x[, yvar]
            z[z > 0]
        }))
        num <- unlist(lapply(list(subset(nswdemo, trt == 0), 
            subset(nswdemo, trt == 1), psid3), function(x) {
            z <- x[, yvar]
            sum(z > 0)
        }))
        xy <- data.frame(nzre = nzre, fac = factor(rep(dsetnames, 
            num), levels = dsetnames))
        if (takelog) {
            y <- log(xy$nzre + offset)
            xlab <- paste("log(", yvar, "+", offset, ")", sep = "")
        }
        else {
            y <- xy$nzre
            xlab <- yvar
        }
        densityplot(~y, groups = fac, data = xy, bw = bw, from = from, 
                    aspect=1.5,
            scales = list(y = list(at = at, labels = labels, 
                col = col.axis), tck = 0.25), plot.points = FALSE, 
            par.settings = simpleTheme(col = colrs, lty = lty, 
                lwd = lwd), ylim = ylim, ylab = ylab, xlab = xlab)
    }
    print(denplot(), position = c(0.02, 0, 0.32, 0.985))
    print(denplot(ylab = "  ", yvar = "re78"), position = c(0.24, 
        0, 0.54, 0.985), newpage = FALSE)
    print(denplot(ylab = "  ", yvar = "age", takelog = FALSE, 
        ylim = c(0, 0.08), from = 16, at = c(0.02, 0.04, 0.06, 
            0.08), labels = c(".02", ".04", ".06", ".08")), position = c(0.46, 
        0, 0.76, 0.985), newpage = FALSE)
    print(denplot(ylab = "  ", yvar = "educ", takelog = FALSE, 
        ylim = c(0, 0.275), bw = 0.5, at = c(0.1, 0.2, 0.3, 0.4)), 
        position = c(0.68, 0, 0.98, 0.985), newpage = FALSE)
    if(!require(grid))return("Package 'grid' must be installed.")    
    pushViewport(viewport(y = unit(1, "npc"), gp = gpar(cex = 0.65), 
        width = 0.5, height = unit(1.5, "lines"), just = c("centre", 
            "top")))
    draw.key(key = list(text = list(dsetnames), lines = list(lty = lty, 
        lwd = lwd, col = colrs), columns = 3, between = 1), draw = TRUE)
    popViewport(0)
    if (device != "") 
        dev.off()
}

g13.3 <-
function (device = "", control = psid1, bw = TRUE, plotvalues="Density", ratio.number=TRUE) 
{
  overlapDensity <-
function (x0, x1, ratio = c(0.05, 20), ratio.number = FALSE, 
          plotvalues = c("Density", "Numbers"), gpnames = c("Control", "Treatment"), 
          cutoffs = c(lower = TRUE, upper = TRUE), bw = FALSE, 
          xlab = "Score", ylab = NULL, 
          col = 1:2, lty = 1:2, lwd = c(1, 1)) 
{
    if (is.null(plotvalues)) 
        plotvalues <- ""
    if (length(plotvalues) > 1) 
        plotvalues <- plotvalues[1]
    if (plotvalues %in% c("Numbers", "Density")) 
        plotit <- TRUE else plotit <- FALSE
    ran <- range(c(x0, x1))
    if (all(cutoffs)) {
        d0 <- density(x0, from = ran[1], to = ran[2])
        d1 <- density(x1, from = ran[1], to = ran[2])
    }
    else if (cutoffs[1]) {
        d0 <- density(x0, from = ran[1])
        d1 <- density(x1, from = ran[1])
    }
    else if (cutoffs[2]) {
        d0 <- density(x0, to = ran[2])
        d1 <- density(x1, to = ran[2])
    }
    else {
        d0 <- density(x0)
        d1 <- density(x1)
    }
    n0 <- length(x0)
    n1 <- length(x1)
    f0 <- d0$y
    f1 <- d1$y
    if (plotvalues %in% "Number") {
        pf0 <- d0$y * n0
        pf1 <- d1$y * n1
    }
    else {
        pf0 <- d0$y
        pf1 <- d1$y
    }
    if (plotit) {
        xlim <- range(c(d0$x, d1$x), na.rm = TRUE)
        if (plotvalues == "Number" & is.null(ylab)) 
            ylab <- "Density x total frequency"
        else if (plotvalues == "Density" & is.null(ylab)) 
            ylab <- "Density"
        ylim <- range(c(0, pf0, pf1))
        ylim[2] <- ylim[2] + 0.1 * diff(ylim)
        plot(d1$x, pf1, xlim = xlim, xlab = xlab, xaxt = "n", 
            bty = "n", yaxs = "i", ylim = ylim, ylab = ylab, 
            main = "", type = "n")
        axis(1)
        box(bty = "L")
        lines(d0$x, pf0, col = col[1], lty = lty[1], lwd = lwd[1])
        if (bw & lty[2] > 1) 
            lines(d1$x, pf1, col = col[2], lty = 1)
        lines(d1$x, f1, col = col[2], lty = lty[2], lwd = lwd[2])
        xpos <- par()$usr[2]
        ypos <- par()$usr[4]
        legend(xpos, ypos, lty = lty, lwd = lwd, col = col, cex = 0.8, 
            legend = gpnames, bty = "n", xjust = 1)
    }
    if (is.null(ratio)) 
        return()
    d0 <- density(x0, from = xlim[1], to = xlim[2])
    d1 <- density(x1, from = xlim[1], to = xlim[2])
    x01 <- d0$x
    f0 <- d0$y
    f1 <- d1$y
    if (ratio.number) {
        cf0 <- f0 * n0
        cf1 <- f1 * n1
    }
    else {
        cf0 <- f0
        cf1 <- f1
    }
    cf0[cf0 < 0] <- 0
    cf1[cf1 < 0] <- 0
    fmin <- pmin(cf0, cf1)
    fmax <- max(fmin)
    subs <- match(fmax, fmin)
    xmid <- x01[subs]
    flow <- ratio[1]
    fhi <- ratio[2]
    lochoose <- x01 < xmid & (cf0 <= flow * cf1 | cf1 <= cf0 * 
        flow)
    if (any(lochoose)) 
        xlim[1] <- max(x01[lochoose])
    else xlim[1] <- min(x01)
    hichoose <- x01 > xmid & (cf0 >= fhi * cf1 | cf1 >= cf0 * 
        fhi)
    if (any(hichoose)) 
        xlim[2] <- min(x01[hichoose])
    else xlim[2] <- max(x01)
    if (plotit) {
        if(ratio.number)midlab <- "ratio: number" else midlab <- "ratio: p.d."
        axis(3, at = xlim, labels = paste(signif(xlim, 4)), mgp = c(3, 
            0.5, 0), col = "gray45", line = 1, cex.axis = 0.8)
        xlim3 <- c(xlim[1], mean(xlim), xlim[2])
        axis(3, at = xlim3, line = 1, labels = c(paste("1:", round(1/ratio[1]), 
            sep = ""), midlab, paste(round(ratio[2]), ":1", sep = "")), 
            mgp = c(-3, -1, 0), col = "gray", cex.axis = 0.7, 
            lty = 0)
    }
    xlim
}
    if (device != "") 
        hardcopy(width = 4, height = 2.4, device = device, trellis = FALSE, 
            pointsize = c(8, 5))
    oldpar <- par(mar = c(4.1, 3.6, 4.6, 1.1), mgp = c(2.25, 
        0.5, 0), mfrow = c(1, 2))
    on.exit(par(oldpar))
    if(!require(DAAG))return("Package 'DAAG' must be installed.")  
    if(!require(randomForest))return("Package 'randomForest' must be installed.")  
    nsw <- rbind(control, subset(nswdemo, trt == 1))
    trt <- nsw$trt
    nsw$trt <- factor(trt, labels = c("Control (psid1)", "Treatment"))
    nsw$fac74 <- factor(nsw$re74 > 0, exclude = NULL)
    table(nsw$fac74)
    levels(nsw$fac74) <- c("0", "gt0", "<NA>")
    set.seed(41)
    nsw.rf <- randomForest(trt ~ ., data = nsw[, -c(7, 8, 10)], 
        sampsize = c(297, 297))
    if(!require(MASS))return("Package 'MASS' must be installed.") 
    if(!require(splines))return("Package 'splines' must be installed.")           
    nsw.lda <- lda(trt ~ ns(age, 2) + ns(educ, 2) + black + hisp + 
        fac74 + ns(log(re75 + 100), 3), CV = TRUE, prior = c(0.5, 
        0.5), data = nsw)
    logit <- function(p, offset = 0.001) log((p + offset)/(1 + 
        offset - p))
    pred.rf <- logit(predict(nsw.rf, type = "prob")[, 2])
    xlim <- overlapDensity(pred.rf[trt == 0], pred.rf[trt == 
        1], ratio = c(1/20, 50), col = c("black", "gray45"), 
        lty = c(2, 1), lwd = c(1, 2), ratio.number = ratio.number, 
        plotvalues = plotvalues)
    print("rf cutoff at -1.5")
    print(table(nsw$trt, pred.rf > -1.5))
    mtext(side = 3, line = 3, adj = -0.05, "A: Scores from randomForest()")
    pred.lda <- logit(nsw.lda$posterior[, 2])
    xlim <- overlapDensity(pred.lda[trt == 0], pred.lda[trt == 
        1], ratio = c(1/20, 50), ylab = "", col = c("black", 
        "gray45"), lty = c(2, 1), lwd = c(1, 2), ratio.number = ratio.number, 
        plotvalues = plotvalues)
    print("lda cutoff at -4")
    print(table(nsw$trt, pred.lda > -4))
    mtext(side = 3, line = 3, adj = -0.05, "B: Scores from lda()")
    if (device != "") 
        dev.off()
}

g13.4 <-
function (device = "", hat = pred.rf, bw = TRUE, offset = 30) 
{
    if(!require(splines))return("Package 'splines' must be installed.")   
    if (device != "") 
        hardcopy(width = 5.25, height = 5, device = device, trellis = TRUE, 
            pointsize = c(8, 4))
    if(!require(DAAG))return("Package 'DAAG' must be installed.") 
    if(!require(randomForest))return("Package 'randomForest' must be installed.")      
    nsw <- rbind(psid1, subset(nswdemo, trt == 1))
    nsw$fac74 <- factor(nsw$re74 > 0, exclude = NULL)
    table(nsw$fac74)
    levels(nsw$fac74) <- c("0", "gt0", "<NA>")
    set.seed(41)
    nsw.rf <- randomForest(as.factor(trt) ~ ., data = nsw[, -c(7:8, 
        10)], sampsize = c(297, 297))
    logit <- function(p, offset = 0.001) log((p + offset)/(1 + 
        offset - p))
    trt <- nsw$trt
    lprob.rf <- logit(predict(nsw.rf, type = "prob")[, 2])
    nsw.lda <- lda(trt ~ ns(age, 2) + ns(educ, 2) + black + hisp + 
        fac74 + ns(log(re75 + 100), 2), prior = c(0.5, 0.5), 
        data = nsw)
    lprob.lda <- logit(predict(nsw.lda)$posterior[, 2])
    print(table(nsw$trt[lprob.lda > -4]))
    nswa <- nsw[lprob.rf > -1.5, ]
    nswb <- nsw[lprob.lda > -4, ]
    print(table(nswa$trt))
    nswa.rf <- randomForest(as.factor(trt) ~ ., data = nswa[, 
        -c(7:8, 10)])
    lproba.rf <- logit(predict(nswa.rf, type = "prob")[, 2])
    nswb.lda <- lda(trt ~ ns(age, 2) + ns(educ, 2) + black + 
        hisp + fac74 + ns(log(re75 + 100), 3), prior = c(0.5, 
        0.5), data = nswb)
    lprobb.lda <- logit(predict(nswb.lda)$posterior[, 2])
    gph.key <- list(space = "top", columns = 2, points = list(col = c("black", 
        "gray"), pch = c(1, 3), cex = 0.3), lines = list(lty = c("21", 
        "solid"), lwd = c(1.5, 1.5), col = c("black", "gray")), 
        text = list(c("psid1 controls", "experimental treatment")), 
        padding.text = 1)
    gph0 <- xyplot(1 ~ 1, panel = function(x, y, ...) {
    }, xlab = "", ylab = "", scales = list(draw = F), key = gph.key, 
        par.settings = list(axis.line = list(col = "transparent")))
    gph1 <- xyplot(age + educ + log(re75 + offset) ~ lproba.rf, 
        groups = trt, layout = c(3, 1), data = nswa, type = c("p", 
            "smooth"), span = 0.4, aspect = 1, par.settings = simpleTheme(lwd = c(1.5, 
            1.5), col = c("black", "gray"), alpha.points = c(0.5, 
            1), pch = c(1, 3), cex = 0.3, lty = c("21", "solid")), 
        scales = list(y = list(relation = "free"), tck = 0.5), 
        xlab = "", par.strip.text = list(cex = 0.9), main = textGrob("A: Random forest scores (filtered data)", 
            x = unit(0.075, "npc"), y = unit(0.25, "npc"), just = "left", 
            gp = gpar(cex = 1)))
    gph2 <- xyplot(age + educ + log(re75 + offset) ~ lprobb.lda, 
        groups = trt, layout = c(3, 1), data = nswb, type = c("p", 
            "smooth"), span = 0.4, aspect = 1, par.settings = simpleTheme(lwd = c(1.5, 
            1.5), col = c("black", "gray"), pch = c(1, 3), cex = 0.3, 
            alpha.points = c(0.5, 1), lty = c("21", "solid")), 
        scales = list(y = list(relation = "free"), tck = 0.5), 
        xlab = "Score", par.strip.text = list(cex = 0.9), main = textGrob("B: Linear discriminant analysis scores (filtered data)", 
            x = unit(0.075, "npc"), y = unit(0, "npc"), just = "left", 
            gp = gpar(cex = 1)))
    print(gph0, position = c(0, 0.475, 1, 1))
    print(gph1, position = c(0, 0.45, 1, 0.925), newpage = FALSE)
    print(gph2, position = c(0, 0, 1, 0.475), newpage = FALSE)
    if (device != "") 
        dev.off()
    invisible()
}

g13.5 <-
function (device = "", bw = TRUE) 
{
  logit <- function(p, offset=0.001)log((p+offset)/(1+offset-p)) 
    if(!require(splines))return("Package 'splines' must be installed.") 
    if(!require(lattice))return("Package 'lattice' must be installed.")     
    if (device != "") 
        hardcopy(width = 5.5, height = 2.75, device = device, 
            trellis = TRUE, pointsize = c(8, 4))
    if(!require(DAAG))return("Package 'DAAG' must be installed.") 
    if(!require(randomForest))return("Package 'randomForest' must be installed.")     
    nsw <- rbind(psid1, subset(nswdemo, trt == 1))
    nsw$trt <- factor(nsw$trt, labels = c("Control (psid1)", 
        "Treated (experimental)"))
    nsw$fac74 <- factor(nsw$re74 > 0, exclude = NULL)
    table(nsw$fac74)
    levels(nsw$fac74) <- c("0", "gt0", "<NA>")
    set.seed(41)
    nsw.rf <- randomForest(trt ~ ., data = nsw[, -c(7:8, 10)], 
        sampsize = c(297, 297))
    lprob.rf <- logit(predict(nsw.rf, type = "prob")[, 2])
    nswa <- nsw[lprob.rf > -1.5, ]
    nswa.rf <- randomForest(trt ~ ., data = nswa[, -c(7:8, 10)], 
        proximity = TRUE)
    lproba.rf <- predict(nswa.rf, type = "prob")[, 2]
    dmat <- 1 - nswa.rf$proximity
    dmat[dmat == 0] <- 0.001
    pts <- cmdscale(dmat)
    rf.mds <- isoMDS(dmat, pts)
    print(summary(rf.mds))
    isoScores <- rf.mds$points
    cutpts <- c(0, round(quantile(lproba.rf, c(1/3, 2/3)), 2), 
        1)
    cutp <- cut(lproba.rf, breaks = cutpts, include.lowest = TRUE)
    gph1 <- xyplot(isoScores[, 2] ~ isoScores[, 1] | cutp, groups = nswa$trt, 
        aspect = 1, scale = list(tck = 0.6), xlab = "Co-ordinate 1", 
        ylab = "Co-ordinate 2", auto.key = list(columns = 2), 
        par.settings = simpleTheme(col = c("black", "gray"), 
            pch = c(1, 3)))
    print(table(cutp))
    print(update(gph1, layout = c(3, 1)))
    if (device != "") 
        dev.off()
    invisible(list(cmd = pts, iso = rf.mds$points, trt = nswa$trt, 
        pred = lproba.rf))
}

pkgs <- c("DAAG", "lattice", "grid", "MASS", "randomForest", "splines")
z <- sapply(pkgs, require, character.only=TRUE, warn.conflicts=FALSE, quietly=TRUE)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
if(any(!z)){
  notAvail <- paste(names(z)[!z], collapse=", ")
  warning(paste("The following packages should be installed:", notAvail))
} 

g13.1() 

plot of chunk unnamed-chunk-1

g13.2()

plot of chunk unnamed-chunk-1

g13.3()
## [1] "rf cutoff at -1.5"
##                  
##                   FALSE TRUE
##   Control (psid1)  2175  315
##   Treatment           8  289

plot of chunk unnamed-chunk-1

## [1] "lda cutoff at -4"
##                  
##                   FALSE TRUE
##   Control (psid1)  2161  329
##   Treatment          16  281
g13.4()
## 
##   0   1 
## 340 272 
## 
##   0   1 
## 315 289
## Warning: variables are collinear

plot of chunk unnamed-chunk-2

g13.5()
## initial  value 51.466491 
## iter   5 value 30.996301
## iter  10 value 29.282968
## iter  15 value 27.753379
## iter  20 value 27.100097
## final  value 27.035117 
## converged
##        Length Class  Mode   
## points 1208   -none- numeric
## stress    1   -none- numeric
## cutp
##    [0,0.11] (0.11,0.83]    (0.83,1] 
##         202         201         201

plot of chunk unnamed-chunk-3