## Chapter 13: Regression on Principal Component or Discriminant Scores

## The following version of overlapDensity() is needed, which should
## be in DAAG_1.21

overlapDensity <-
function (x0, x1, ratio = c(0.05, 20), ratio.number = FALSE,
plotvalues = c("Density", "Numbers"), gpnames = c("Control", "Treatment"),
cutoffs = c(lower = TRUE, upper = TRUE), bw = FALSE,
xlab = "Score", ylab = NULL,
col = 1:2, lty = 1:2, lwd = c(1, 1))
{
if (is.null(plotvalues))
plotvalues <- ""
if (length(plotvalues) > 1)
plotvalues <- plotvalues[1]
if (plotvalues %in% c("Numbers", "Density"))
plotit <- TRUE else plotit <- FALSE
ran <- range(c(x0, x1))
if (all(cutoffs)) {
d0 <- density(x0, from = ran[1], to = ran[2])
d1 <- density(x1, from = ran[1], to = ran[2])
}
else if (cutoffs[1]) {
d0 <- density(x0, from = ran[1])
d1 <- density(x1, from = ran[1])
}
else if (cutoffs[2]) {
d0 <- density(x0, to = ran[2])
d1 <- density(x1, to = ran[2])
}
else {
d0 <- density(x0)
d1 <- density(x1)
}
n0 <- length(x0)
n1 <- length(x1)
f0 <- d0$y f1 <- d1$y
if (plotvalues %in% "Number") {
pf0 <- d0$y * n0 pf1 <- d1$y * n1
}
else {
pf0 <- d0$y pf1 <- d1$y
}
if (plotit) {
xlim <- range(c(d0$x, d1$x), na.rm = TRUE)
if (plotvalues == "Number" & is.null(ylab))
ylab <- "Density x total frequency"
else if (plotvalues == "Density" & is.null(ylab))
ylab <- "Density"
ylim <- range(c(0, pf0, pf1))
ylim[2] <- ylim[2] + 0.1 * diff(ylim)
plot(d1$x, pf1, xlim = xlim, xlab = xlab, xaxt = "n", bty = "n", yaxs = "i", ylim = ylim, ylab = ylab, main = "", type = "n") axis(1) box(bty = "L") lines(d0$x, pf0, col = col[1], lty = lty[1], lwd = lwd[1])
if (bw & lty[2] > 1)
lines(d1$x, pf1, col = col[2], lty = 1) lines(d1$x, f1, col = col[2], lty = lty[2], lwd = lwd[2])
xpos <- par()$usr[2] ypos <- par()$usr[4]
legend(xpos, ypos, lty = lty, lwd = lwd, col = col, cex = 0.8,
legend = gpnames, bty = "n", xjust = 1)
}
if (is.null(ratio))
return()
d0 <- density(x0, from = xlim[1], to = xlim[2])
d1 <- density(x1, from = xlim[1], to = xlim[2])
x01 <- d0$x f0 <- d0$y
f1 <- d1$y if (ratio.number) { cf0 <- f0 * n0 cf1 <- f1 * n1 } else { cf0 <- f0 cf1 <- f1 } cf0[cf0 < 0] <- 0 cf1[cf1 < 0] <- 0 fmin <- pmin(cf0, cf1) fmax <- max(fmin) subs <- match(fmax, fmin) xmid <- x01[subs] flow <- ratio[1] fhi <- ratio[2] lochoose <- x01 < xmid & (cf0 <= flow * cf1 | cf1 <= cf0 * flow) if (any(lochoose)) xlim[1] <- max(x01[lochoose]) else xlim[1] <- min(x01) hichoose <- x01 > xmid & (cf0 >= fhi * cf1 | cf1 >= cf0 * fhi) if (any(hichoose)) xlim[2] <- min(x01[hichoose]) else xlim[2] <- max(x01) if (plotit) { if(ratio.number)midlab <- "ratio: number" else midlab <- "ratio: p.d." axis(3, at = xlim, labels = paste(signif(xlim, 4)), mgp = c(3, 0.5, 0), col = "gray45", line = 1, cex.axis = 0.8) xlim3 <- c(xlim[1], mean(xlim), xlim[2]) axis(3, at = xlim3, line = 1, labels = c(paste("1:", round(1/ratio[1]), sep = ""), midlab, paste(round(ratio[2]), ":1", sep = "")), mgp = c(-3, -1, 0), col = "gray", cex.axis = 0.7, lty = 0) } xlim } ## Sec 13.1: Principal Component Scores in Regression ## Principal components: data frame socsupport (DAAG)  library(DAAG) ## Loading required package: lattice ## ## Attaching package: 'DAAG' ## ## The following object is masked _by_ '.GlobalEnv': ## ## overlapDensity ss.pr1 <- princomp(as.matrix(na.omit(socsupport[, 9:19])), cor=TRUE) pairs(ss.pr1$scores[, 1:3]) 

sort(-ss.pr1$scores[,1], decr=TRUE)[1:10] # Note the outlier  ## 25 47 48 58 13 34 53 61 23 87 ## 4.062 3.796 3.603 3.536 3.417 3.332 3.259 3.165 3.026 3.018 ## Alternative to pairs(), using the lattice function splom()  splom(~ss.pr1$scores[, 1:3]) 

not.na <- complete.cases(socsupport[, 9:19])
not.na[36] <- FALSE
ss.pr <- princomp(as.matrix(socsupport[not.na, 9:19]), cor=TRUE)

summary(ss.pr)     # Examine the contributions of the components 
## Importance of components:
##                        Comp.1 Comp.2 Comp.3  Comp.4  Comp.5  Comp.6
## Standard deviation     2.3937 1.2190 1.1366 0.84477 0.75447 0.69536
## Proportion of Variance 0.5209 0.1351 0.1174 0.06488 0.05175 0.04396
## Cumulative Proportion  0.5209 0.6560 0.7734 0.83833 0.89007 0.93403
##                         Comp.7  Comp.8  Comp.9  Comp.10  Comp.11
## Standard deviation     0.49726 0.45610 0.35952 0.295553 0.231892
## Proportion of Variance 0.02248 0.01891 0.01175 0.007941 0.004889
## Cumulative Proportion  0.95651 0.97542 0.98717 0.995111 1.000000
ss.lm <- lm(BDI[not.na] ~ ss.pr$scores[, 1:6], data=socsupport) summary(ss.lm)$coef 
##                           Estimate Std. Error t value  Pr(>|t|)
## (Intercept)                10.4607     0.8934 11.7088 3.489e-19
## ss.pr$scores[, 1:6]Comp.1 1.3113 0.3732 3.5134 7.229e-04 ## ss.pr$scores[, 1:6]Comp.2   0.3959     0.7329  0.5402 5.905e-01
## ss.pr$scores[, 1:6]Comp.3 0.6036 0.7860 0.7679 4.447e-01 ## ss.pr$scores[, 1:6]Comp.4   1.4248     1.0576  1.3473 1.816e-01
## ss.pr$scores[, 1:6]Comp.5 2.1459 1.1841 1.8122 7.362e-02 ## ss.pr$scores[, 1:6]Comp.6   1.2882     1.2848  1.0027 3.190e-01
ss.pr$loadings[, 1]  ## emotional emotionalsat tangible tangiblesat affect ## -0.3201 -0.2979 -0.2467 -0.2887 -0.3069 ## affectsat psi psisat esupport psupport ## -0.2883 -0.3628 -0.3322 -0.2886 -0.2846 ## supsources ## -0.2846 ## Footnote Code ## Plot first principal component score against BDI attach(socsupport) plot(BDI[not.na] ~ ss.pr$scores[ ,1], col=as.numeric(gender[not.na]),
pch=as.numeric(gender[not.na]), xlab ="1st principal component",
ylab="BDI")
topleft <- par()\$usr[c(1,4)]
legend(topleft[1], topleft[2], col=1:2, pch=1:2, legend=levels(gender))