overlapDensity <-
function (x0, x1, ratio = c(0.05, 20), ratio.number = FALSE,
plotvalues = c("Density", "Numbers"), gpnames = c("Control", "Treatment"),
cutoffs = c(lower = TRUE, upper = TRUE), bw = FALSE,
xlab = "Score", ylab = NULL,
col = 1:2, lty = 1:2, lwd = c(1, 1))
{
if (is.null(plotvalues))
plotvalues <- ""
if (length(plotvalues) > 1)
plotvalues <- plotvalues[1]
if (plotvalues %in% c("Numbers", "Density"))
plotit <- TRUE else plotit <- FALSE
ran <- range(c(x0, x1))
if (all(cutoffs)) {
d0 <- density(x0, from = ran[1], to = ran[2])
d1 <- density(x1, from = ran[1], to = ran[2])
}
else if (cutoffs[1]) {
d0 <- density(x0, from = ran[1])
d1 <- density(x1, from = ran[1])
}
else if (cutoffs[2]) {
d0 <- density(x0, to = ran[2])
d1 <- density(x1, to = ran[2])
}
else {
d0 <- density(x0)
d1 <- density(x1)
}
n0 <- length(x0)
n1 <- length(x1)
f0 <- d0$y
f1 <- d1$y
if (plotvalues %in% "Number") {
pf0 <- d0$y * n0
pf1 <- d1$y * n1
}
else {
pf0 <- d0$y
pf1 <- d1$y
}
if (plotit) {
xlim <- range(c(d0$x, d1$x), na.rm = TRUE)
if (plotvalues == "Number" & is.null(ylab))
ylab <- "Density x total frequency"
else if (plotvalues == "Density" & is.null(ylab))
ylab <- "Density"
ylim <- range(c(0, pf0, pf1))
ylim[2] <- ylim[2] + 0.1 * diff(ylim)
plot(d1$x, pf1, xlim = xlim, xlab = xlab, xaxt = "n",
bty = "n", yaxs = "i", ylim = ylim, ylab = ylab,
main = "", type = "n")
axis(1)
box(bty = "L")
lines(d0$x, pf0, col = col[1], lty = lty[1], lwd = lwd[1])
if (bw & lty[2] > 1)
lines(d1$x, pf1, col = col[2], lty = 1)
lines(d1$x, f1, col = col[2], lty = lty[2], lwd = lwd[2])
xpos <- par()$usr[2]
ypos <- par()$usr[4]
legend(xpos, ypos, lty = lty, lwd = lwd, col = col, cex = 0.8,
legend = gpnames, bty = "n", xjust = 1)
}
if (is.null(ratio))
return()
d0 <- density(x0, from = xlim[1], to = xlim[2])
d1 <- density(x1, from = xlim[1], to = xlim[2])
x01 <- d0$x
f0 <- d0$y
f1 <- d1$y
if (ratio.number) {
cf0 <- f0 * n0
cf1 <- f1 * n1
}
else {
cf0 <- f0
cf1 <- f1
}
cf0[cf0 < 0] <- 0
cf1[cf1 < 0] <- 0
fmin <- pmin(cf0, cf1)
fmax <- max(fmin)
subs <- match(fmax, fmin)
xmid <- x01[subs]
flow <- ratio[1]
fhi <- ratio[2]
lochoose <- x01 < xmid & (cf0 <= flow * cf1 | cf1 <= cf0 *
flow)
if (any(lochoose))
xlim[1] <- max(x01[lochoose])
else xlim[1] <- min(x01)
hichoose <- x01 > xmid & (cf0 >= fhi * cf1 | cf1 >= cf0 *
fhi)
if (any(hichoose))
xlim[2] <- min(x01[hichoose])
else xlim[2] <- max(x01)
if (plotit) {
if(ratio.number)midlab <- "ratio: number" else midlab <- "ratio: p.d."
axis(3, at = xlim, labels = paste(signif(xlim, 4)), mgp = c(3,
0.5, 0), col = "gray45", line = 1, cex.axis = 0.8)
xlim3 <- c(xlim[1], mean(xlim), xlim[2])
axis(3, at = xlim3, line = 1, labels = c(paste("1:", round(1/ratio[1]),
sep = ""), midlab, paste(round(ratio[2]), ":1", sep = "")),
mgp = c(-3, -1, 0), col = "gray", cex.axis = 0.7,
lty = 0)
}
xlim
}
library(DAAG)
## Loading required package: lattice
##
## Attaching package: 'DAAG'
##
## The following object is masked _by_ '.GlobalEnv':
##
## overlapDensity
ss.pr1 <- princomp(as.matrix(na.omit(socsupport[, 9:19])), cor=TRUE)
pairs(ss.pr1$scores[, 1:3])
sort(-ss.pr1$scores[,1], decr=TRUE)[1:10]
## 25 47 48 58 13 34 53 61 23 87
## 4.062 3.796 3.603 3.536 3.417 3.332 3.259 3.165 3.026 3.018
splom(~ss.pr1$scores[, 1:3])
not.na <- complete.cases(socsupport[, 9:19])
not.na[36] <- FALSE
ss.pr <- princomp(as.matrix(socsupport[not.na, 9:19]), cor=TRUE)
summary(ss.pr)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## Standard deviation 2.3937 1.2190 1.1366 0.84477 0.75447 0.69536
## Proportion of Variance 0.5209 0.1351 0.1174 0.06488 0.05175 0.04396
## Cumulative Proportion 0.5209 0.6560 0.7734 0.83833 0.89007 0.93403
## Comp.7 Comp.8 Comp.9 Comp.10 Comp.11
## Standard deviation 0.49726 0.45610 0.35952 0.295553 0.231892
## Proportion of Variance 0.02248 0.01891 0.01175 0.007941 0.004889
## Cumulative Proportion 0.95651 0.97542 0.98717 0.995111 1.000000
ss.lm <- lm(BDI[not.na] ~ ss.pr$scores[, 1:6], data=socsupport)
summary(ss.lm)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.4607 0.8934 11.7088 3.489e-19
## ss.pr$scores[, 1:6]Comp.1 1.3113 0.3732 3.5134 7.229e-04
## ss.pr$scores[, 1:6]Comp.2 0.3959 0.7329 0.5402 5.905e-01
## ss.pr$scores[, 1:6]Comp.3 0.6036 0.7860 0.7679 4.447e-01
## ss.pr$scores[, 1:6]Comp.4 1.4248 1.0576 1.3473 1.816e-01
## ss.pr$scores[, 1:6]Comp.5 2.1459 1.1841 1.8122 7.362e-02
## ss.pr$scores[, 1:6]Comp.6 1.2882 1.2848 1.0027 3.190e-01
ss.pr$loadings[, 1]
## emotional emotionalsat tangible tangiblesat affect
## -0.3201 -0.2979 -0.2467 -0.2887 -0.3069
## affectsat psi psisat esupport psupport
## -0.2883 -0.3628 -0.3322 -0.2886 -0.2846
## supsources
## -0.2846
attach(socsupport)
plot(BDI[not.na] ~ ss.pr$scores[ ,1], col=as.numeric(gender[not.na]),
pch=as.numeric(gender[not.na]), xlab ="1st principal component",
ylab="BDI")
topleft <- par()$usr[c(1,4)]
legend(topleft[1], topleft[2], col=1:2, pch=1:2, legend=levels(gender))
detach(socsupport)
vnames <- c("trt","educ","age","re75","re78")
nsw <- rbind(psid1, subset(nswdemo, trt==1))
round(sapply(nsw[,c("re75","re78")], function(x)unique(sort(x))[2]))
## re75 re78
## 72 52
nsw[,c("re75","re78")] <- log(nsw[,c("re75","re78")] + 30)
lab <- c(vnames[2:3], paste("log\n", vnames[-(1:3)], "+", 30))
nsw$trt <- factor(nsw$trt, labels=c("Control (psid1)","Treatment"))
splom(~ nsw[,vnames[-1]], type=c("p","smooth"), groups=nsw$trt,
varnames=lab, auto.key=list(columns=2))
nsw$fac74 <- with(nsw, factor(re74>0, exclude=NULL))
table(nsw$fac74)
##
## FALSE TRUE <NA>
## 346 2329 112
levels(nsw$fac74) <- c("0","gt0","<NA>")
nswlm <-
function(control=psid1, df1=2, log78=TRUE, offset=30, printit=TRUE){
nsw0 <- rbind(control, subset(nswdemo, trt==1))
nsw0$fac74 <- factor(nsw0$re74>0, exclude=NULL)
levels(nsw0$fac74) <- c("0","gt0","<NA>")
if(log78) nsw.lm <- lm(log(re78+offset) ~ trt + ns(age,df1) +
ns(educ,df1) + black + hisp + fac74 +
ns(log(re75+offset),df1), data=nsw0) else
nsw.lm <- lm(re78 ~ trt + ns(age,df1) + ns(educ,df1) + black +
hisp + fac74 + ns(log(re75+offset),df1),
data=nsw0)
if(printit) print(summary(nsw.lm))
trtvec <- unlist(summary(nsw.lm)$coef["trt", 1:2])
trtEst <- c(trtvec[1], c(trtvec[1]+trtvec[2]*c(-1.96,1.96)))
if(log78) {
trtEst <- c(trtEst[1], exp(trtEst[1]), exp(trtEst[-1]))
names(trtEst)=c("Est.","exp(Est.)","CIlower","CIupper")
} else
names(trtEst)=c("Est.","CIlower","CIupper")
if(printit) print(trtEst)
invisible(list(obj=nsw.lm, est=trtEst))
}
library(splines)
nsw.lm1 <- nswlm(control=psid1)$nsw.lm
##
## Call:
## lm(formula = log(re78 + offset) ~ trt + ns(age, df1) + ns(educ,
## df1) + black + hisp + fac74 + ns(log(re75 + offset), df1),
## data = nsw0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.620 -0.076 0.288 0.645 5.707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.48161 0.36190 15.15 < 2e-16
## trt 0.98715 0.16378 6.03 1.9e-09
## ns(age, df1)1 -0.64903 0.26352 -2.46 0.014
## ns(age, df1)2 -0.66844 0.12896 -5.18 2.3e-07
## ns(educ, df1)1 0.12596 0.57757 0.22 0.827
## ns(educ, df1)2 0.23131 0.14966 1.55 0.122
## black 0.00495 0.08349 0.06 0.953
## hisp 0.44056 0.17538 2.51 0.012
## fac74gt0 1.19602 0.15453 7.74 1.4e-14
## fac74<NA> -0.74259 0.23994 -3.09 0.002
## ns(log(re75 + offset), df1)1 5.39445 0.26616 20.27 < 2e-16
## ns(log(re75 + offset), df1)2 5.49189 0.29732 18.47 < 2e-16
##
## Residual standard error: 1.72 on 2775 degrees of freedom
## Multiple R-squared: 0.425, Adjusted R-squared: 0.423
## F-statistic: 186 on 11 and 2775 DF, p-value: <2e-16
##
## Est. exp(Est.) CIlower CIupper
## 0.9872 2.6836 1.9467 3.6994
nswlm(control=subset(nswdemo, trt=0))
##
## Call:
## lm(formula = log(re78 + offset) ~ trt + ns(age, df1) + ns(educ,
## df1) + black + hisp + fac74 + ns(log(re75 + offset), df1),
## data = nsw0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.420 -2.839 0.833 1.801 4.042
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.9463 0.6598 13.56 < 2e-16
## trt 0.3455 0.1508 2.29 0.02220
## ns(age, df1)1 -1.0077 0.4683 -2.15 0.03163
## ns(age, df1)2 0.8808 0.7024 1.25 0.21018
## ns(educ, df1)1 -1.5264 1.0817 -1.41 0.15849
## ns(educ, df1)2 1.4609 0.5966 2.45 0.01450
## black -0.9360 0.2532 -3.70 0.00023
## hisp -0.0177 0.3373 -0.05 0.95816
## fac74gt0 -0.4887 0.3106 -1.57 0.11598
## fac74<NA> -0.4863 0.3042 -1.60 0.11026
## ns(log(re75 + offset), df1)1 1.4305 0.5363 2.67 0.00777
## ns(log(re75 + offset), df1)2 1.5888 0.4467 3.56 0.00039
##
## Residual standard error: 2.35 on 1007 degrees of freedom
## Multiple R-squared: 0.0629, Adjusted R-squared: 0.0527
## F-statistic: 6.15 on 11 and 1007 DF, p-value: 7.88e-10
##
## Est. exp(Est.) CIlower CIupper
## 0.3455 1.4126 1.0511 1.8985
nswlm(control=psid1, log78=FALSE)
##
## Call:
## lm(formula = re78 ~ trt + ns(age, df1) + ns(educ, df1) + black +
## hisp + fac74 + ns(log(re75 + offset), df1), data = nsw0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46247 -4931 -896 3910 114471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3370 2211 1.52 0.12758
## trt 2938 1001 2.94 0.00335
## ns(age, df1)1 -489 1610 -0.30 0.76153
## ns(age, df1)2 -1319 788 -1.67 0.09432
## ns(educ, df1)1 3788 3529 1.07 0.28318
## ns(educ, df1)2 7921 914 8.66 < 2e-16
## black -741 510 -1.45 0.14660
## hisp 2283 1072 2.13 0.03321
## fac74gt0 3530 944 3.74 0.00019
## fac74<NA> -253 1466 -0.17 0.86314
## ns(log(re75 + offset), df1)1 23023 1626 14.16 < 2e-16
## ns(log(re75 + offset), df1)2 67235 1816 37.01 < 2e-16
##
## Residual standard error: 10500 on 2775 degrees of freedom
## Multiple R-squared: 0.548, Adjusted R-squared: 0.546
## F-statistic: 305 on 11 and 2775 DF, p-value: <2e-16
##
## Est. CIlower CIupper
## 2937.8 976.6 4899.0
url <- 'http://maths-people.anu.edu.au/~johnm/r-book/xtra-data/cps3.RData'
if(file.exists('cps3.RData'))check <- 0 else
check <- download.file(url, destfile='cps3.RData')
if(!check){
load('cps3.RData')
for (z in list(psid1,psid2,psid3,cps1,cps2,cps3))
print(nswlm(control=z, printit=FALSE)$est)
}
## Est. exp(Est.) CIlower CIupper
## 0.9872 2.6836 1.9467 3.6994
## Est. exp(Est.) CIlower CIupper
## 0.6124 1.8449 1.0056 3.3847
## Est. exp(Est.) CIlower CIupper
## 0.9172 2.5023 1.2135 5.1598
## Est. exp(Est.) CIlower CIupper
## 0.8478 2.3345 1.7463 3.1208
## Est. exp(Est.) CIlower CIupper
## 0.473 1.605 1.078 2.389
## Est. exp(Est.) CIlower CIupper
## 0.4877 1.6285 0.9571 2.7709
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
nsw.rf <- randomForest(trt ~ ., data=nsw[, -c(7:8,10)],
sampsize=c(297,297))
nsw.rf
##
## Call:
## randomForest(formula = trt ~ ., data = nsw[, -c(7:8, 10)], sampsize = c(297, 297))
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.49%
## Confusion matrix:
## Control (psid1) Treatment class.error
## Control (psid1) 2383 107 0.04297
## Treatment 18 279 0.06061
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:DAAG':
##
## hills
library(splines)
nsw.lda <- lda(trt ~ ns(age,2) + ns(educ,2) + black + hisp +
fac74 + ns(log(re75+30),3),
CV=TRUE, prior=c(.5,.5), data=nsw)
tab <- table(nsw.lda$class, nsw$trt)
1 - sum(tab[row(tab)==col(tab)])/sum(tab)
## [1] 0.04306
logit <- function(p, offset=0.001)log((p+offset)/(1+offset-p))
tnum <- unclass(nsw$trt)
sc.rf <- logit(predict(nsw.rf, type="prob")[,2])
oldpar <- par(mar = c(4.1, 3.6, 4.6, 1.1), mgp = c(2.25,
0.5, 0), mfrow = c(1, 2))
overlapDensity(sc.rf[tnum==1], sc.rf[tnum==2], ratio=c(1/20, 50),
ratio.number=TRUE, plotvalues="Density")
## [1] -0.8788 6.9088
nsw.lda <- lda(trt ~ ns(age,2) + ns(educ,2) + black + hisp + fac74 +
ns(log(re75+30),3), prior=c(.5,.5), CV=TRUE, data=nsw)
sc.lda <- logit(nsw.lda$posterior[,2])
overlapDensity(sc.lda[tnum==1], sc.lda[tnum==2], ratio=c(1/20, 50),
ratio.number=TRUE, plotvalues="Density")
## [1] -4.446 3.097
par(oldpar)
nswa <- nsw[sc.rf>-1.5, ]
nswa.rf <- randomForest(trt ~ ., data=nswa[, -c(7:8,10)])
proba.rf <- predict(nswa.rf, type="prob")[,2]
sca.rf <- logit(proba.rf)
xyplot(age + educ + log(re75+30) ~ sca.rf, groups=trt, layout=c(3,1),
data=nswa, type=c("p","smooth"), span=0.4, aspect=1,
par.settings=simpleTheme(lwd=c(2,1.5), col=c("gray", "black"),
pch=c(20,3), cex=0.5, lty=c("solid","21")),
scales=list(y=list(relation="free"), tck=0.5),
auto.key=list(columns=2, points=TRUE, lines=TRUE,
text=c("psid1 controls", "experimental treatment")),
xlab="Scores, derived using randomForest()")
nswa.rf <- randomForest(trt ~ ., data=nswa[, -c(7:8,10)],
proximity=TRUE)
dmat <- 1-nswa.rf$proximity + 0.001
proba.rf <- predict(nswa.rf, type="prob")[,2]
pts <- cmdscale(dmat)
ordScores <- isoMDS(dmat, pts)$points
## initial value 50.632155
## iter 5 value 31.625056
## iter 10 value 30.116153
## iter 15 value 29.178852
## iter 20 value 28.480480
## iter 20 value 28.456780
## iter 20 value 28.441345
## final value 28.441345
## converged
cutpts <- c(0, round(quantile(proba.rf, c(1/3,2/3)), 2), 1)
cutp <- cut(proba.rf, breaks=cutpts, include.lowest=TRUE)
xyplot(ordScores[,2] ~ ordScores[,1]|cutp, groups=nswa$trt,
xlab="Co-ordinate 1", ylab="Co-ordinate 2",
auto.key=list(columns=2), aspect=1, layout=c(3,1),
par.settings=simpleTheme(col=c("black","gray"), pch=c(1,3)))
sca.rf <- logit(proba.rf)
rf.glm <- glm(I(re78>0) ~ ns(sca.rf,2)+trt, data=nswa,
family=binomial)
## Warning: glm.fit: algorithm did not converge
summary(rf.glm)
##
## Call:
## glm(formula = I(re78 > 0) ~ ns(sca.rf, 2) + trt, family = binomial,
## data = nswa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## 2.41e-06 2.41e-06 2.41e-06 2.41e-06 2.41e-06
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.66e+01 9.29e+04 0 1
## ns(sca.rf, 2)1 9.31e-06 2.16e+05 0 1
## ns(sca.rf, 2)2 -2.48e-06 7.80e+04 0 1
## trtTreatment 2.79e-06 4.51e+04 0 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 0.0000e+00 on 595 degrees of freedom
## Residual deviance: 3.4577e-09 on 592 degrees of freedom
## AIC: 8
##
## Number of Fisher Scoring iterations: 25
rf.lm <- lm(log(re78+30) ~ ns(sca.rf,2)+trt, data=nswa,
subset = re78>0)
summary(rf.lm)
##
## Call:
## lm(formula = log(re78 + 30) ~ ns(sca.rf, 2) + trt, data = nswa,
## subset = re78 > 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1167 -0.0983 0.0338 0.0531 0.1118
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.61050 0.01829 197.37 <2e-16
## ns(sca.rf, 2)1 0.00763 0.04248 0.18 0.86
## ns(sca.rf, 2)2 -0.01831 0.01535 -1.19 0.23
## trtTreatment 0.00623 0.00888 0.70 0.48
##
## Residual standard error: 0.0701 on 592 degrees of freedom
## Multiple R-squared: 0.00421, Adjusted R-squared: -0.000835
## F-statistic: 0.834 on 3 and 592 DF, p-value: 0.475
round(summary(lm(log(re78+30) ~ trt, data=nswdemo,
subset=re78>0))$coef, 4)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.5313 0.0603 141.4027 0.0000
## trt 0.0043 0.0912 0.0476 0.9621