Chapter 12, Figures 6 to 10: Multivariate Data Exploration and Discrimination
Packages required: “MASS”, “multtest”, “hddplot”
The script that follows is designed to be executed as it stands. It sets up
functions that create Ch 12 graphs, then runs those functions. If executed by
clicking on the RStudio “Compile Notebook …” command, it will be processed
through R Markdown, generating a document that includes code, output and
graphs generated by executing the functions. If some needed packages are
missing, warning messages will appear. See further:
http://rmarkdown.rstudio.com/r_notebook_format.html
g12.6 <-
function(df.all=Golub, info=golubInfo, test="f", use=NULL,
m=15, device="", seed=57, new=F, offleft = -0.25,
texfrac = 0.225, colr=c("red","blue","gray40", "magenta")){
if(device!="") hardcopy(width=4.5, height=2.5, trellis=F,
color=T, pointsize=7, device=device)
if(!require(MASS))return("Package 'MASS' must be installed.")
attach(golubInfo)
## Identify allB samples for that are BM:f or BM:m or PB:m
subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m")
tissue.mfB <- tissue.mf[subsetB, drop=TRUE]
GolubB <- Golub[, subsetB]
G.PBf <- Golub[, tissue.mf=="PB:f"
& cancer=="allB", drop=FALSE]
detach(golubInfo)
set.seed(41)
rGolubB <- matrix(rnorm(prod(dim(GolubB))), nrow=dim(GolubB)[1])
rownames(rGolubB) <- rownames(Golub)
rG.PBf <- matrix(rnorm(prod(dim(G.PBf))), nrow=dim(G.PBf)[1])
oldpar <- par(mar=c(4.1, 3.6, 2.6, 0.1), mgp=c(2.5,0.5,0), mfrow=c(1,2),
oma=c(0,0.6,0,1.1), pty="s")
on.exit(par(oldpar))
if(!is.null(seed))set.seed(seed)
plot2 <- function(x = GolubB, cl=cl,
x.omit=Golub.PBf,
cl.omit="PB:f", ncol = length(cl),
nfeatures=12, device = "", seed = 37,
pretext="", colr=1:3,
levnames = NULL,
ylab="2nd discriminant function"){
cl <- factor(cl)
if(!is.null(levnames))levels(cl) <- levnames
ord15 <- orderFeatures(x, cl=cl)[1:m]
dfB <- t(x[ord15, ])
dfB.lda <- lda(dfB, grouping=cl)
scores <- predict(dfB.lda, dimen=2)$x
df.PBf <- data.frame(t(x.omit[ord15, drop=FALSE]))
colnames(df.PBf) <- colnames(dfB)
scores.other <- predict(dfB.lda, newdata=df.PBf)$x
scoreplot(list(scores=scores, cl=cl, other=scores.other,
cl.other=cl.omit, nfeatures=nfeatures),
prefix.title=pretext, adj.title=0,
params=list(other=list(pch=4, cex=1.5)),
xlab="1st discriminant function", ylab=ylab)
## mtext(side=3, line=1, paste(pretext,
## nf.use, "features"), adj=0)
}
plot2(x = GolubB, cl = tissue.mfB, x.omit=G.PBf, cl.omit="PB:f",
nfeatures=m, device = "", seed = 37,
ylab="2nd discriminant function",
colr=colr,
pretext="A: ALL B-cell:")
plot2(x = rGolubB, cl = tissue.mfB,
x.omit=rG.PBf, cl.omit="Gp 4",
nfeatures=m, device = "", seed = 37,
colr=colr,
levnames = c("Gp 1", "Gp 2", "Gp 3"),
pretext="B: Random data:", ylab="")
if(device!="")dev.off()
}
g12.7 <-
function(device=""){
if(device!="") hardcopy(width=4.15, height=2.25, trellis=F,
color=T, pointsize=8, device=device)
if(!require(MASS))return("Package 'MASS' must be installed.")
if(!exists("GoluB.maxT")){
attach(golubInfo)
## Identify allB samples for that are BM:f or BM:m or PB:m
subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m")
## Form vector that identifies these as BM:f or BM:m or PB:m
tissue.mfB <- tissue.mf[subsetB, drop=TRUE]
## Separate off the relevant columns of the matrix Golub
GolubB <- Golub[, subsetB]
detach(golubInfo)
if(!require(multtest))return("Package 'multtest' must be installed.")
GolubB.maxT <- mt.maxT(GolubB, unclass(tissue.mfB)-1, test="f",
B=100000)
}
## Compare calculated F-statistics with permutation distribution
oldpar <- par(mar=c(4.1,4.1,1.6,0.6), mgp=c(2.5,.75,0),
mfrow=c(1,2), pty="s", tcl=-0.35)
qqthin(qf(1-GolubB.maxT$rawp,2,28), GolubB.maxT$teststat,
xlab="Quantiles of permutation F-values",
ylab="Observed F-statistics", adj.xlab=0.55)
mtext(side=3, line=0.5, "A", adj=0)
abline(0, 1, lty=2)
## Compare calculated F-statistics with theoretical F-distribution
qqthin(qf(ppoints(7129),2,28), GolubB.maxT$teststat,
xlab="Quantiles of F - theoretical", adj.xlab=0.55,
ylab="Observed F-statistics")
mtext(side=3, line=0.5, "B", adj=0)
abline(0, 1, lty=2)
if(device!="")dev.off()
}
g12.8 <-
function(device="", x=Golub.BM, nseq=NULL, cl=cancer.BM,
seed=29, nfeatures=c(14,10), colr = c("red", "blue", "magenta")){
oldpar <- par(mar=c(3.6, 3.6, 2.6, 0.1), mgp=c(2.25,0.5,0), mfrow=c(1,2),
oma=c(0,0.6,0,0.1), pty="s")
on.exit(par(oldpar))
Golub.BM <- with(golubInfo, Golub[, BM.PB=="BM"])
cancer.BM <- with(golubInfo, cancer[BM.PB=="BM"])
if(device!="")hardcopy(width = 4.5, height = 2.5, trellis = F,
color=T, pointsize=7, device=device)
gp.id <- divideUp(cl, nset=2, seed=seed)
plotTrainTest(x=x, nfeatures=nfeatures, cl=cl,
traintest=gp.id)
if(device!="")dev.off()
}
g12.9 <-
function(device="", cv1=tissue.mfB.cv, cv2=rtissue.mfB.cv,
badcv1=tissue.mfB.badcv, badcv2=rtissue.mfB.badcv, nseq=NULL){
if(device!="") hardcopy(width=5.25, height=2.75, trellis=F,
color=TRUE, pointsize=8, device=device)
oldpar <- par(mar=c(4.1, 3.1, 2.6, 0.1), mgp=c(2.5,0.5,0), mfrow=c(1,2),
oma=c(0,0.6,0,1.1))
on.exit(par(oldpar))
## Accuracy measures are cv: tissue.mfB.cv$acc.cv
## Resubstitution (red points): tissue.mfB.badcv$acc.resub
## "Select once" (gray): tissue.mfB.badcv$acc.sel1
plot.acc <- function(cv=cv1, badcv=badcv1, nseq=NULL, badnseq=NULL,
AB="", ylab="Predictive accuracy",
add.legend=TRUE){
maxg <- min(c(length(badcv$acc.resub), length(cv$acc.cv)))
if(is.null(nseq))nseq <- 1:maxg
plot(nseq, badcv$acc.resub[1:maxg], ylim=c(0,1), type="n", yaxs="i",
xlab="Number of features selected", ylab=ylab)
par(xpd=T)
points(nseq, badcv$acc.resub[1:maxg], col=2, type="b", lty=2, pch=0,
cex=0.8)
par(xpd=FALSE)
points(nseq, badcv$acc.sel1[1:maxg], col="gray40", pch=3, cex=0.8)
lines(lowess(nseq, badcv$acc.sel1[1:maxg], f=.325, iter=0), col="gray40",
lty=2)
points(nseq, cv$acc.cv[1:maxg], col="blue", pch=1, cex=0.8)
lines(lowess(nseq, cv$acc.cv[1:maxg], f=.325, iter=0), col="blue",lwd=2)
xy <- par()$usr[c(1,3)]
if(add.legend)
legend(xy[1], xy[2], xjust=0, yjust=0,
legend=c("Resubstitution accuracy",
"Defective cross-validation",
"Cross-validation - select at each fold"),
lty=c(1,2,1), lwd=c(1,1,2), pch=c(0,3,1),
col=c("red","gray40","blue"), cex=0.875)
mtext(side=3,line=0.35, AB, adj=0)
}
plot.acc(cv1, badcv1, AB="A: Golub data (as for Figure 12.9)")
plot.acc(cv2, badcv2, ylab="", AB="B: Random data", add.legend=FALSE)
if(device!="")dev.off()
}
g12.10 <-
function(device=""){
if(device!="") hardcopy(width=5.25, height=2.75, trellis=F,
color=TRUE, pointsize=8, device=device)
oldpar <- par(mar = c(4.1, 3.6, 2.6, 0.1), mgp = c(2.25, 0.5,
0), mfrow = c(1, 2), oma = c(0, 0.6, 0, 1.1), pty="s")
on.exit(par(oldpar))
attach(golubInfo)
scoreplot(scorelist = tissue.mfB.scores, cl.circle=NULL,
prefix="A: B-cell subset -", xlab="Discriminant function 1",
ylab="Discriminant function 2",
adj.title=0)
scoreplot(scorelist=BMonly.scores, cl.circle=tissue.mfB,
circle=tissue.mfB%in%c("BM:f","BM:m"),
params=list(circle=list(cex=1.3, col=c("pink","cyan")),
points=list(cex=0.65)), xlab="Discriminant function 1",
ylab="",
prefix="B: BM samples -", adj.title=0)
detach(golubInfo)
if(device!="")dev.off()
}
g12.11 <-
function(device=""){
if(device!="") hardcopy(width=5.25, height=2.75, trellis=F,
color=TRUE, pointsize=8, device=device)
oldpar <- par(mar = c(4.1, 3.6, 2.6, 0.1), mgp = c(2.25, 0.5,
0), mfrow = c(1, 2), oma = c(0, 0.6, 0, 1.1), pty="s")
on.exit(par(oldpar))
attach(golubInfo)
## tissue.mfB.cv <- cvdisc(dsetB, cl=tissue.mfB, nf.use=1:27)
## tissue.mfB.scores <-
## cvscores(cvlist = tissue.mfB.cv,
## nfeatures = 3, ndisc = NULL, cl.other = NULL)
scoreplot(scorelist = tissue.mfB.scores, cl.circle=NULL,
prefix="A: B-cell subset -", xlab="Discriminant function 1",
ylab="Discriminant function 2",
adj.title=0)
## BMonly.scores <-
## cvscores(cvlist=BMonly.cv, nfeatures=11, cl.other=NULL)
scoreplot(scorelist=BMonly.scores, cl.circle=tissue.mfB,
circle=tissue.mfB%in%c("BM:f","BM:m"),
params=list(circle=list(cex=1.3, col=c("pink","cyan")),
points=list(cex=0.65)), xlab="Discriminant function 1",
ylab="",
prefix="B: BM samples -", adj.title=0)
detach(golubInfo)
if(device!="")dev.off()
}
pkgs <- c("MASS", "multtest", "hddplot")
z <- sapply(pkgs, require, character.only=TRUE, warn.conflicts=FALSE, quietly=TRUE)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
if(any(!z)){
notAvail <- paste(names(z)[!z], collapse=", ")
warning(paste("The following packages should be installed:", notAvail))
}
g12.6()
g12.7()
## b=1000 b=2000 b=3000 b=4000 b=5000 b=6000 b=7000 b=8000 b=9000 b=10000
## b=11000 b=12000 b=13000 b=14000 b=15000 b=16000 b=17000 b=18000 b=19000 b=20000
## b=21000 b=22000 b=23000 b=24000 b=25000 b=26000 b=27000 b=28000 b=29000 b=30000
## b=31000 b=32000 b=33000 b=34000 b=35000 b=36000 b=37000 b=38000 b=39000 b=40000
## b=41000 b=42000 b=43000 b=44000 b=45000 b=46000 b=47000 b=48000 b=49000 b=50000
## b=51000 b=52000 b=53000 b=54000 b=55000 b=56000 b=57000 b=58000 b=59000 b=60000
## b=61000 b=62000 b=63000 b=64000 b=65000 b=66000 b=67000 b=68000 b=69000 b=70000
## b=71000 b=72000 b=73000 b=74000 b=75000 b=76000 b=77000 b=78000 b=79000 b=80000
## b=81000 b=82000 b=83000 b=84000 b=85000 b=86000 b=87000 b=88000 b=89000 b=90000
## b=91000 b=92000 b=93000 b=94000 b=95000 b=96000 b=97000 b=98000 b=99000 b=100000
## [1] "Graph retains 302 points."
## [1] "Graph retains 274 points."
g12.8()
attach(golubInfo)
subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m")
tissue.mfB <- factor(tissue.mf[subsetB])
dsetB <- Golub[,subsetB]
options(warning=FALSE)
tissue.mfB.badcv <- defectiveCVdisc(dsetB, cl=tissue.mfB,
nfeatures=1:23)
tissue.mfB.cv <- cvdisc(dsetB, cl=tissue.mfB, nfeatures=1:23)
##
## Preliminary per fold calculations
## 1:2:3:4:5:6:7:8:9:10:
## Show each choice of number of features:
## 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:
##
## Accuracy Best accuracy, less 1SD Best accuracy
## (Cross-validation) 0.85 (3 features) 0.9 (4 features)
detach(golubInfo)
rdsetB <- matrix(rnorm(prod(dim(dsetB))), nrow=dim(dsetB)[1])
rtissue.mfB.cv <- cvdisc(rdsetB, cl=tissue.mfB, nfeatures=1:23)
## [1] "Input rows (features) are not named. Names"
## [1] "1:7129 will be assigned."
##
## Preliminary per fold calculations
## 1:2:3:4:5:6:7:8:9:10:
## Show each choice of number of features:
## 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:
##
## Accuracy Best accuracy, less 1SD Best accuracy
## (Cross-validation) 0.66 (2 features) 0.74 (4 features)
rtissue.mfB.badcv <- defectiveCVdisc(rdsetB, cl=tissue.mfB,
nfeatures=1:23)
## [1] "Input rows (features) are not named. Names"
## [1] "1:7129 will be assigned."
options(warning=TRUE)
g12.9()
tissue.mfB.scores <-
cvscores(cvlist = tissue.mfB.cv,
nfeatures = 3, ndisc = NULL, cl.other = NULL)
## 1:2:3:4:5:6:7:8:9:10
## 1:2:3:4:5:6:7:8:9:10
Golub.BM <- with(golubInfo, Golub[, BM.PB=="BM"])
cancer.BM <- with(golubInfo, cancer[BM.PB=="BM"])
BMonly.cv <- cvdisc(Golub.BM, cl=cancer.BM, nfeatures=1:25,
nfold=c(10,4))
##
## Preliminary per fold calculations
## 1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:
## Show each choice of number of features:
## 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:24:25:
##
## Accuracy Best accuracy, less 1SD Best accuracy
## (Cross-validation) 0.9 (19 features) 0.94 (23 features)
BMonly.scores <-
cvscores(cvlist=BMonly.cv, nfeatures=11, cl.other=NULL)
## 1:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:10
## 1:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:10
g12.10()
## g12.11()
gdump <-
function(fnam=NULL, prefix="~/dbeta/figures/figs",
xtras=c("hardcopy","renum.fun","renum.files"), splitchar="/ch"){
if(is.null(fnam)){
path <- getwd()
pathtag <- strsplit(path, "/ch", fixed=TRUE)[[1]]
fnam <- paste(prefix, pathtag[length(pathtag)],
".R", sep="")
}
else fnam <- paste(prefix, fnam, sep="/")
objnames <- c(objects(pattern="^g", envir=sys.frame(0)), xtras)
cat("\nDump to file:", fnam, "\n")
print(objnames)
dump(objnames, fnam)
}
renum.fun <-
function(from.prefix="g", to.prefix="g",from=7:11, to=6:10, doit=F){
path <- getwd()
pathtag <- strsplit(path, "/ch", fixed=TRUE)[[1]]
endbit <- pathtag[length(pathtag)]
to.prefix <- paste(to.prefix, endbit, sep="")
if(is.null(from.prefix))from.prefix <- to.prefix
for(i in 1:length(to))
{txt<-paste(to.prefix,".",to[i]," <- ", from.prefix,".",from[i],sep="")
if(doit)eval(parse(text=txt),envir=sys.frame(0))
print(txt)
if(from.prefix!=to.prefix){
rm.txt <- paste("rm(",from.prefix,".",from[i],")",sep="")
if(doit)eval(parse(text=rm.txt),envir=sys.frame(0))
print(rm.txt)
}
}
}
renum.files <-
function(from.prefix="~/dbeta/Art/", to.prefix="~/dbeta/Art/",
from=7:11, to=6:10, doit=F){
path <- getwd()
pathtag <- strsplit(path, "/ch", fixed=TRUE)[[1]]
endbit <- pathtag[length(pathtag)]
if(nchar(endbit)==2)chap <- paste(endbit) else
chap <- paste("0",endbit,sep="")
from.prefix <- paste(from.prefix, chap, "-", sep="")
to.prefix <- paste(to.prefix, chap, "-", sep="")
for(i in 1:length(from)){
if (from[i]<=9) ltext <- paste("0",from[i],sep="")
else ltext <- paste(from[i])
if (to[i]<=9) rtext <- paste("0",to[i],sep="")
else rtext <- paste(to[i])
txt<-paste("mv ", from.prefix, ltext, ".pdf", " ",
to.prefix, rtext, ".pdf", sep="")
backup<-paste("cp ", from.prefix, ltext, ".pdf", " ",
"archive", sep="")
if(doit)system(backup)
if(doit)system(txt)
print(backup)
print(txt)
}
}