Chapter 12, Figures 6 to 10: Multivariate Data Exploration and Discrimination
Packages required: “MASS”, “multtest”, “hddplot”

The script that follows is designed to be executed as it stands. It sets up
functions that create Ch 12 graphs, then runs those functions. If executed by
clicking on the RStudio “Compile Notebook …” command, it will be processed
through R Markdown, generating a document that includes code, output and
graphs generated by executing the functions. If some needed packages are
missing, warning messages will appear. See further:
http://rmarkdown.rstudio.com/r_notebook_format.html

g12.6 <-
function(df.all=Golub, info=golubInfo, test="f", use=NULL,
           m=15, device="", seed=57, new=F, offleft = -0.25,
           texfrac = 0.225, colr=c("red","blue","gray40", "magenta")){
    if(device!="") hardcopy(width=4.5, height=2.5, trellis=F,
                            color=T, pointsize=7, device=device)
    if(!require(MASS))return("Package 'MASS' must be installed.")  
    attach(golubInfo)
    ## Identify allB samples for that are BM:f or BM:m or PB:m
    subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m")
    tissue.mfB <- tissue.mf[subsetB, drop=TRUE]
    GolubB <- Golub[, subsetB]
    G.PBf <- Golub[, tissue.mf=="PB:f" 
                                    & cancer=="allB", drop=FALSE]
    detach(golubInfo)
    set.seed(41)
    rGolubB <- matrix(rnorm(prod(dim(GolubB))), nrow=dim(GolubB)[1])
    rownames(rGolubB) <- rownames(Golub)
    rG.PBf <- matrix(rnorm(prod(dim(G.PBf))), nrow=dim(G.PBf)[1])
    oldpar <- par(mar=c(4.1, 3.6, 2.6, 0.1), mgp=c(2.5,0.5,0), mfrow=c(1,2),
                  oma=c(0,0.6,0,1.1), pty="s")    
    on.exit(par(oldpar))
    if(!is.null(seed))set.seed(seed)
    
    plot2 <- function(x = GolubB, cl=cl,
                      x.omit=Golub.PBf,
                      cl.omit="PB:f", ncol = length(cl), 
                      nfeatures=12, device = "", seed = 37,
                      pretext="", colr=1:3,
                      levnames = NULL,
                      ylab="2nd discriminant function"){
      cl <- factor(cl)
      if(!is.null(levnames))levels(cl) <- levnames
      ord15 <- orderFeatures(x, cl=cl)[1:m]
      dfB <- t(x[ord15, ])
      dfB.lda <-  lda(dfB, grouping=cl)
      scores <- predict(dfB.lda, dimen=2)$x
      df.PBf <- data.frame(t(x.omit[ord15, drop=FALSE]))
      colnames(df.PBf) <- colnames(dfB)
      scores.other <- predict(dfB.lda, newdata=df.PBf)$x
      scoreplot(list(scores=scores, cl=cl, other=scores.other,
                     cl.other=cl.omit, nfeatures=nfeatures),
                prefix.title=pretext, adj.title=0,
                params=list(other=list(pch=4, cex=1.5)),
                xlab="1st discriminant function", ylab=ylab)
##      mtext(side=3, line=1, paste(pretext,
##                      nf.use, "features"), adj=0)
      
    }
    plot2(x = GolubB, cl = tissue.mfB, x.omit=G.PBf, cl.omit="PB:f",
          nfeatures=m, device = "", seed = 37,
          ylab="2nd discriminant function",
          colr=colr,
          pretext="A: ALL B-cell:")
    plot2(x = rGolubB, cl = tissue.mfB,
            x.omit=rG.PBf, cl.omit="Gp 4",
          nfeatures=m, device = "", seed = 37,
          colr=colr,
          levnames = c("Gp 1", "Gp 2", "Gp 3"),
          pretext="B: Random data:", ylab="")
    if(device!="")dev.off()
  }

g12.7 <-
function(device=""){
    if(device!="") hardcopy(width=4.15, height=2.25, trellis=F,
                            color=T, pointsize=8, device=device)
    if(!require(MASS))return("Package 'MASS' must be installed.")  
    if(!exists("GoluB.maxT")){
      attach(golubInfo)
      ## Identify allB samples for that are BM:f or BM:m or PB:m
      subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m")
      ## Form vector that identifies these as BM:f or BM:m or PB:m
      tissue.mfB <- tissue.mf[subsetB, drop=TRUE]
      ## Separate off the relevant columns of the matrix Golub
      GolubB <- Golub[, subsetB]
      detach(golubInfo)
      if(!require(multtest))return("Package 'multtest' must be installed.")  
      GolubB.maxT <- mt.maxT(GolubB, unclass(tissue.mfB)-1, test="f",  
                       B=100000) 
    }
    ## Compare calculated F-statistics with permutation distribution
    oldpar <- par(mar=c(4.1,4.1,1.6,0.6), mgp=c(2.5,.75,0),
                  mfrow=c(1,2), pty="s", tcl=-0.35)
    qqthin(qf(1-GolubB.maxT$rawp,2,28), GolubB.maxT$teststat,
           xlab="Quantiles of permutation F-values",
           ylab="Observed F-statistics", adj.xlab=0.55)
    mtext(side=3, line=0.5, "A", adj=0)
    abline(0, 1, lty=2)
    ## Compare calculated F-statistics with theoretical F-distribution
    qqthin(qf(ppoints(7129),2,28), GolubB.maxT$teststat,
           xlab="Quantiles of F - theoretical", adj.xlab=0.55,
           ylab="Observed F-statistics")
    mtext(side=3, line=0.5, "B", adj=0)
    abline(0, 1, lty=2)
    if(device!="")dev.off()
  }

g12.8 <-
function(device="",  x=Golub.BM, nseq=NULL, cl=cancer.BM,
           seed=29, nfeatures=c(14,10), colr = c("red", "blue", "magenta")){
    oldpar <- par(mar=c(3.6, 3.6, 2.6, 0.1), mgp=c(2.25,0.5,0), mfrow=c(1,2),
                  oma=c(0,0.6,0,0.1), pty="s")    
    on.exit(par(oldpar))
    Golub.BM <- with(golubInfo, Golub[, BM.PB=="BM"])
    cancer.BM <- with(golubInfo, cancer[BM.PB=="BM"])
    if(device!="")hardcopy(width = 4.5, height = 2.5, trellis = F,
                           color=T, pointsize=7, device=device)
    gp.id <- divideUp(cl, nset=2, seed=seed)
    plotTrainTest(x=x, nfeatures=nfeatures, cl=cl,
                  traintest=gp.id)
    if(device!="")dev.off()
  }

g12.9 <-
function(device="", cv1=tissue.mfB.cv, cv2=rtissue.mfB.cv,
           badcv1=tissue.mfB.badcv, badcv2=rtissue.mfB.badcv, nseq=NULL){
    if(device!="") hardcopy(width=5.25, height=2.75, trellis=F,
                            color=TRUE, pointsize=8, device=device)
    oldpar <- par(mar=c(4.1, 3.1, 2.6, 0.1), mgp=c(2.5,0.5,0), mfrow=c(1,2),
                  oma=c(0,0.6,0,1.1))
    on.exit(par(oldpar))
        ## Accuracy measures are cv: tissue.mfB.cv$acc.cv
    ## Resubstitution (red points): tissue.mfB.badcv$acc.resub
    ## "Select once" (gray):  tissue.mfB.badcv$acc.sel1
    plot.acc <- function(cv=cv1, badcv=badcv1, nseq=NULL, badnseq=NULL,
                         AB="", ylab="Predictive accuracy",
                         add.legend=TRUE){
      maxg <- min(c(length(badcv$acc.resub), length(cv$acc.cv)))
      if(is.null(nseq))nseq <- 1:maxg
      plot(nseq, badcv$acc.resub[1:maxg], ylim=c(0,1), type="n", yaxs="i",
           xlab="Number of features selected", ylab=ylab)
      par(xpd=T)
      points(nseq, badcv$acc.resub[1:maxg], col=2, type="b", lty=2, pch=0,
             cex=0.8)
      par(xpd=FALSE)
      points(nseq, badcv$acc.sel1[1:maxg], col="gray40", pch=3, cex=0.8)
      lines(lowess(nseq, badcv$acc.sel1[1:maxg], f=.325, iter=0), col="gray40",
            lty=2)
      points(nseq, cv$acc.cv[1:maxg], col="blue", pch=1, cex=0.8)
      lines(lowess(nseq, cv$acc.cv[1:maxg], f=.325, iter=0), col="blue",lwd=2)
      xy <- par()$usr[c(1,3)]
      if(add.legend)
        legend(xy[1], xy[2], xjust=0, yjust=0,
               legend=c("Resubstitution accuracy",
                 "Defective cross-validation",
                 "Cross-validation - select at each fold"),
               lty=c(1,2,1), lwd=c(1,1,2), pch=c(0,3,1),
               col=c("red","gray40","blue"), cex=0.875)
      mtext(side=3,line=0.35, AB, adj=0)
    }
    plot.acc(cv1, badcv1, AB="A: Golub data (as for Figure 12.9)")
    plot.acc(cv2, badcv2, ylab="", AB="B: Random data", add.legend=FALSE)
    if(device!="")dev.off()
  }

g12.10 <-
function(device=""){
    if(device!="") hardcopy(width=5.25, height=2.75, trellis=F,
                            color=TRUE, pointsize=8, device=device)
    oldpar <- par(mar = c(4.1, 3.6, 2.6, 0.1), mgp = c(2.25, 0.5,        
         0), mfrow = c(1, 2), oma = c(0, 0.6, 0, 1.1), pty="s")       
    on.exit(par(oldpar))
    attach(golubInfo)
    scoreplot(scorelist = tissue.mfB.scores, cl.circle=NULL,
           prefix="A: B-cell subset -", xlab="Discriminant function 1",
           ylab="Discriminant function 2",
           adj.title=0)
    scoreplot(scorelist=BMonly.scores, cl.circle=tissue.mfB, 
           circle=tissue.mfB%in%c("BM:f","BM:m"),
           params=list(circle=list(cex=1.3, col=c("pink","cyan")),
             points=list(cex=0.65)), xlab="Discriminant function 1",
              ylab="",
           prefix="B: BM samples -", adj.title=0)
    detach(golubInfo)
    if(device!="")dev.off()
  }

g12.11 <-
function(device=""){
    if(device!="") hardcopy(width=5.25, height=2.75, trellis=F,
                            color=TRUE, pointsize=8, device=device)
    oldpar <- par(mar = c(4.1, 3.6, 2.6, 0.1), mgp = c(2.25, 0.5,        
         0), mfrow = c(1, 2), oma = c(0, 0.6, 0, 1.1), pty="s")       
    on.exit(par(oldpar))
    attach(golubInfo)
    ##  tissue.mfB.cv <- cvdisc(dsetB, cl=tissue.mfB, nf.use=1:27)
    ##  tissue.mfB.scores <-                                      
    ##      cvscores(cvlist = tissue.mfB.cv,                     
    ##               nfeatures = 3, ndisc = NULL, cl.other = NULL)
    scoreplot(scorelist = tissue.mfB.scores, cl.circle=NULL,
           prefix="A: B-cell subset -", xlab="Discriminant function 1",
           ylab="Discriminant function 2",
           adj.title=0)
    ##    BMonly.scores <-
    ##        cvscores(cvlist=BMonly.cv, nfeatures=11, cl.other=NULL)
    scoreplot(scorelist=BMonly.scores, cl.circle=tissue.mfB, 
           circle=tissue.mfB%in%c("BM:f","BM:m"),
           params=list(circle=list(cex=1.3, col=c("pink","cyan")),
             points=list(cex=0.65)), xlab="Discriminant function 1",
              ylab="",
           prefix="B: BM samples -", adj.title=0)
    detach(golubInfo)
    if(device!="")dev.off()
  }

pkgs <- c("MASS", "multtest", "hddplot")
z <- sapply(pkgs, require, character.only=TRUE, warn.conflicts=FALSE, quietly=TRUE)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist
## 
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
if(any(!z)){
  notAvail <- paste(names(z)[!z], collapse=", ")
  warning(paste("The following packages should be installed:", notAvail))
} 

g12.6()

plot of chunk unnamed-chunk-1

g12.7()
## b=1000   b=2000  b=3000  b=4000  b=5000  b=6000  b=7000  b=8000  b=9000  b=10000 
## b=11000  b=12000 b=13000 b=14000 b=15000 b=16000 b=17000 b=18000 b=19000 b=20000 
## b=21000  b=22000 b=23000 b=24000 b=25000 b=26000 b=27000 b=28000 b=29000 b=30000 
## b=31000  b=32000 b=33000 b=34000 b=35000 b=36000 b=37000 b=38000 b=39000 b=40000 
## b=41000  b=42000 b=43000 b=44000 b=45000 b=46000 b=47000 b=48000 b=49000 b=50000 
## b=51000  b=52000 b=53000 b=54000 b=55000 b=56000 b=57000 b=58000 b=59000 b=60000 
## b=61000  b=62000 b=63000 b=64000 b=65000 b=66000 b=67000 b=68000 b=69000 b=70000 
## b=71000  b=72000 b=73000 b=74000 b=75000 b=76000 b=77000 b=78000 b=79000 b=80000 
## b=81000  b=82000 b=83000 b=84000 b=85000 b=86000 b=87000 b=88000 b=89000 b=90000 
## b=91000  b=92000 b=93000 b=94000 b=95000 b=96000 b=97000 b=98000 b=99000 b=100000    
## [1] "Graph retains 302 points."

plot of chunk unnamed-chunk-1

## [1] "Graph retains 274 points."
g12.8()

plot of chunk unnamed-chunk-1

    attach(golubInfo)
    subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m")
    tissue.mfB <- factor(tissue.mf[subsetB])
    dsetB <- Golub[,subsetB]
    options(warning=FALSE)
    tissue.mfB.badcv <- defectiveCVdisc(dsetB, cl=tissue.mfB,
                                   nfeatures=1:23)
    tissue.mfB.cv <- cvdisc(dsetB, cl=tissue.mfB, nfeatures=1:23)
## 
##  Preliminary per fold calculations 
## 1:2:3:4:5:6:7:8:9:10:
##  Show each choice of number of features: 
## 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:
##                                                              
## Accuracy           Best accuracy, less 1SD   Best accuracy   
## (Cross-validation) 0.85 (3 features)         0.9 (4 features)
    detach(golubInfo)
    rdsetB <- matrix(rnorm(prod(dim(dsetB))), nrow=dim(dsetB)[1])
    rtissue.mfB.cv <- cvdisc(rdsetB, cl=tissue.mfB, nfeatures=1:23)
## [1] "Input rows (features) are not named. Names"
## [1] "1:7129 will be assigned."
## 
##  Preliminary per fold calculations 
## 1:2:3:4:5:6:7:8:9:10:
##  Show each choice of number of features: 
## 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:
##                                                               
## Accuracy           Best accuracy, less 1SD   Best accuracy    
## (Cross-validation) 0.66 (2 features)         0.74 (4 features)
    rtissue.mfB.badcv <- defectiveCVdisc(rdsetB, cl=tissue.mfB,
                                   nfeatures=1:23)
## [1] "Input rows (features) are not named. Names"
## [1] "1:7129 will be assigned."
    options(warning=TRUE)

g12.9()

plot of chunk unnamed-chunk-1

     tissue.mfB.scores <-                                      
         cvscores(cvlist = tissue.mfB.cv,                     
                  nfeatures = 3, ndisc = NULL, cl.other = NULL) 
## 1:2:3:4:5:6:7:8:9:10
## 1:2:3:4:5:6:7:8:9:10
     Golub.BM <- with(golubInfo, Golub[, BM.PB=="BM"]) 
     cancer.BM <- with(golubInfo, cancer[BM.PB=="BM"])
     BMonly.cv <- cvdisc(Golub.BM, cl=cancer.BM, nfeatures=1:25, 
                         nfold=c(10,4)) 
## 
##  Preliminary per fold calculations 
## 1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:
##  Show each choice of number of features: 
## 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:24:25:
##                                                                
## Accuracy           Best accuracy, less 1SD   Best accuracy     
## (Cross-validation) 0.9 (19 features)         0.94 (23 features)
     BMonly.scores <-
           cvscores(cvlist=BMonly.cv, nfeatures=11, cl.other=NULL)
## 1:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:10
## 1:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:10
g12.10()