## Chapter 12 ## Sec 12.3: *High-dimensional data, classification, and plots ## What groups are of interest? library(hddplot) data(golubInfo) with(golubInfo, table(cancer, tissue.mf)) attach(golubInfo) ## Identify allB samples for that are BM:f or BM:m or PB:m subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m") ## Form vector that identifies these as BM:f or BM:m or PB:m tissue.mfB <- tissue.mf[subsetB, drop=TRUE] ## Separate off the relevant columns of the matrix Golub data(Golub) GolubB <- Golub[, subsetB] detach(golubInfo) ## ss 12.3.1: Classifications and associated graphs ## Preliminary data manipulation ## Try e.g. boxplot(data.frame(GolubB[, 1:20])) # First 20 columns (observations) ## Random selection of 20 rows (features) boxplot(data.frame(GolubB[sample(1:7129, 20), ])) ## ss 12.3.2: Flawed graphs ## Footnote code ## Uses orderFeatures() (hddplot); see below ord15 <- orderFeatures(GolubB, cl=tissue.mfB)[1:15] ## Calculations for the left panel ## Transpose to observations by features dfB15 <- data.frame(t(GolubB[ord15, ])) library(MASS) dfB15.lda <- lda(dfB15, grouping=tissue.mfB) scores <- predict(dfB15.lda, dimen=2)\$x ## Scores for the single PB:f observation attach(golubInfo) df.PBf <- data.frame(t(Golub[ord15, tissue.mf=="PB:f" & cancer=="allB", drop=FALSE])) scores.PBf <- predict(dfB15.lda, newdata=df.PBf, dimen=2)\$x detach(golubInfo) ## Warning! The plot that now follows may be misleading! ## Use scoreplot(), from the hddplot package scoreplot(list(scores=scores, cl=tissue.mfB, other=scores.PBf, cl.other="PB:f")) ## Footnote code simscores <- simulateScores(nrow=7129, cl=rep(1:3, c(19,10,2)), cl.other=4, nfeatures=15, seed=41) # Returns list elements: scores, cl, scores.other & cl.other scoreplot(simscores) ## The following changes parameter settings scoreplot(simscores, params=list(points=list(pch=11:14))) ## Cross-validation to determine the optimum number of features ## Cross-validation to determine the optimum number of features ## Accuracy measure will be: tissue.mfB.cv\$acc.cv tissue.mfB.cv <- cvdisc(GolubB, cl=tissue.mfB, nfeatures=1:23, nfold=c(10,4)) # 10-fold CV (x4) ## Defective measures will be in acc.resub (resubstitution) ## and acc.sel1 (select features prior to cross-validation) tissue.mfB.badcv <- defectiveCVdisc(GolubB, cl=tissue.mfB, foldids=tissue.mfB.cv\$folds, nfeatures=1:23) ## Now plot these various curves. The blue is the true! plot(1:23, tissue.mfB.cv\$acc.cv, ylim=c(0.45,1), col="blue", pch=16) points(1:23, tissue.mfB.badcv\$acc.resub, col=2, pch=11) points(1:23, tissue.mfB.badcv\$acc.sel1, col=3, pch=12) ## NB: Warning messages have been omitted ## ## Calculations and plots for random normal data: set.seed(43) rGolubB <- matrix(rnorm(prod(dim(GolubB))), nrow=dim(GolubB)[1]) rtissue.mfB.cv <- cvdisc(rGolubB, cl=tissue.mfB, nfeatures=1:23, nfold=c(10,4)) rtissue.mfB.badcv <- defectiveCVdisc(rGolubB, cl=tissue.mfB, nfeatures=1:23, foldids=rtissue.mfB.cv\$folds) plot(1:23, rtissue.mfB.cv\$acc.cv, ylim=c(0.25,1), col="blue", pch=16) points(1:23, rtissue.mfB.badcv\$acc.resub, col=2, pch=11) points(1:23, rtissue.mfB.badcv\$acc.sel1, col=3, pch=12) ## Which features? genelist <- matrix(tissue.mfB.cv\$genelist[1:3, ,], nrow=3) tab <- table(genelist, row(genelist)) ord <- order(tab[,1], tab[,2], decreasing=TRUE) tab[ord,] ## Cross-validation: bone marrow (\texttt{BM}) samples only BMonly.cv <- cvdisc(Golub.BM, cl=cancer.BM, nfeatures=1:25, nfold=c(10,4)) ## ss 12.3.4: Graphs derived from the cross-validation process ## Panel A: Uses tissue.mfB.acc from above tissue.mfB.scores <- cvscores(cvlist = tissue.mfB.cv, nfeatures = 3, cl.other = NULL) scoreplot(scorelist = tissue.mfB.scores, cl.circle=NULL, prefix="B-cell subset -") ## Footnote code BMonly.scores <- cvscores(cvlist=BMonly.cv, nfeatures=19, cl.other=NULL) scoreplot(scorelist=BMonly.scores, cl.circle=tissue.mfB, circle=tissue.mfB%in%c("BM:f","BM:m"), params=list(circle=list(col=c("cyan","gray"))), prefix="B: BM samples -")