## Chapter 12: Multivariate Data Exploration and Discrimination
## Sec 12.1: Multivariate Exploratory Data Analysis
## ss 12.1.1: Scatterplot matrices
## Scatterplot matrix, columns 9-11 of possum (DAAG).
## Colors distinguish sexes; symbols distinguish sites
library(DAAG)
## Loading required package: lattice
library(lattice)
colr <- c("red", "blue")
pchr <- c(3,4,0,8,2,10,1)
ss <- expand.grid(site=1:7, sex=1:2) # Site varies fastest
parset <- with(ss, simpleTheme(pch=pchr[site], col=colr[sex]))
sitenames <- c("Cambarville","Bellbird","Whian Whian", "Byrangery",
"Conondale ","Allyn River", "Bulburin")
## Add column sexsite to possum; will be used again below
possum$sexsite <- paste(possum$sex, possum$site, sep="-")
splom(possum[, c(9:11)], groups = possum$sexsite,
col = colr[ss$sex], par.settings=parset,
varnames=c("tail\nlength","foot\nlength","ear conch\nlength"),
key = list(text=list(sitenames), points=list(pch=pchr), columns=3))

## Cloud plot of earconch, taill and footlgth
cloud(earconch~taill+footlgth, data=possum, pch=pchr, groups=site,
par.settings=simpleTheme(pch=c(3,4,0,8,2,10,1)),
auto.key = list(space="top", corner=c(0,1), columns=3, between=1,
text=sitenames, between.columns=2))

# auto.key takes its symbols (pch) from par.settings
## ss 12.1.2: Principal components analysis
## Preliminary data scrutiny
## Principal components calculations: possum[, 6:14] (DAAG)
possum.prc <- princomp(na.omit(possum[, 6:14]))
## Footnote Code
## Plot of principal components: possum[, 6:14]
here<- complete.cases(possum[, 6:14])
colr <- c("red", "blue")
pchr <- c(3,4,0,8,2,10,1)
ss <- expand.grid(site=1:7, sex=1:2) # Site varies fastest
xyplot(possum.prc$scores[, 2] ~ possum.prc$scores[, 1], aspect="iso",
groups = possum$sexsite[here], col = colr[ss$sex], pch = pchr[ss$site],
xlab="1st Principal Component", ylab="2nd Principal Component",
key=list(points = list(pch=pchr),
text=list(c("Cambarville", "Bellbird", "Whian Whian",
"Byrangery", "Conondale", "Allyn River",
"Bulburin" )), columns=4))

summary(possum.prc, loadings=TRUE, digits=2)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## Standard deviation 6.7999 5.0331 2.69929 2.16009 1.73722 1.59890
## Proportion of Variance 0.4981 0.2729 0.07849 0.05027 0.03251 0.02754
## Cumulative Proportion 0.4981 0.7710 0.84951 0.89978 0.93229 0.95983
## Comp.7 Comp.8 Comp.9
## Standard deviation 1.28597 1.1111 0.916957
## Proportion of Variance 0.01782 0.0133 0.009058
## Cumulative Proportion 0.97764 0.9909 1.000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## hdlngth 0.413 0.282 0.339 -0.185 0.695 0.277 -0.184
## skullw 0.296 0.269 0.540 -0.338 -0.519 -0.276 0.259 0.112
## totlngth 0.518 0.315 -0.648 -0.156 -0.226 -0.145 0.336
## taill 0.251 -0.350 -0.194 0.437 -0.753 0.106
## footlgth 0.514 -0.468 -0.336 0.633
## earconch 0.309 -0.650 0.249 -0.584 0.208 -0.172
## eye 0.195 0.242 0.942
## chest 0.219 0.175 0.174 -0.177 -0.189 -0.763 -0.404 0.267
## belly 0.246 0.178 0.134 0.891 -0.102 0.239 0.144
## The stability of the principal components plot
## Bootstrap principal components calculations: possum (DAAG)
library(ggplot2)
## Sample from all rows where there are no missing values
rowsfrom <- (1:dim(possum)[1])[complete.cases(possum[, 6:14])]
n <- length(rowsfrom); ntimes <- 3
bootscores <- data.frame(matrix(0, nrow=ntimes*n, ncol=2))
names(bootscores) <- c("scores1","scores2")
allsamp <- numeric(ntimes*n)
for (i in 1:ntimes){
samprows <- sample(rowsfrom, n, replace=TRUE)
bootscores[n*(i-1)+(1:n), 1:2] <-
princomp(possum[samprows, 6:14])$scores[, 1:2]
allsamp[n*(i-1)+(1:n)] <- samprows
}
bootscores[, c("sex","Pop")] <- possum[allsamp, c("sex","Pop")]
bootscores$sampleID <- factor(rep(1:ntimes, rep(n,ntimes)))
quickplot(x=scores1, y=scores2, colour=sex,
shape=Pop, facets=.~sampleID, data=bootscores) +
scale_colour_manual(values=c("m"="black","f"="gray45")) +
xlab("First Principal Component") +
ylab("Second Principal Component")

## NB, in the call to scale_color_manual(): 'values', not 'value'
## ss 12.1.3: Multi-dimensional scaling
## Distance measures
## Ordination
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:DAAG':
##
## hills
d.possum <- dist(possum[,6:14]) # Euclidean distance matrix
possum.sammon <- sammon(d.possum, k=2)
## Initial stress : 0.06067
## stress after 1 iters: 0.06038
plot(possum.sammon$points) # Plot 2nd vs 1st ordinates

possum.isoMDS <- isoMDS(d.possum, k=2)
## initial value 13.911522
## iter 5 value 10.680065
## iter 10 value 10.329701
## iter 10 value 10.322480
## iter 10 value 10.318589
## final value 10.318589
## converged
plot(possum.isoMDS$points)

## Binary data
## Sec 12.2: Discriminant Analysis
## ss 12.2.1: Example -- plant architecture
## Notation
## ss 12.2.2: Logistic discriminant analysis
leaf17.glm <- glm(arch ~ logwid + loglen, family=binomial,
data=leafshape17)
options(digits=3)
summary(leaf17.glm)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -15.287 4.13 -3.703 0.000213
## logwid 0.185 1.57 0.118 0.906224
## loglen 5.269 1.96 2.684 0.007269
## Predictive accuracy
leaf17.cv <- CVbinary(leaf17.glm)
##
## Fold: 10 6 9 2 3 4 1 8 7 5
## Internal estimate of accuracy = 0.82
## Cross-validation estimate of accuracy = 0.754
tCV <- table(leafshape17$arch, round(leaf17.cv$cv)) # CV
cbind(tCV, c(tCV[1,1], class.acc=tCV[2,2])/(tCV[,1]+tCV[,2]))
## 0 1
## 0 35 6 0.854
## 1 9 11 0.550
sum(tCV[row(tCV)==col(tCV)])/sum(tCV) # Overall accuracy
## [1] 0.754
tR <- table(leafshape17$arch, round(leaf17.cv$internal))
cbind(tR, c(tR[1,1], class.acc=tR[2,2])/(tR[,1]+tR[,2]))
## 0 1
## 0 37 4 0.902
## 1 7 13 0.650
sum(tR[row(tR)==col(tR)])/sum(tR) # Overall accuracy
## [1] 0.82
## ss 12.2.3: Linear discriminant analysis
library(MASS)
## Discriminant analysis; data frame leafshape17 (DAAG)
leaf17.lda <- lda(arch ~ logwid+loglen, data=leafshape17)
leaf17.lda
## Call:
## lda(arch ~ logwid + loglen, data = leafshape17)
##
## Prior probabilities of groups:
## 0 1
## 0.672 0.328
##
## Group means:
## logwid loglen
## 0 1.43 2.46
## 1 1.87 2.99
##
## Coefficients of linear discriminants:
## LD1
## logwid 0.156
## loglen 3.066
## Assessments of predictive accuracy
leaf17cv.lda <- lda(arch ~ logwid+loglen, data=leafshape17,
CV=TRUE)
## the list element 'class' gives the predicted class
## The list element 'posterior' holds posterior probabilities
tab <- table(leafshape17$arch, leaf17cv.lda$class)
cbind(tab, c(tab[1,1], class.acc=tab[2,2])/(tab[,1]+tab[,2]))
## 0 1
## 0 37 4 0.902
## 1 8 12 0.600
sum(tab[row(tab)==col(tab)])/sum(tab)
## [1] 0.803
## ss 12.2.4: An example with more than two groups
## Footnote Code
## Linear discriminant calculations for possum data
possum.lda <- lda(site ~ hdlngth + skullw + totlngth + taill + footlgth +
earconch + eye + chest + belly, data=na.omit(possum))
# na.omit() omits any rows that have one or more missing values
options(digits=4)
possum.lda$svd # Examine the singular values
## [1] 15.4757 3.6967 3.1505 1.5236 1.2206 0.7772
plot(possum.lda, dimen=3)

# Scatterplot matrix - scores on 1st 3 canonical variates (Figure 12.5)
possum.lda
## Call:
## lda(site ~ hdlngth + skullw + totlngth + taill + footlgth + earconch +
## eye + chest + belly, data = na.omit(possum))
##
## Prior probabilities of groups:
## 1 2 3 4 5 6 7
## 0.32673 0.09901 0.06931 0.06931 0.12871 0.12871 0.17822
##
## Group means:
## hdlngth skullw totlngth taill footlgth earconch eye chest belly
## 1 93.72 57.20 89.71 36.38 73.00 52.58 15.02 27.88 33.27
## 2 90.17 55.57 82.00 34.55 70.59 52.15 14.36 26.80 31.20
## 3 94.57 58.90 88.07 37.21 66.60 45.26 16.07 27.57 34.86
## 4 97.61 61.69 92.24 39.71 68.93 45.84 15.47 29.64 34.57
## 5 92.18 56.23 86.92 37.65 64.73 43.87 15.38 26.65 32.04
## 6 89.25 54.19 84.54 37.65 63.07 43.97 15.34 25.23 31.50
## 7 92.63 57.23 85.69 37.69 65.74 45.86 14.47 26.14 31.92
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3 LD4 LD5 LD6
## hdlngth -0.149398 0.08482 -0.24269 -0.02715 -0.08417 -0.18666
## skullw -0.025583 0.06244 -0.24225 0.10557 -0.14920 0.14122
## totlngth 0.116459 0.25458 0.32571 -0.17954 -0.08165 -0.11641
## taill -0.457716 -0.06902 -0.45324 -0.18352 0.30127 0.51219
## footlgth 0.292598 -0.02507 -0.03122 0.04571 0.06193 -0.10474
## earconch 0.580885 -0.06261 -0.08760 -0.07833 -0.03111 0.26918
## eye -0.054781 0.02837 0.77627 0.45217 0.21962 0.33476
## chest 0.102067 0.07237 0.02164 0.26416 0.67135 -0.04717
## belly 0.007293 -0.03328 0.10804 0.10749 -0.33763 0.18131
##
## Proportion of trace:
## LD1 LD2 LD3 LD4 LD5 LD6
## 0.8953 0.0511 0.0371 0.0087 0.0056 0.0023
## Sec 12.3: *High-dimensional data, classification, and plots
## What groups are of interest?
library(hddplot)
data(golubInfo)
with(golubInfo, table(cancer, tissue.mf))
## tissue.mf
## cancer BM:NA BM:f BM:m PB:NA PB:f PB:m
## allB 4 19 10 2 1 2
## allT 0 0 8 0 0 1
## aml 16 2 3 1 1 2
attach(golubInfo)
## Identify allB samples for that are BM:f or BM:m or PB:m
subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m")
## Form vector that identifies these as BM:f or BM:m or PB:m
tissue.mfB <- tissue.mf[subsetB, drop=TRUE]
## Separate off the relevant columns of the matrix Golub
data(Golub) # NB: variables (rows) by cases (columns)
GolubB <- Golub[, subsetB]
detach(golubInfo)
## ss 12.3.1: Classifications and associated graphs
## Preliminary data manipulation
## Footnote Code
## Display distributions for the first 20 observations
boxplot(data.frame(GolubB[, 1:20])) # First 20 columns (observations)

## Random selection of 20 rows (features)
boxplot(data.frame(GolubB[sample(1:7129, 20), ]))

## ss 12.3.2: Flawed graphs
## Footnote Code
## Uses orderFeatures() (hddplot); see below
ord15 <- orderFeatures(GolubB, cl=tissue.mfB)[1:15]
## Footnote Code
## Panel A
dfB.ord <- data.frame(t(GolubB[ord15, ]))
## Calculations for the left panel
## Transpose to observations by features
dfB15 <- data.frame(t(GolubB[ord15, ]))
library(MASS)
dfB15.lda <- lda(dfB15, grouping=tissue.mfB)
scores <- predict(dfB15.lda, dimen=2)$x
## Scores for the single PB:f observation
chooseCols <- with(golubInfo, tissue.mf=="PB:f"& cancer=="allB")
df.PBf <- data.frame(t(Golub[ord15, chooseCols, drop=FALSE]))
scores.PBf <- predict(dfB15.lda, newdata=df.PBf, dimen=2)$x
## Warning! The plot that now follows may be misleading!
## Use scoreplot(), from the hddplot package
scoreplot(list(scores=scores, cl=tissue.mfB, other=scores.PBf, cl.other="PB:f"))

## Footnote Code
## Panel B: Repeat plot, now with random normal data
simscores <- simulateScores(nrow=7129, cl=rep(1:3, c(19,10,2)),
cl.other=4, nfeatures=15, seed=41)
# Returns list elements: scores, cl, scores.other & cl.other
scoreplot(simscores)

## Distributional extremes
## The calculation may take tens of minutes, even with adequate
## memory and a fast processor.
## If necessary, use a smaller value of B.
library(multtest)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
GolubB.maxT <- mt.maxT(GolubB, unclass(tissue.mfB)-1, test="f",
B=100000)
## b=1000 b=2000 b=3000 b=4000 b=5000 b=6000 b=7000 b=8000 b=9000 b=10000
## b=11000 b=12000 b=13000 b=14000 b=15000 b=16000 b=17000 b=18000 b=19000 b=20000
## b=21000 b=22000 b=23000 b=24000 b=25000 b=26000 b=27000 b=28000 b=29000 b=30000
## b=31000 b=32000 b=33000 b=34000 b=35000 b=36000 b=37000 b=38000 b=39000 b=40000
## b=41000 b=42000 b=43000 b=44000 b=45000 b=46000 b=47000 b=48000 b=49000 b=50000
## b=51000 b=52000 b=53000 b=54000 b=55000 b=56000 b=57000 b=58000 b=59000 b=60000
## b=61000 b=62000 b=63000 b=64000 b=65000 b=66000 b=67000 b=68000 b=69000 b=70000
## b=71000 b=72000 b=73000 b=74000 b=75000 b=76000 b=77000 b=78000 b=79000 b=80000
## b=81000 b=82000 b=83000 b=84000 b=85000 b=86000 b=87000 b=88000 b=89000 b=90000
## b=91000 b=92000 b=93000 b=94000 b=95000 b=96000 b=97000 b=98000 b=99000 b=100000
## Compare calculated F-statistics with permutation distribution
qqthin(qf(1-GolubB.maxT$rawp, 2, 28), GolubB.maxT$teststat)

## [1] "Graph retains 302 points."
## Compare calculated F-statistics with theoretical F-distribution
qqthin(qf(ppoints(7129), 2, 28), GolubB.maxT$teststat)

## [1] "Graph retains 274 points."
# The theoretical F-distribution gives estimates of quantiles
# that are too small
## NB also (not included in Figure 12.10) the comparison between
## the permutation distribution and the theoretical F:
qqthin(qf(ppoints(7129), 2, 28), qf(1-GolubB.maxT$rawp, 2, 28))

## [1] "Graph retains 456 points."
# qqthin() is a version of qqplot() that thins out points where
# overlap is substantial, thus giving smaller graphics files.
## Selection of features that discriminate
## ss 12.3.3: Accuracies and Scores for test data
Golub.BM <- with(golubInfo, Golub[, BM.PB=="BM"])
cancer.BM <- with(golubInfo, cancer[BM.PB=="BM"])
## Now split each of the cancer.BM categories between two subsets
## Uses divideUp(), from hddplot
gp.id <- divideUp(cancer.BM, nset=2, seed=29)
# Set seed to allow exact reproduction of the results below
table(gp.id, cancer.BM)
## cancer.BM
## gp.id allB allT aml
## 1 17 4 10
## 2 16 4 11
accboth <- accTrainTest(x = Golub.BM, cl = cancer.BM,
traintest=gp.id)
## 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:24:25:26:
##
## Training/test split Best accuracy, less 1SD Best accuracy
## I (training) / II (test) 0.89 (14 features) 0.94 (20 features)
## II (training) / I (test) 0.92 (10 features) 0.97 (17 features)
## Footnote Code
## Use function plotTrainTest() from hddplot
plotTrainTest(x=Golub.BM, nfeatures=c(14,10), cl=cancer.BM, traintest=gp.id)

rbind(accboth$sub1.2[1:20],accboth$sub2.1[1:20])
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 4050 2794 6510 6696 4342 5542 4357 5543 1207 4584 6236 1429 6575
## [2,] 6606 4342 6510 3594 4050 6236 1694 1207 1268 4847 5542 2061 5543
## [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 2833 4750 2335 1704 4882 6225 3544
## [2,] 4055 4375 1144 379 6696 4196 229
match(accboth$sub1.2[1:20],accboth$sub2.1[1:20])
## [1] 5 NA 3 18 2 11 NA 13 8 NA 6 NA NA NA NA NA NA NA NA NA
## Cross-validation to determine the optimum number of features
## Cross-validation to determine the optimum number of features
## Accuracy measure will be: tissue.mfB.cv$acc.cv
tissue.mfB.cv <- cvdisc(GolubB, cl=tissue.mfB, nfeatures=1:23,
nfold=c(10,4)) # 10-fold CV (x4)
##
## Preliminary per fold calculations
## 1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:
## Show each choice of number of features:
## 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:
##
## Accuracy Best accuracy, less 1SD Best accuracy
## (Cross-validation) 0.85 (3 features) 0.9 (4 features)
## Defective measures will be in acc.resub (resubstitution)
## and acc.sel1 (select features prior to cross-validation)
tissue.mfB.badcv <- defectiveCVdisc(GolubB, cl=tissue.mfB,
foldids=tissue.mfB.cv$folds,
nfeatures=1:23)
## NB: Warning messages have been omitted
##
## Calculations for random normal data:
set.seed(43)
rGolubB <- matrix(rnorm(prod(dim(GolubB))), nrow=dim(GolubB)[1])
rtissue.mfB.cv <- cvdisc(rGolubB, cl=tissue.mfB, nfeatures=1:23,
nfold=c(10,4))
## [1] "Input rows (features) are not named. Names"
## [1] "1:7129 will be assigned."
##
## Preliminary per fold calculations
## 1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:
## Show each choice of number of features:
## 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:
##
## Accuracy Best accuracy, less 1SD Best accuracy
## (Cross-validation) 0.39 (1 features) 0.48 (9 features)
rtissue.mfB.badcv <- defectiveCVdisc(rGolubB, cl=tissue.mfB,
nfeatures=1:23,
foldids=rtissue.mfB.cv$folds)
## [1] "Input rows (features) are not named. Names"
## [1] "1:7129 will be assigned."
## Which features?
genelist <- matrix(tissue.mfB.cv$genelist[1:3, ,], nrow=3)
tab <- table(genelist, row(genelist))
ord <- order(tab[,1], tab[,2], decreasing=TRUE)
tab[ord,]
##
## genelist 1 2 3
## M58459_at 32 4 0
## S74221_at 4 0 0
## U29195_at 4 0 0
## X54870_at 0 16 8
## U91327_at 0 8 16
## L08666_at 0 4 0
## U49395_at 0 4 0
## X00437_s_at 0 4 0
## X53416_at 0 0 4
## X62654_rna1_at 0 0 8
## X82494_at 0 0 4
## Cross-validation: bone marrow ({BM}) samples only
BMonly.cv <- cvdisc(Golub.BM, cl=cancer.BM, nfeatures=1:25,
nfold=c(10,4))
##
## Preliminary per fold calculations
## 1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:1:2:3:4:5:6:7:8:9:10:
## Show each choice of number of features:
## 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:24:25:
##
## Accuracy Best accuracy, less 1SD Best accuracy
## (Cross-validation) 0.9 (19 features) 0.94 (23 features)
## ss 12.3.4: Graphs derived from the cross-validation process
## Panel A: Uses tissue.mfB.acc from above
tissue.mfB.scores <-
cvscores(cvlist = tissue.mfB.cv, nfeatures = 3, cl.other = NULL)
## 1:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:10
## 1:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:10
scoreplot(scorelist = tissue.mfB.scores, cl.circle=NULL,
prefix="B-cell subset -")

## Footnote Code
## Panel B; classify bone marrow samples a/c cancer type.
BMonly.scores <- cvscores(cvlist=BMonly.cv, nfeatures=19, cl.other=NULL)
## 1:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:10
## 1:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:101:2:3:4:5:6:7:8:9:10
scoreplot(scorelist=BMonly.scores, cl.circle=tissue.mfB,
circle=tissue.mfB%in%c("BM:f","BM:m"),
params=list(circle=list(col=c("cyan","gray"))),
prefix="B: BM samples -")

## Further comments
## Sec 12.4: Further Reading
library(DAAG); library(oz)
oz(sections=c(3:5, 11:16))
with(possumsites, points(Longitude, Latitude))
posval <- c(2, 4, 2, 2, 4, 2, 2)
with(possumsites,
text(Longitude, Latitude, row.names(possumsites), pos=posval))

`confusion` <-
function(actual, predicted, digits=4){
tab <- table(actual, predicted)
confuse <- apply(tab, 1, function(x)x/sum(x))
print(round(confuse, digits))
acc <- sum(tab[row(tab)==col(tab)])/sum(tab)
invisible(print(c("Overall accuracy" = round(acc,digits))))
}
library(MASS); library(DAAG); library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:Biobase':
##
## combine
##
## The following object is masked from 'package:BiocGenerics':
##
## combine
library(mlbench)
data(Vehicle)
lhat <- lda(Class ~ ., data=Vehicle, CV=TRUE)$class
qhat <- lda(Class ~ ., data=Vehicle, CV=TRUE)$class
confusion(Vehicle$Class, lhat)
## actual
## bus opel saab van
## bus 0.9587 0.0377 0.0507 0.0101
## opel 0.0183 0.6132 0.2995 0.0151
## saab 0.0092 0.3160 0.5899 0.0101
## van 0.0138 0.0330 0.0599 0.9648
## Overall accuracy
## 0.779
confusion(Vehicle$Class, qhat)
## actual
## bus opel saab van
## bus 0.9587 0.0377 0.0507 0.0101
## opel 0.0183 0.6132 0.2995 0.0151
## saab 0.0092 0.3160 0.5899 0.0101
## van 0.0138 0.0330 0.0599 0.9648
## Overall accuracy
## 0.779
randomForest(Class ~ ., data=Vehicle, CV=TRUE)
##
## Call:
## randomForest(formula = Class ~ ., data = Vehicle, CV = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 26.24%
## Confusion matrix:
## bus opel saab van class.error
## bus 215 1 0 2 0.01376
## opel 2 96 105 9 0.54717
## saab 6 80 118 13 0.45622
## van 1 0 3 195 0.02010
Vehicle.lda <- lda(Class ~ ., data=Vehicle)
twoD <- predict(Vehicle.lda)$x
qplot(twoD[,1], twoD[,2], color=Vehicle$Class,
geom=c("point","density2d"))

library(ape); library(MASS)
## webpage <-
## "http://evolution.genetics.washington.edu/book/primates.dna"
## primates.dna <- read.dna(webpage)
# Alternatively, get primateDNA from the DAAGbio package
library(DAAGbio)
## Loading required package: limma
##
## Attaching package: 'limma'
##
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
primates.dna <- as.DNAbin(primateDNA)
primates.dist <- dist.dna(primates.dna, model="K80")
primates.cmd <- cmdscale(primates.dist)
eqscplot(primates.cmd)
rtleft <- c(4,2,4,2)[unclass(cut(primates.cmd[,1], breaks=4))]
text(primates.cmd, labels=row.names(primates.cmd), pos=rtleft)

d <- dist(primates.cmd)
sum((d-primates.dist)^2)/sum(primates.dist^2)
## [1] 0.1009
library(DAAG)
pacific.dist <- dist(x = as.matrix(rockArt[-c(47, 54, 60, 63, 92),
28:641]), method = "binary")
sum(pacific.dist==1)/length(pacific.dist)
## [1] 0.6312
plot(density(pacific.dist, to = 1))

## Check that all columns have at least one distance < 1
symmat <- as.matrix(pacific.dist)
table(apply(symmat, 2, function(x) sum(x<1)))
##
## 1 2 3 4 5 6 7 8 13 14 15 17 19 21 22 23 25 27 28 29 30 31 32 33 34
## 4 3 1 1 3 1 1 3 1 1 2 1 1 1 2 1 4 1 1 3 3 1 1 1 2
## 36 37 40 41 42 43 44 45 46 47 49 50 51 52 53 54 55 56 57 58 60 62 63 65 66
## 1 3 3 1 2 2 2 3 2 2 1 1 2 1 3 1 3 4 2 1 2 2 1 2 1
## 69 70 71 77 85
## 2 1 1 1 1
pacific.cmd <- cmdscale(pacific.dist)
pacific.sam <- sammon(pacific.dist)
## Initial stress : 0.58369
## stress after 10 iters: 0.41947, magic = 0.018
## stress after 20 iters: 0.31989, magic = 0.009
## stress after 30 iters: 0.21724, magic = 0.098
## stress after 40 iters: 0.17060, magic = 0.226
## stress after 50 iters: 0.16035, magic = 0.101
## stress after 60 iters: 0.15895, magic = 0.101
## stress after 70 iters: 0.15800, magic = 0.500
## stress after 80 iters: 0.15744, magic = 0.500
## stress after 90 iters: 0.15720, magic = 0.500
## stress after 100 iters: 0.15712, magic = 0.500