## Chapter 2: Styles of Data Analysis
## Sec 2.1: Revealing Views of the Data
## ss 2.1.1: Views of a single sample
## Histograms and density plots
library(DAAG) # Ensure that the DAAG package is attached
## Loading required package: lattice
## Form the subset of possum that holds data on females only
ftotlngth <- with(possum, totlngth[sex=="f"])
## Footnote Code
## To get a 1 by 4 layout, precede with
opar <- par(mfrow = c(1,4), pty="s")
hist(ftotlngth, breaks = 72.5 + (0:5) * 5, ylim = c(0, 22),
xlab="Total length (cm)", main ="A: Breaks at 72.5, 77.5, ...")
hist(ftotlngth, breaks = 75 + (0:5) * 5, ylim = c(0, 22),
xlab="Total length (cm)", main="B: Breaks at 75, 80, ...")
dens <- density(ftotlngth)
xlim <- range(dens$x); ylim <- range(dens$y)
hist(ftotlngth, breaks = 72.5 + (0:5) * 5, probability = T,
xlim = xlim, ylim = ylim, xlab="Total length (cm)", main ="C: Breaks as in A")
lines(dens)
hist(ftotlngth, breaks = 75 + (0:5) * 5, probability = T,
xlim = xlim, ylim = ylim, xlab="Total length (cm)", main="D: Breaks as in B")
lines(dens)

par(opar)
par(mfrow=c(1,1))
## The stem-and-leaf display
with(ais, stem(ht[sport=="Row"]))
##
## The decimal point is 1 digit(s) to the right of the |
##
## 15 | 6
## 16 |
## 16 | 5
## 17 | 4
## 17 | 5678899
## 18 | 00000011223
## 18 | 55666668899
## 19 | 123
## 19 | 58
## Footnote Code
## Use quantile() to obtain the quartiles of ht: data frame ais (DAAG package)
quantile(ais$ht[ais$sport=="Row"], prob=c(.25,.5,.75))
## 25% 50% 75%
## 179.3 181.8 186.3
# For the 50th percentile (the 2nd quartile), an alternative is median()
## Boxplots
## Base graphics boxplot function
boxplot(ftotlngth, horizontal=TRUE)

## Alternative: lattice graphics bwplot function
bwplot(~ftotlngth, data=fossum)

## ss 2.1.2: Patterns in univariate time series
## Panel A
plot(log10(measles), xlab="", ylim=log10 (c(1,5000*1000)),
ylab=" Deaths; Population (log scale)", yaxt="n")
ytiks <- c(1, 10, 100, 1000, 1000000, 5000000)
## London population in thousands
londonpop <-
ts(c(1088,1258,1504,1778,2073,2491,2921,3336,3881,4266,
4563,4541,4498,4408), start=1801, end=1931, deltat=10)
points(log10(londonpop*1000), pch=16, cex=.5)
axis(2, at=log10(ytiks), labels=paste(ytiks), las=2)

## Panel B
plot(window(measles, start=1840, end=1882), ylim=c (0, 4600),
yaxt="n")
axis(2, at=(0:4)* 1000, labels=paste(0:4), las=2)

## Both graphs on the one graphics page
## Panel A:
par(fig=c(0, 1, .38, 1), cex=0.8) # 38% to 100% of page, in y-direction
plot(log10(measles), ylab="log10(Deaths)",
ylim=log10(c(1,5000*1000)))
mtext(side=3, line=0.5, "A (1629-1939)", adj=0)
## Panel B: window from 1840 to 1882; more complete code
par(fig=c(0, 1, 0, .4), new=TRUE) # 0% to 38% of height of figure region
plot(window(measles, start=1840, end=1882), ylab="Deaths")
mtext(side=3, line=0.5, "B (1841-1881)", adj=0)

par(fig=c(0, 1, 0, 1), cex=1) # Restore default figure region
## ss 2.1.3: Patterns in bivariate data
## Plot four vs one: data frame milk (DAAG)
xyrange <- range(milk)
par(pty="s") # square plotting region
plot(four ~ one, data = milk, xlim = xyrange, ylim = xyrange,
pch = 16)
rug(milk$one) # x-axis rug (default is side=1)
rug(milk$four, side = 2) # y-axis rug
abline(0, 1)

par(pty="m")
## The fitting of a smooth trend curve
## Plot ohms vs juice: data frame fruitohms (DAAG)
plot(ohms ~ juice, xlab="Apparent juice content (%)",
ylab="Resistance (ohms)", data=fruitohms)
## Add a smooth curve, as in Panel B
with(fruitohms, lines(lowess(juice, ohms), lwd=2))

# With lwd=2, the curve is twice the default thickness
## What is the appropriate scale?
## The following omits the labeling information
oldpar <- par(mfrow = c(1,2), pty="s")
## Plot brain vs body: data frame Animals (MASS package)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:DAAG':
##
## hills
plot(brain ~ body, data=Animals) # Panel A
plot(log(brain) ~ log(body), data=Animals) # Panel B

par(oldpar)
## ss 2.1.4: Patterns in grouped data -- lengths of cuckoo eggs
## Compare stripplot() with bwplot(), both from lattice package
stripplot(species ~ length, xlab="Length of egg (mm)", data=cuckoos)

bwplot(species ~ length, xlab="Length of egg (mm)", data=cuckoos,
scales=list(y=list(alternating=0)))

# alternating=0; omit y-axis labels
## Footnote Code
## For tidier labels replace ".", in several of the species names, by a space
specnam <- with(cuckoos, sub(pattern=".", replacement=" ", levels(species), fixed=TRUE))
# fixed=TRUE: do not interpret "." as a `regular expression',
## Panel A: Strip plot: data frame cuckoos (DAAG)
plt1 <- stripplot(species ~ length, factor.levels=specnam, data=cuckoos)
print(update(plt1, xlab="Length of egg (mm)"),
position=c(0,0,0.55,1)) # xmin, ymin, xmax, ymax
# Use print() to display lattice graphics objects
## Panel B: Box plot
plt2 <- bwplot(species ~ length, factor.levels=specnam, data=cuckoos)
print(update(plt2, xlab="Length of egg (mm)", scales=list(y=list(alternating=0))),
newpage=FALSE, position=c(0.55,0,1,1))

## Comparing densities between groups -- lattice style density plots
## Density plot for earconch: data frame possum (DAAG package)
library(lattice)
densityplot(~earconch | sex, groups=Pop, data=possum,
auto.key=list(space="right"))

## ss 2.1.5: \hspace*{-5pt}*\hspace*{5pt}Multiple variables and times
## Apply function range to columns of data frame jobs (DAAG)
sapply(jobs, range)
## BC Alberta Prairies Ontario Quebec Atlantic Date
## [1,] 1737 1366 973 5212 3167 941 95.00
## [2,] 1840 1436 999 5360 3257 968 96.92
## Simplified plot; all series in a single panel; use log scale
(simplejobsA.xyplot <-
xyplot(Ontario+Quebec+BC+Alberta+Prairies+Atlantic ~ Date,
outer=FALSE, data=jobs, type="b",
ylab="Number of workers", scales=list(y=list(log="e")),
auto.key=list(space="right", lines=TRUE)))

## Footnote Code
## Panel A: Update trellis object to improve x- and y-axis tick labels
datelabpos <- seq(from=95, by=0.5, length=5)
datelabs <- format(seq(from=as.Date("1Jan1995", format="%d%b%Y"),
by="6 month", length=5), "%b%y")
## Now create $y$-labels that have numbers, with log values underneath
ylabpos <- exp(pretty(log(unlist(jobs[,-7])), 5))
ylabels <- paste(round(ylabpos),"\n(", log(ylabpos), ")", sep="")
update(simplejobsA.xyplot, xlab="",
scales=list(x=list(at=datelabpos, labels=datelabs),
y=list(at=ylabpos, labels=ylabels)))

## Simplified code for Figure 2.9B
xyplot(Ontario+Quebec+BC+Alberta+Prairies+Atlantic ~ Date,
data=jobs, type="b", layout=c(3,2), ylab="Number of jobs",
scales=list(y=list(relation="sliced", log=TRUE)),
outer=TRUE)

## *Small proportional changes, on a scale of natural logarithms
## *Tick positions and labeling, on a logarithmic scale
## ss 2.1.6: Scatterplots, broken down by multiple factors
(target.xyplot <-
xyplot(csoa ~ it | sex*agegp, data=tinting, groups=target,
aspect=1, auto.key=list(columns=2)))

## Footnote Code
## Settings used for Figure 2.12B (suitable for grayscale on a printed page)
update(target.xyplot,
par.settings=simpleTheme(col=c("black","gray20"), pch=c(1, 16)))

# In the above, par.settings changed settings for this use of xyplot()
## Note the use of simpleTheme() for changing settings; see help(simpleTheme)
## Use trellis.par.set() to change settings while the current device is in use
(tint.xyplot <-
xyplot(csoa ~ it|sex*agegp, groups=tint, data=tinting, aspect=1,
type=c("p","smooth"), span=1.25, auto.key=list(columns=3)))

# "p": points; "smooth": a smooth curve
# With span=1.25, the smooth curve is close to a straight line
## Footnote Code
## Panel B, with refinements
themeB <- simpleTheme(col=c("skyblue1", "skyblue4")[c(2,1,2)], lwd=c(1,1,2),
pch=c(1,16,16)) # open, filled, filled
update(tint.xyplot, par.settings=themeB, legend=NULL,
auto.key=list(columns=3, points=TRUE, lines=TRUE))

# Set legend=NULL to allow new use of auto.key
## ss 2.1.7: What to look for in plots
## Outliers
## Asymmetry of the distribution
## Changes in variability
## Clustering
## Non-linearity
## Sec 2.2: Data Summary
## ss 2.2.1: Counts
## Table of counts example: data frame nswpsid1 (DAAG)
tab <- with(nswpsid1, table(trt, nodeg, useNA="ifany"))
dimnames(tab) <- list(trt=c("none", "training"),
educ = c("completed", "dropout"))
tab
## educ
## trt completed dropout
## none 1730 760
## training 80 217
## Addition over one or more margins of a table
stones <- array(c(81,6,234,36,192,71,55,25), dim=c(2,2,2),
dimnames=list(Success=c("yes","no"),
Method=c("open","ultrasound"),
Size=c("<2cm", ">=2cm")))
# NB: The margins are 1:Success, 2:Method, 3:Size
library(vcd)
## Loading required package: grid
mosaic(stones, sort=3:1) # c.f. mosaicplot() in base graphics

# Re-ordering the margins gives a more interpretable plot.
## Function to calculate percentage success rates
roundpc <- function(x)round(100*x[1]/sum(x), 1)
## Add "%Yes" to margin 1 (Success) of the table
stonesplus <- addmargins(stones, margin=1, FUN=c("%Yes"=roundpc))
## Print table, use layout similar to that shown alongside plot
ftable(stonesplus, col.vars=1)
## Success yes no %Yes
## Method Size
## open <2cm 81.0 6.0 93.1
## >=2cm 192.0 71.0 73.0
## ultrasound <2cm 234.0 36.0 86.7
## >=2cm 55.0 25.0 68.8
## Get sum for each margin 1,2 combination; i.e., sum over margin 3
stones12 <- margin.table(stones, margin=c(1,2))
stones12plus <- addmargins(stones12, margin=1, FUN=c("%Yes"=roundpc))
ftable(stones12plus, col.vars=1) # Table based on sums over Size
## Success yes no %Yes
## Method
## open 273.0 77.0 78.0
## ultrasound 289.0 61.0 82.6
## Footnote Code
## Add arguments that control size of textual components
mosaic(aperm(stones, 3:1), main=NULL, gp_varnames=gpar(fontsize=8),
labeling_args=list(gp_labels=gpar(fontsize=7),
legend_args=list(fontsize=7)))

## Tabulation that accounts for frequencies or weights -- the {xtabs()} function
library(DAAG)
## NB: The parentheses generate an implicit print(abtab)
(Atab <- xtabs(weight ~ airbag + dead, data=nassCDS))
## dead
## airbag alive dead
## none 5445246 39676
## airbag 6622691 25919
roundpc2 <- function(x)round(100*x[2]/sum(x), 2)
addmargins(Atab, margin=2, FUN=c("%Dead"=roundpc2))
## dead
## airbag alive dead %Dead
## none 5.445e+06 3.968e+04 7.200e-01
## airbag 6.623e+06 2.592e+04 3.900e-01
SAtab <- xtabs(weight ~ seatbelt + airbag + dead, data=nassCDS)
ftable(addmargins(SAtab, margin=3, FUN=c("%Dead"=roundpc2)),
col.vars=3)
## dead alive dead %Dead
## seatbelt airbag
## none none 1.342e+06 2.407e+04 1.760e+00
## airbag 8.719e+05 1.376e+04 1.550e+00
## belted none 4.103e+06 1.561e+04 3.800e-01
## airbag 5.751e+06 1.216e+04 2.100e-01
FSAtab <- xtabs(weight ~ dvcat + seatbelt + airbag + dead,
data=nassCDS)
ftable(addmargins(FSAtab, margin=4, FUN=c("%Dead"=roundpc2)),
col.vars=4)
## dead alive dead %Dead
## dvcat seatbelt airbag
## 1-9km/h none none 3.310e+04 0.000e+00 0.000e+00
## airbag 3.464e+04 1.399e+02 4.000e-01
## belted none 2.257e+05 0.000e+00 0.000e+00
## airbag 4.026e+05 0.000e+00 0.000e+00
## 10-24 none none 6.683e+05 1.666e+03 2.500e-01
## airbag 5.343e+05 2.617e+03 4.900e-01
## belted none 2.670e+06 9.030e+02 3.000e-02
## airbag 4.259e+06 8.293e+02 2.000e-02
## 25-39 none none 4.797e+05 9.544e+03 1.950e+00
## airbag 2.433e+05 2.702e+03 1.100e+00
## belted none 1.006e+06 5.601e+03 5.500e-01
## airbag 9.255e+05 3.093e+03 3.300e-01
## 40-54 none none 1.260e+05 4.767e+03 3.640e+00
## airbag 4.778e+04 3.930e+03 7.600e+00
## belted none 1.692e+05 4.705e+03 2.710e+00
## airbag 1.361e+05 3.157e+03 2.270e+00
## 55+ none none 3.484e+04 8.090e+03 1.885e+01
## airbag 1.182e+04 4.372e+03 2.700e+01
## belted none 3.239e+04 4.400e+03 1.196e+01
## airbag 2.720e+04 5.080e+03 1.574e+01
## ss 2.2.2: Summaries of information from data frames
## Summary as a prelude to analysis -- {aggregate()}
## Footnote Code
## Individual vine means, by block and treatment
library(lattice)
## Panel function calls panel.dotplot(), then panel.average()
dotplot(shade ~ yield | block, data=kiwishade, aspect=1,
panel=function(x,y,...){panel.dotplot(x, y, pch=1, col="gray40")
av <- sapply(split(x,y),mean)
ypos <- unique(y)
lpoints(ypos~av, pch=3, cex=1.25,
col="black")},
key=list(space="top", columns=2, col=c("gray40", "black"),
text=list(c("Individual vine yields", "Plot means (4 vines)")),
points=list(pch=c(1,3), cex=c(1,1.25))), layout=c(3,1))

# Note that parameter settings were given both in the calls to the
# panel functions and in the list supplied to key.
## mean yield by block by shade: data frame kiwishade (DAAG)
kiwimeans <- with(kiwishade,
aggregate(yield, by=list(block, shade), mean))
names(kiwimeans) <- c("block","shade","meanyield")
head(kiwimeans, 4)
## block shade meanyield
## 1 east none 99.03
## 2 north none 104.03
## 3 west none 97.56
## 4 east Aug2Dec 105.56
# . . .
## The benefits of data summary -- dengue status example
## ss 2.2.3: Standard deviation and inter-quartile range
## Cuckoo eggs example
## Footnote Code
## SD of length, by species: data frame cuckoos (DAAG)
sapply(split(cuckoos$length, cuckoos$species), sd)
## hedge.sparrow meadow.pipit pied.wagtail robin tree.pipit
## 1.0494 0.9196 1.0723 0.6821 0.8801
## wren
## 0.7542
# Subsection 14.9.6 has information on split()
## Degrees of freedom
## Other measures of variability
## The pooled standard deviation
## Elastic bands example
## ss 2.2.4: Correlation
## Correlation between body and brain: data frame Animals (MASS)
## Product--moment correlation
with(Animals, cor(body, brain))
## [1] -0.005341
## Product--moment correlation, after log transformation
with(log(Animals), cor(body, brain))
## [1] 0.7795
## Spearman rank correlation
with(Animals, cor(body, brain, method="spearman"))
## [1] 0.7163
## Sec 2.3: Statistical Analysis Questions, Aims and Strategies
## Effective use of the information in the data
## Observational versus experimental data
## ss 2.3.1: How relevant and how reliable are the data?
## ss 2.3.2: How will results be used?
## ss 2.3.3: Formal and informal assessments
## Questionnaires and surveys
## ss 2.3.4: Statistical Analysis Strategies
## ss 2.3.5: Planning the formal analysis
## ss 2.3.6: Changes to the intended plan of analysis
## Sec 2.4: Recap
## Sec 2.5: Further Reading
attach(cuckoohosts)
plot(c(clength, hlength), c(cbreadth, hbreadth),
col=rep(1:2,c(12,12)))
for(i in 1:12)lines(c(clength[i], hlength[i]),
c(cbreadth[i], hbreadth[i]))
text(hlength, hbreadth, abbreviate(rownames(cuckoohosts),8))

detach(cuckoohosts)
deathrate <- c(40.7, 36,27,30.5,27.6,83.5)
hosp <- c("Cliniques of Vienna (1834-63)\n(> 2000 cases pa)",
"Enfans Trouves at Petersburg\n(1845-59, 1000-2000 cases pa)",
"Pesth (500-1000 cases pa)",
"Edinburgh (200-500 cases pa)",
"Frankfort (100-200 cases pa)", "Lund (< 100 cases pa)")
hosp <- factor(hosp, levels=hosp[order(deathrate)])
dotplot(hosp~deathrate, xlim=c(0,95), xlab="Death rate per 1000 ",
type=c("h","p"))

## Source: Nightingale (1871). Data are ascribed to Dr Le Fort
library(Devore6) # Or Devore5 or (wide format) Devore7
tomatoes <- ex10.22
with(Animals, cor(brain,body))
## [1] -0.005341
with(Animals, cor(log(brain),log(body)))
## [1] 0.7795
with(Animals, cor(log(brain),log(body), method="spearman"))
## [1] 0.7163
bwplot(shade ~ yield|block, data=kiwishade, layout=c(3,1))
