eda.R

## Chapter 2: Styles of Data Analysis 

## Sec 2.1: Revealing Views of the Data 
## ss 2.1.1: Views of a single sample 
##                    Histograms and density plots 
library(DAAG)        # Ensure that the DAAG package is attached

## Loading required package: lattice

## Form the subset of possum that holds data on females only 
ftotlngth <- with(possum, totlngth[sex=="f"]) 

## Footnote Code
## To get a 1 by 4 layout, precede with 
opar <- par(mfrow = c(1,4), pty="s") 

hist(ftotlngth, breaks = 72.5 + (0:5) * 5, ylim = c(0, 22), 
     xlab="Total length (cm)", main ="A: Breaks at 72.5, 77.5, ...") 
hist(ftotlngth, breaks = 75 + (0:5) * 5, ylim = c(0, 22), 
     xlab="Total length (cm)", main="B: Breaks at 75, 80, ...") 

dens <- density(ftotlngth)  
xlim <- range(dens$x); ylim <- range(dens$y) 
hist(ftotlngth, breaks = 72.5 + (0:5) * 5, probability = T, 
     xlim = xlim, ylim = ylim, xlab="Total length (cm)", main ="C: Breaks as in A") 
lines(dens) 
hist(ftotlngth, breaks = 75 + (0:5) * 5, probability = T, 
     xlim = xlim, ylim = ylim, xlab="Total length (cm)", main="D: Breaks as in B") 
lines(dens)

plot of chunk unnamed-chunk-1

par(opar)
par(mfrow=c(1,1))

##                     The stem-and-leaf display 
with(ais, stem(ht[sport=="Row"]))

## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   15 | 6
##   16 | 
##   16 | 5
##   17 | 4
##   17 | 5678899
##   18 | 00000011223
##   18 | 55666668899
##   19 | 123
##   19 | 58

## Footnote Code
## Use quantile() to obtain the quartiles of ht: data frame ais (DAAG package) 
quantile(ais$ht[ais$sport=="Row"], prob=c(.25,.5,.75))

##   25%   50%   75% 
## 179.3 181.8 186.3

 # For the 50th percentile (the 2nd quartile), an alternative is median() 

##                              Boxplots 
## Base graphics boxplot function 
boxplot(ftotlngth, horizontal=TRUE)

plot of chunk unnamed-chunk-1

## Alternative: lattice graphics bwplot function 
bwplot(~ftotlngth, data=fossum)

plot of chunk unnamed-chunk-1

## ss 2.1.2: Patterns in univariate time series 
## Panel A 
plot(log10(measles), xlab="", ylim=log10 (c(1,5000*1000)), 
     ylab=" Deaths; Population (log scale)", yaxt="n") 
ytiks <- c(1, 10, 100, 1000, 1000000, 5000000) 
## London population in thousands 
londonpop <-  
  ts(c(1088,1258,1504,1778,2073,2491,2921,3336,3881,4266, 
       4563,4541,4498,4408), start=1801, end=1931, deltat=10)  
points(log10(londonpop*1000), pch=16, cex=.5) 
axis(2, at=log10(ytiks), labels=paste(ytiks), las=2)

plot of chunk unnamed-chunk-1

## Panel B 
plot(window(measles, start=1840, end=1882), ylim=c (0, 4600),  
     yaxt="n") 
axis(2, at=(0:4)* 1000, labels=paste(0:4), las=2)

plot of chunk unnamed-chunk-1

## Both graphs on the one graphics page
## Panel A:  
par(fig=c(0, 1, .38, 1), cex=0.8)   # 38% to 100% of page, in y-direction 
plot(log10(measles), ylab="log10(Deaths)",  
     ylim=log10(c(1,5000*1000))) 
mtext(side=3, line=0.5, "A (1629-1939)", adj=0) 
## Panel B: window from 1840 to 1882; more complete code 
par(fig=c(0, 1, 0, .4), new=TRUE)  # 0% to 38% of height of figure region 
plot(window(measles, start=1840, end=1882), ylab="Deaths") 
mtext(side=3, line=0.5, "B (1841-1881)", adj=0)

plot of chunk unnamed-chunk-1

par(fig=c(0, 1, 0, 1), cex=1)     # Restore default figure region 

## ss 2.1.3: Patterns in bivariate data 
## Plot four vs one: data frame milk (DAAG) 
xyrange <- range(milk) 
par(pty="s")      # square plotting region 
plot(four ~ one, data = milk, xlim = xyrange, ylim = xyrange,  
     pch = 16)    
rug(milk$one)              # x-axis rug (default is side=1) 
rug(milk$four, side = 2)   # y-axis rug 
abline(0, 1)

plot of chunk unnamed-chunk-1

par(pty="m")

##                The fitting of a smooth trend curve 
## Plot ohms vs juice: data frame fruitohms (DAAG) 
plot(ohms ~ juice, xlab="Apparent juice content (%)", 
     ylab="Resistance (ohms)", data=fruitohms) 
## Add a smooth curve, as in Panel B 
with(fruitohms, lines(lowess(juice, ohms), lwd=2))

plot of chunk unnamed-chunk-1

 # With lwd=2, the curve is twice the default thickness 

##                   What is the appropriate scale? 
## The following omits the labeling information 
oldpar <- par(mfrow = c(1,2), pty="s")   
## Plot brain vs body: data frame Animals (MASS package) 
library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:DAAG':
## 
##     hills

plot(brain ~ body, data=Animals)             # Panel A 
plot(log(brain) ~ log(body), data=Animals)   # Panel B

plot of chunk unnamed-chunk-1

par(oldpar) 

## ss 2.1.4: Patterns in grouped data -- lengths of cuckoo eggs 
## Compare stripplot() with bwplot(), both from lattice package 
stripplot(species ~ length, xlab="Length of egg (mm)", data=cuckoos)

plot of chunk unnamed-chunk-1

bwplot(species ~ length, xlab="Length of egg (mm)", data=cuckoos, 
       scales=list(y=list(alternating=0)))

plot of chunk unnamed-chunk-1

  # alternating=0; omit y-axis labels 

## Footnote Code
## For tidier labels replace ".", in several of the species names, by a space 
specnam <- with(cuckoos, sub(pattern=".", replacement=" ", levels(species), fixed=TRUE)) 
  # fixed=TRUE: do not interpret "." as a `regular expression',  
## Panel A: Strip plot: data frame cuckoos (DAAG) 
plt1 <- stripplot(species ~ length, factor.levels=specnam, data=cuckoos) 
print(update(plt1, xlab="Length of egg (mm)"),  
      position=c(0,0,0.55,1))  # xmin, ymin, xmax, ymax 
  # Use print() to display lattice graphics objects  
## Panel B: Box plot 
plt2 <- bwplot(species ~ length, factor.levels=specnam, data=cuckoos) 
print(update(plt2, xlab="Length of egg (mm)", scales=list(y=list(alternating=0))), 
      newpage=FALSE, position=c(0.55,0,1,1))

plot of chunk unnamed-chunk-1

##  Comparing densities between groups -- lattice style density plots 
## Density plot for earconch: data frame possum (DAAG package) 
library(lattice) 
densityplot(~earconch | sex, groups=Pop, data=possum,  
                       auto.key=list(space="right"))

plot of chunk unnamed-chunk-1

## ss 2.1.5: \hspace*{-5pt}*\hspace*{5pt}Multiple variables and times 
## Apply function range to columns of data frame jobs (DAAG) 
sapply(jobs, range)

##        BC Alberta Prairies Ontario Quebec Atlantic  Date
## [1,] 1737    1366      973    5212   3167      941 95.00
## [2,] 1840    1436      999    5360   3257      968 96.92

## Simplified plot; all series in a single panel; use log scale 
(simplejobsA.xyplot <- 
  xyplot(Ontario+Quebec+BC+Alberta+Prairies+Atlantic ~ Date,  
         outer=FALSE, data=jobs, type="b",  
         ylab="Number of workers", scales=list(y=list(log="e")), 
         auto.key=list(space="right", lines=TRUE)))

plot of chunk unnamed-chunk-1

## Footnote Code
##  Panel A: Update trellis object to improve x- and y-axis tick labels 
datelabpos <- seq(from=95, by=0.5, length=5) 
datelabs <- format(seq(from=as.Date("1Jan1995", format="%d%b%Y"),   
                       by="6 month", length=5), "%b%y") 
## Now create $y$-labels that have numbers, with log values underneath 
ylabpos <- exp(pretty(log(unlist(jobs[,-7])), 5)) 
ylabels <- paste(round(ylabpos),"\n(", log(ylabpos), ")", sep="")  
update(simplejobsA.xyplot, xlab="",  
       scales=list(x=list(at=datelabpos, labels=datelabs), 
                   y=list(at=ylabpos, labels=ylabels)))

plot of chunk unnamed-chunk-1

## Simplified code for Figure 2.9B 
xyplot(Ontario+Quebec+BC+Alberta+Prairies+Atlantic ~ Date,  
       data=jobs, type="b", layout=c(3,2), ylab="Number of jobs", 
       scales=list(y=list(relation="sliced", log=TRUE)),  
       outer=TRUE)

plot of chunk unnamed-chunk-1

##   *Small proportional changes, on a scale of natural logarithms 
##        *Tick positions and labeling, on a logarithmic scale 
## ss 2.1.6: Scatterplots, broken down by multiple factors 
(target.xyplot <- 
  xyplot(csoa ~ it | sex*agegp, data=tinting, groups=target,  
         aspect=1, auto.key=list(columns=2)))

plot of chunk unnamed-chunk-1

## Footnote Code
## Settings used for Figure 2.12B (suitable for grayscale on a printed page) 
update(target.xyplot,  
       par.settings=simpleTheme(col=c("black","gray20"), pch=c(1, 16)))

$plot of chunk unnamed-chunk-1$

 # In the above, par.settings changed settings for this use of xyplot() 
## Note the use of simpleTheme() for changing settings; see help(simpleTheme) 
## Use trellis.par.set() to change settings while the current device is in use 

(tint.xyplot <-  
  xyplot(csoa ~ it|sex*agegp, groups=tint, data=tinting, aspect=1,
         type=c("p","smooth"), span=1.25, auto.key=list(columns=3)))

plot of chunk unnamed-chunk-1

 # "p": points; "smooth": a smooth curve 
 # With span=1.25, the smooth curve is close to a straight line 

## Footnote Code
## Panel B, with refinements 
themeB <- simpleTheme(col=c("skyblue1", "skyblue4")[c(2,1,2)], lwd=c(1,1,2), 
                            pch=c(1,16,16))  # open, filled, filled 
update(tint.xyplot, par.settings=themeB, legend=NULL, 
       auto.key=list(columns=3, points=TRUE, lines=TRUE))

plot of chunk unnamed-chunk-1

 # Set legend=NULL to allow new use of auto.key 

## ss 2.1.7: What to look for in plots 
##                              Outliers 
##                   Asymmetry of the distribution 
##                       Changes in variability 
##                             Clustering 
##                           Non-linearity 

## Sec 2.2: Data Summary 
## ss 2.2.1: Counts 
## Table of counts example: data frame nswpsid1 (DAAG) 
tab <- with(nswpsid1, table(trt, nodeg, useNA="ifany")) 
dimnames(tab) <- list(trt=c("none", "training"),  
                      educ = c("completed", "dropout")) 
tab

##           educ
## trt        completed dropout
##   none          1730     760
##   training        80     217

##            Addition over one or more margins of a table 
stones <- array(c(81,6,234,36,192,71,55,25), dim=c(2,2,2), 
                dimnames=list(Success=c("yes","no"),  
                              Method=c("open","ultrasound"), 
                              Size=c("<2cm", ">=2cm"))) 
  # NB: The margins are 1:Success, 2:Method, 3:Size 
library(vcd)

## Loading required package: grid

mosaic(stones, sort=3:1)  # c.f. mosaicplot() in base graphics

plot of chunk unnamed-chunk-1

  # Re-ordering the margins gives a more interpretable plot. 

## Function to calculate percentage success rates  
roundpc <- function(x)round(100*x[1]/sum(x), 1)  
## Add "%Yes" to margin 1 (Success) of the table  
stonesplus <- addmargins(stones, margin=1, FUN=c("%Yes"=roundpc))  
## Print table, use layout similar to that shown alongside plot  
ftable(stonesplus, col.vars=1)

##                  Success   yes    no  %Yes
## Method     Size                           
## open       <2cm           81.0   6.0  93.1
##            >=2cm         192.0  71.0  73.0
## ultrasound <2cm          234.0  36.0  86.7
##            >=2cm          55.0  25.0  68.8

## Get sum for each margin 1,2 combination; i.e., sum over margin 3 
stones12 <- margin.table(stones, margin=c(1,2))  
stones12plus <- addmargins(stones12, margin=1, FUN=c("%Yes"=roundpc)) 
ftable(stones12plus, col.vars=1) # Table based on sums over Size

##            Success   yes    no  %Yes
## Method                              
## open               273.0  77.0  78.0
## ultrasound         289.0  61.0  82.6

## Footnote Code
## Add arguments that control size of textual components 
mosaic(aperm(stones, 3:1), main=NULL, gp_varnames=gpar(fontsize=8), 
       labeling_args=list(gp_labels=gpar(fontsize=7), 
       legend_args=list(fontsize=7)))

plot of chunk unnamed-chunk-1

##  Tabulation that accounts for frequencies or weights -- the {xtabs()} function 
library(DAAG) 
## NB: The parentheses generate an implicit print(abtab) 
(Atab <- xtabs(weight ~ airbag + dead, data=nassCDS))

##         dead
## airbag     alive    dead
##   none   5445246   39676
##   airbag 6622691   25919

roundpc2 <- function(x)round(100*x[2]/sum(x), 2) 
addmargins(Atab, margin=2, FUN=c("%Dead"=roundpc2))

##         dead
## airbag       alive      dead     %Dead
##   none   5.445e+06 3.968e+04 7.200e-01
##   airbag 6.623e+06 2.592e+04 3.900e-01

SAtab <- xtabs(weight ~ seatbelt + airbag + dead, data=nassCDS) 
ftable(addmargins(SAtab, margin=3, FUN=c("%Dead"=roundpc2)),  
       col.vars=3)

##                 dead     alive      dead     %Dead
## seatbelt airbag                                   
## none     none        1.342e+06 2.407e+04 1.760e+00
##          airbag      8.719e+05 1.376e+04 1.550e+00
## belted   none        4.103e+06 1.561e+04 3.800e-01
##          airbag      5.751e+06 1.216e+04 2.100e-01

FSAtab <- xtabs(weight ~ dvcat + seatbelt + airbag + dead,  
                data=nassCDS) 
ftable(addmargins(FSAtab, margin=4, FUN=c("%Dead"=roundpc2)),  
       col.vars=4)

##                         dead     alive      dead     %Dead
## dvcat   seatbelt airbag                                   
## 1-9km/h none     none        3.310e+04 0.000e+00 0.000e+00
##                  airbag      3.464e+04 1.399e+02 4.000e-01
##         belted   none        2.257e+05 0.000e+00 0.000e+00
##                  airbag      4.026e+05 0.000e+00 0.000e+00
## 10-24   none     none        6.683e+05 1.666e+03 2.500e-01
##                  airbag      5.343e+05 2.617e+03 4.900e-01
##         belted   none        2.670e+06 9.030e+02 3.000e-02
##                  airbag      4.259e+06 8.293e+02 2.000e-02
## 25-39   none     none        4.797e+05 9.544e+03 1.950e+00
##                  airbag      2.433e+05 2.702e+03 1.100e+00
##         belted   none        1.006e+06 5.601e+03 5.500e-01
##                  airbag      9.255e+05 3.093e+03 3.300e-01
## 40-54   none     none        1.260e+05 4.767e+03 3.640e+00
##                  airbag      4.778e+04 3.930e+03 7.600e+00
##         belted   none        1.692e+05 4.705e+03 2.710e+00
##                  airbag      1.361e+05 3.157e+03 2.270e+00
## 55+     none     none        3.484e+04 8.090e+03 1.885e+01
##                  airbag      1.182e+04 4.372e+03 2.700e+01
##         belted   none        3.239e+04 4.400e+03 1.196e+01
##                  airbag      2.720e+04 5.080e+03 1.574e+01

## ss 2.2.2: Summaries of information from data frames 
##         Summary as a prelude to analysis -- {aggregate()} 
## Footnote Code
## Individual vine means, by block and treatment 
library(lattice) 
## Panel function calls panel.dotplot(), then panel.average() 
dotplot(shade ~ yield | block, data=kiwishade, aspect=1,  
        panel=function(x,y,...){panel.dotplot(x, y, pch=1, col="gray40") 
                                av <- sapply(split(x,y),mean)
                                ypos <- unique(y)
                                lpoints(ypos~av, pch=3, cex=1.25, 
                                        col="black")},
        key=list(space="top", columns=2, col=c("gray40", "black"), 
          text=list(c("Individual vine yields", "Plot means (4 vines)")), 
          points=list(pch=c(1,3), cex=c(1,1.25))), layout=c(3,1))

plot of chunk unnamed-chunk-1

 # Note that parameter settings were given both in the calls to the  
 # panel functions and in the list supplied to key. 

## mean yield by block by shade: data frame kiwishade (DAAG) 
kiwimeans <- with(kiwishade, 
                  aggregate(yield, by=list(block, shade), mean)) 
names(kiwimeans) <- c("block","shade","meanyield") 
head(kiwimeans, 4)

##   block   shade meanyield
## 1  east    none     99.03
## 2 north    none    104.03
## 3  west    none     97.56
## 4  east Aug2Dec    105.56

# . . . 

##       The benefits of data summary -- dengue status example 
## ss 2.2.3: Standard deviation and inter-quartile range 
##                        Cuckoo eggs example 
## Footnote Code
## SD of length, by species: data frame cuckoos (DAAG) 
sapply(split(cuckoos$length, cuckoos$species), sd)

## hedge.sparrow  meadow.pipit  pied.wagtail         robin    tree.pipit 
##        1.0494        0.9196        1.0723        0.6821        0.8801 
##          wren 
##        0.7542

 # Subsection 14.9.6 has information on split() 

##                         Degrees of freedom 
##                   Other measures of variability 
##                   The pooled standard deviation 
##                       Elastic bands example 
## ss 2.2.4: Correlation 
## Correlation between body and brain: data frame Animals (MASS) 
## Product--moment correlation 
with(Animals, cor(body, brain))

## [1] -0.005341

## Product--moment correlation, after log transformation 
with(log(Animals), cor(body, brain))

## [1] 0.7795

 ## Spearman rank correlation 
with(Animals, cor(body, brain, method="spearman"))

## [1] 0.7163

## Sec 2.3: Statistical Analysis Questions, Aims and Strategies 
##            Effective use of the information in the data 
##               Observational versus experimental data 
## ss 2.3.1: How relevant and how reliable are the data? 
## ss 2.3.2: How will results be used? 
## ss 2.3.3: Formal and informal assessments 
##                     Questionnaires and surveys 
## ss 2.3.4: Statistical Analysis Strategies 
## ss 2.3.5: Planning the formal analysis 
## ss 2.3.6: Changes to the intended plan of analysis 

## Sec 2.4: Recap 

## Sec 2.5: Further Reading 
attach(cuckoohosts) 
plot(c(clength, hlength), c(cbreadth, hbreadth), 
     col=rep(1:2,c(12,12))) 
for(i in 1:12)lines(c(clength[i], hlength[i]),  
                    c(cbreadth[i], hbreadth[i])) 
text(hlength, hbreadth, abbreviate(rownames(cuckoohosts),8))

plot of chunk unnamed-chunk-1

detach(cuckoohosts) 

deathrate <- c(40.7, 36,27,30.5,27.6,83.5) 
hosp <- c("Cliniques of Vienna (1834-63)\n(> 2000 cases pa)", 
    "Enfans Trouves at Petersburg\n(1845-59, 1000-2000 cases pa)", 
    "Pesth (500-1000 cases pa)", 
    "Edinburgh (200-500 cases pa)", 
    "Frankfort (100-200 cases pa)", "Lund (< 100 cases pa)") 
hosp <- factor(hosp, levels=hosp[order(deathrate)]) 
dotplot(hosp~deathrate, xlim=c(0,95), xlab="Death rate per 1000 ", 
        type=c("h","p"))

plot of chunk unnamed-chunk-1

## Source: Nightingale (1871). Data are ascribed to Dr Le Fort 

library(Devore6)     # Or Devore5 or (wide format) Devore7 
tomatoes <- ex10.22 

with(Animals, cor(brain,body))

## [1] -0.005341

with(Animals, cor(log(brain),log(body)))

## [1] 0.7795

with(Animals, cor(log(brain),log(body), method="spearman"))

## [1] 0.7163

bwplot(shade ~ yield|block, data=kiwishade, layout=c(3,1))

plot of chunk unnamed-chunk-1

eda.R

johnm1

Fri Aug 29 14:59:12 2014