## Chapter 4: A Review of Inference Concepts 

## Sec 4.1: Basic Concepts of Estimation 
## ss 4.1.1: Population parameters and sample statistics 
## ss 4.1.2: Sampling distributions 
##               Reliance on the central limit theorem 
## ss 4.1.3: Assessing accuracy -- the standard error 
## Footnote Code
## Calculate heated-ambient; take heated & ambient from columns of pair65 
library(DAAG)
## Loading required package: lattice
test <- with(pair65, heated-ambient) 
c(mean = mean(test), SD = sd(test), SEM = sd(test)/sqrt(length(test))) 
##  mean    SD   SEM 
## 6.333 6.103 2.034
## ss 4.1.4: The standard error for the difference of means 
## Footnote Code
## Heated vs ambient; unpaired elastic band data 
heated <- c(254, 252, 239, 240, 250, 256, 267, 249, 259, 269) 
ambient <- c(233, 252, 237, 246, 255, 244, 248, 242, 217, 257, 254) 
v1 <- var(heated)           # 10 numbers; 10-1 = 9 d.f. 
v2 <- var(ambient)          # 11 numbers; 11-1 = 10 d.f. 
v <- (9*v1 + 10*v2)/(9+10)  # Pooled estimate of variance 
# Estimate SED; variances may not be equal 
c(sem1 = sqrt(v1/10), sem2 = sqrt(v2/11), sed = sqrt(v1/10 + v2/11)) 
##  sem1  sem2   sed 
## 3.138 3.538 4.729
# Estimate SED; use pooled estimate 
c(sd = sqrt(v), sed = sqrt(v1/10 + v2/11)) 
##     sd    sed 
## 10.915  4.729
## ss 4.1.5: \kern-4pt* The standard error of the median 
## Footnote Code
## median and SD for length, by species: data frame cuckoos (DAAG) 
wren <- split(cuckoos$length, cuckoos$species)$wren 
median(wren) 
## [1] 21
n <- length(wren) 
sqrt(pi/2)*sd(wren)/sqrt(n)  # this SE computation assumes normality 
## [1] 0.2441
## ss 4.1.6: The sampling distribution of the $t$ statistic 

## Sec 4.2: Confidence Intervals and Tests of Hypotheses 
##               Calculations with the $t$-distribution 
#  Plus or minus 1.96SE normal distribution limits, e.g. 
pnorm(1.96) - pnorm(-1.96) 
## [1] 0.95
#  Plus or minus 2.31SE t distribution (8 df) limits, e.g. 
pt(2.31, 8) - pt(-2.31,8)  # 2.31 SEs either side 
## [1] 0.9503
qnorm(0.975)          # normal distribution 
## [1] 1.96
qt(0.975, 8)          # t-distribution with 8 d.f. 
## [1] 2.306
##                Confidence intervals of 95\% or 99\% 
## Footnote Code
## Detailed calculations; 95% CI for mean of heated-ambient 
pair65.diff <- with(pair65, heated-ambient)  
pair65.n <- length(pair65.diff)  
pair65.se <- sd(pair65.diff)/sqrt(pair65.n)  
mean(pair65.diff) + qt(c(.025,.975),8)*pair65.se  
## [1]  1.642 11.025
## 95% CI for mean of heated-ambient: data frame pair65 (DAAG) 
with(pair65, t.test(heated, ambient, paired=TRUE,  
                    conf.level=0.95)$conf.int) 
## [1]  1.642 11.025
## attr(,"conf.level")
## [1] 0.95
##                        Tests of hypotheses 
1-pt(6.33/2.03, 8)   # Equals pt(-6.33/2.03, 8) 
## [1] 0.007133
##                     What is a small $p$-value? 
##          $t$-distribution versus the normal distribution 
##            How good is the normal theory approximation? 
## ss 4.2.1: A summary of one- and two-sample calculations 
## Footnote Code
## t-test and confidence interval calculations 
heated <- c(254, 252, 239, 240, 250, 256, 267, 249, 259, 269) 
ambient <- c(233, 252, 237, 246, 255, 244, 248, 242, 217, 257, 254) 
t.test(heated, ambient, var.equal=TRUE) 
## 
##  Two Sample t-test
## 
## data:  heated and ambient
## t = 1.973, df = 19, p-value = 0.06322
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5723 19.3905
## sample estimates:
## mean of x mean of y 
##     253.5     244.1
##                      When is pairing helpful? 
## Footnote Code
## heated vs ambient: pair65 (DAAG); and cross vs self: mignonette (DAAG) 
par(mfrow=c(1,2), pty="s") 
plot(heated ~ ambient, data=pair65); abline(0, 1)    # left panel 
with(pair65, abline(mean(heated-ambient), 1, lty=2)) 
plot(cross ~ self, data=mignonette); abline(0, 1)    # right panel 
with(mignonette, abline(mean(cross-self), 1, lty=2)) 

plot of chunk unnamed-chunk-1

par(mfrow = c(1,1), pty="m") 

##            What if the standard deviations are unequal? 
##                  Different ways to report results 
## Footnote Code
## Different ways to report results: calculations 
pair65.diff <- with(pair65, heated-ambient)  
n <- length(pair65.diff) 
av <- mean(pair65.diff); sd <- sqrt(var(pair65.diff)); se <- sd/sqrt(n) 
print(c(mean=av, SED=se, "mean/SED"=av/se))   # Items 1 and 2 
##     mean      SED mean/SED 
##    6.333    2.034    3.113
t.test(pair65.diff)                           # Items 3 and 4 
## 
##  One Sample t-test
## 
## data:  pair65.diff
## t = 3.113, df = 8, p-value = 0.01438
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##   1.642 11.025
## sample estimates:
## mean of x 
##     6.333
## ss 4.2.2: Confidence intervals and tests for proportions 
## ss 4.2.3: Confidence intervals for the correlation 
## ss 4.2.4: Confidence intervals versus hypothesis tests 
##                Guidelines for significance testing 

## Sec 4.3: Contingency Tables 
## Footnote Code
## Compare number with a high school qualification, between `untreated' rows  
## from data frame psid3 and `treated' rows from nswdemo 
library(DAAG)   # Data are from DAAG 
nswpsid3 <- rbind(psid3, subset(nswdemo, trt==1)) 
table(nswpsid3$trt, nswpsid3$nodeg) 
##    
##       0   1
##   0  63  65
##   1  80 217
  # PSID3 males are coded 0; NSW male trainees are coded 1. 

# To agree with hand calculation below, specify correct=FALSE 
chisq.test(with(nswpsid3, table(trt, nodeg)), correct=FALSE) 
## 
##  Pearson's Chi-squared test
## 
## data:  with(nswpsid3, table(trt, nodeg))
## X-squared = 19.89, df = 1, p-value = 8.189e-06
##               The mechanics of the chi-squared test 
##        An example where a chi-squared test may not be valid 
## Footnote Code
## Engine man data 
engineman <- matrix(c(5,3,17,85), 2,2) 
chisq.test(engineman) 
## Warning: Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  engineman
## X-squared = 7.086, df = 1, p-value = 0.00777
## ss 4.3.1: Rare and endangered plant species 
## Footnote Code
## Enter the data thus: 
rareplants <- matrix(c(37,190,94,23,59,23,10,141,28,15,58,16), ncol=3, 
     byrow=TRUE, dimnames=list(c("CC","CR","RC","RR"), c("D","W","WD"))) 

(x2 <- chisq.test(rareplants)) 
## 
##  Pearson's Chi-squared test
## 
## data:  rareplants
## X-squared = 34.99, df = 6, p-value = 4.336e-06
## Footnote Code
## Expected number of species, by habitat (calculate x2 as above) 
x2E <- stack(data.frame(t(x2$expected))) 
habitat <- rep(c(1,2,3), 4) 
plot(x2E$values ~ habitat, axes=FALSE, xlim=c(0.5, 3.5), pch=16, 
     xlab="habitat", ylab="Expected Number of Species") 
text(x2E$values ~ habitat, labels=x2E$ind, pos=rep(c(4,4,2,2),3)) 
axis(1, at=seq(1,3), labels=c("D", "W", "WD")) 
axis(2); box() 

plot of chunk unnamed-chunk-1

##  Examination of departures from a consistent overall row pattern 
x2 <- chisq.test(rareplants) 
## Standardized residuals 
residuals(x2) 
##          D        W      WD
## CC -0.3693 -1.19598  2.2634
## CR  2.8275 -1.06657 -0.2753
## RC -2.5466  2.36753 -2.0990
## RR  1.2416  0.07224 -1.0227
x2$expected 
##        D      W    WD
## CC 39.32 207.22 74.47
## CR 12.86  67.78 24.36
## RC 21.92 115.55 41.53
## RR 10.90  57.45 20.65
## ss 4.3.2: Additional notes 
##                       Interpretation issues 
##                        Modeling approaches 

## Sec 4.4: One-Way Unstructured Comparisons 
tomato <- 
  data.frame(weight= 
             c(1.5, 1.9, 1.3, 1.5, 2.4, 1.5,   # water 
               1.5, 1.2, 1.2, 2.1, 2.9, 1.6,   # Nutrient 
               1.9, 1.6, 0.8, 1.15, 0.9, 1.6), # Nutrient+24D 
             trt = rep(c("water", "Nutrient", "Nutrient+24D"), 
                       c(6, 6, 6))) 
## Make water the first level of trt.  It will then appear as 
## the initial level in the graphs. In aov or lm calculations, 
## it will be the baseline or reference level. 
tomato$trt <- relevel(tomato$trt, ref="water") 

stripplot(trt~weight, aspect=0.6, data=tomato) 

plot of chunk unnamed-chunk-1

##                The tomato data -- a visual summary 
## Do analysis of variance calculations 
tomato.aov <- aov(weight ~ trt, data=tomato) 
## Summarize results graphically 
oneway.plot(obj=tomato.aov) 

plot of chunk unnamed-chunk-1

## [1] 1 1 1 1
##                   The analysis of variance table 
anova(tomato.aov) 
## Analysis of Variance Table
## 
## Response: weight
##           Df Sum Sq Mean Sq F value Pr(>F)
## trt        2   0.63   0.314     1.2   0.33
## Residuals 15   3.91   0.261
## ss 4.4.1: Multiple comparisons 
##              *Microarray data -- severe multiplicity 
## ss 4.4.2: Data with a two-way structure, i.e.\ two factors 
## Footnote Code
## Simplified version of code 
library(lattice) 
## Panel A 
dotplot(trt ~ ShootDryMass, data=rice, aspect=1,  
        panel=function(x,y,...){panel.dotplot(x, y, pch=1, col="gray40") 
                                panel.average(x, y, type="p", col="black", 
                                              pch=3, cex=1.25)}, 
        xlab="Shoot dry mass (g)") 

plot of chunk unnamed-chunk-1

## Panel B 
with(rice, interaction.plot(fert, variety, ShootDryMass, 
                            xlab="Level of first factor")) 

plot of chunk unnamed-chunk-1

## ss 4.4.3: Presentation issues 

## Sec 4.5: Response Curves 
## Footnote Code
## Data frame modelcars (DAAG) 
plot(distance.traveled ~ starting.point, data=modelcars, 
       xlim=c(0,12.5), xaxs="i", xlab = "Distance up ramp (cm)",  
       ylab="Distance traveled (cm)")   

plot of chunk unnamed-chunk-1

## Sec 4.6: Data with a Nested Variation Structure 
## ss 4.6.1: Degrees of freedom considerations 
## ss 4.6.2: General multi-way analysis of variance designs 

## Sec 4.7: Resampling Methods for Standard Errors, Tests and Confidence Intervals 
## ss 4.7.1: The one-sample permutation test 
## ss 4.7.2: The two-sample permutation test 
## Footnote Code
## Draw one curve only; permutation distribution of difference in means 
x1 <- two65$ambient; x2 <- two65$heated; x <- c(x1, x2) 
n1 <- length(x1); n2 <- length(x2); n <- n1+n2 
dbar <- mean(x2) - mean(x1) 
z <- numeric(2000) 
for(i in 1:2000){ 
    mn <- sample(n, n2, replace=FALSE) 
    dbardash <- mean(x[mn]) - mean(x[-mn]) 
    z[i]<- dbardash 
    } 
plot(density(z), yaxs="i") 
abline(v = dbar) 
abline(v = -dbar, lty=2) 

plot of chunk unnamed-chunk-1

signif((sum(z > abs(dbar)) + sum (z< -abs(dbar)))/length(z), 3) 
## [1] 0.059
## ss 4.7.3: \kern-4pt* Estimating the standard error of the median: bootstrapping 
## bootstrap estimate of median of wren length: data frame cuckoos (DAAG) 
wren <- split(cuckoos$length, cuckoos$species)$wren 
library(boot) 
## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:lattice':
## 
##     melanoma
## First define median.fun(), with two required arguments: 
##         data specifies the data vector, 
##         indices selects vector elements for a each resample 
median.fun <- function(data, indices){median(data[indices])} 
## Call boot(), with statistic=median.fun, R = # of resamples 
(wren.boot <- boot(data = wren, statistic = median.fun, R = 999)) 
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = wren, statistic = median.fun, R = 999)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*       21 0.04805      0.2068
## ss 4.7.4: Bootstrap estimates of confidence intervals 
##         Bootstrap 95\% confidence intervals for the median 
median.fun <- function(data, indices){median(data[indices])} 
## Call the boot() function, with statistic=median.fun 
wren <- cuckoos[cuckoos$species=="wren", "length"] 
(wren.boot <- boot(data=wren, statistic=median.fun, R=9999)) 
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = wren, statistic = median.fun, R = 9999)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*       21 0.05434      0.2204
##                    The correlation coefficient 
## Footnote Code
## Bootstrap estimate of 95% CI of cor(chest, belly): data frame possum (DAAG) 
possum.fun <- function(data, indices) { 
    chest <- data$chest[indices] 
    belly <- data$belly[indices] 
    cor(belly, chest) 
    } 
possum.boot <- boot(possum, possum.fun, R=9999) 
boot.ci(possum.boot, type=c("perc", "bca")) 
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 9999 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = possum.boot, type = c("perc", "bca"))
## 
## Intervals : 
## Level     Percentile            BCa          
## 95%   ( 0.4773,  0.7095 )   ( 0.4763,  0.7089 )  
## Calculations and Intervals on Original Scale
##                 The bootstrap -- parting comments 

## Sec 4.8: * Theories of Inference 
## ss 4.8.1: Maximum likelihood estimation 
funlik <- function(mu, sigma, x=with(pair65, heated-ambient)) 
  prod(dnorm(x, mu, sigma)) 

log(funlik(6.3, 6.1))    # Close to estimated mean and SD 
## [1] -28.55
log(funlik(6.33, 5.75))  # Close to the actual mle's 
## [1] -28.52
log(funlik(7, 5.75)) 
## [1] -28.58
## ss 4.8.2: Bayesian estimation 
##       Bayesian estimation with normal priors and likelihood 
## ss 4.8.3: If there is strong prior information, use it! 

## Sec 4.9: Recap 
##                           Dos and don'ts 

## Sec 4.10: Further Reading 
oldpar <- par(mfrow=c(2,2)) 
tenfold1000 <- rep(1:10, rep(1000,10)) 
boxplot(split(rnorm(1000*10), tenfold1000), ylab="normal - 1000") 
boxplot(split(rt(1000*10, 7), tenfold1000), 
        ylab=expression(t[7]*" - 1000")) 
tenfold100 <- rep(1:10, rep(100, 10)) 
boxplot(split(rnorm(100*10), tenfold100), ylab="normal - 100") 
boxplot(split(rt(100*10, 7), tenfold100), 
        ylab=expression(t[7]*" - 100")) 

plot of chunk unnamed-chunk-1

par(oldpar) 

y1 <- rnorm(51) 
y <- y1[-1] + y1[-51] 
acf(y1)              # acf is  `autocorrelation function' 

plot of chunk unnamed-chunk-1

                     # (see Chapter 9) 
acf(y) 

plot of chunk unnamed-chunk-1

## UCBAdmissions is in the datasets package 
## For each combination of margins 1 and 2, calculate the sum 
UCBtotal <- apply(UCBAdmissions, c(1,2), sum) 

apply(UCBAdmissions, 3, function(x) 
      (x[1,1]*x[2,2])/(x[1,2]*x[2,1])) 
##      A      B      C      D      E      F 
## 0.3492 0.8025 1.1331 0.9213 1.2216 0.8279
admissions <- array(c(30,30,10,10,15,5,30,10),  
                    dim=c(2,2,2)) 

## Compare densities for ambient & heated: list two65 (DAAG) 
with(two65, overlapDensity(ambient, heated)) 

plot of chunk unnamed-chunk-1

## [1] 228.7 269.0
  # Do overlapDensity(ambient, heated) with ambient and heated 
  # taken, if not found elsewhere, from the columns of two65 

z.transform <- function(r) .5*log((1+r)/(1-r)) 
z.inverse <- function(z) (exp(2*z)-1)/(exp(2*z)+1) 
possum.fun <- function(data, indices) { 
     chest <- data$chest[indices] 
     belly <- data$belly[indices] 
     z.transform(cor(belly, chest))} 
possum.boot <- boot(possum, possum.fun, R=999) 
z.inverse(boot.ci(possum.boot, type="perc")$percent[4:5]) 
## [1] 0.4767 0.7061
 # See help(bootci.object). The 4th and 5th elements of 
 # the percent list element hold the interval endpoints.