## Chapter 4: A Review of Inference Concepts 

## Sec 4.1: Basic Concepts of Estimation 
## ss 4.1.1: Population parameters and sample statistics 
## ss 4.1.2: Sampling distributions 
##               Reliance on the central limit theorem 
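## Illustrative sketch (not part of the original code): simulate the sampling
## distribution of the mean for a skew (exponential) population, to see the
## central limit theorem at work.  Sample sizes and number of simulations are
## arbitrary choices.
set.seed(29)                                # for reproducibility
means4  <- replicate(1000, mean(rexp(4)))   # means of samples of size 4
means30 <- replicate(1000, mean(rexp(30)))  # means of samples of size 30
par(mfrow=c(1,2))
hist(means4, main="n = 4"); hist(means30, main="n = 30")
par(mfrow=c(1,1))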
## ss 4.1.3: Assessing accuracy -- the standard error 
## Footnote Code
## Calculate heated-ambient; take heated & ambient from columns of pair65 
library(DAAG)
## Loading required package: lattice
test <- with(pair65, heated-ambient) 
c(mean = mean(test), SD = sd(test), SEM = sd(test)/sqrt(length(test))) 
##  mean    SD   SEM 
## 6.333 6.103 2.034
## ss 4.1.4: The standard error for the difference of means 
## Footnote Code
## Heated vs ambient; unpaired elastic band data 
heated <- c(254, 252, 239, 240, 250, 256, 267, 249, 259, 269) 
ambient <- c(233, 252, 237, 246, 255, 244, 248, 242, 217, 257, 254) 
v1 <- var(heated)           # 10 numbers; 10-1 = 9 d.f. 
v2 <- var(ambient)          # 11 numbers; 11-1 = 10 d.f. 
v <- (9*v1 + 10*v2)/(9+10)  # Pooled estimate of variance 
# Estimate SED; variances may not be equal 
c(sem1 = sqrt(v1/10), sem2 = sqrt(v2/11), sed = sqrt(v1/10 + v2/11)) 
##  sem1  sem2   sed 
## 3.138 3.538 4.729
# Estimate SED; use pooled estimate 
c(sd = sqrt(v), sed = sqrt(v*(1/10 + 1/11))) 
##     sd    sed 
## 10.915  4.769
## ss 4.1.5: *The standard error of the median 
## Footnote Code
## median and SD for length, by species: data frame cuckoos (DAAG) 
wren <- split(cuckoos$length, cuckoos$species)$wren 
median(wren) 
## [1] 21
n <- length(wren) 
sqrt(pi/2)*sd(wren)/sqrt(n)  # this SE computation assumes normality 
## [1] 0.2441
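## A resampling check (sketch, not in the original code): bootstrap the median
## of the wren lengths and compare the bootstrap SE with the normal-theory
## value above.  The number of resamples (999) is an arbitrary choice.
set.seed(41)
medians <- replicate(999, median(sample(wren, replace=TRUE)))
sd(medians)      # bootstrap estimate of the SE of the median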
## ss 4.1.6: The sampling distribution of the t statistic 
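## Simulation sketch (not in the original code): the sampling distribution of
## the t statistic for samples of size 9 from a normal population, compared
## with the t distribution on 8 d.f.  The sample size matches the pair65 example.
set.seed(57)
tstat <- replicate(2000, {x <- rnorm(9); mean(x)/(sd(x)/sqrt(9))})
qqplot(qt(ppoints(2000), df=8), tstat,
       xlab="Quantiles of t (8 d.f.)", ylab="Simulated t statistics")
abline(0, 1)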

## Sec 4.2: Confidence Intervals and Tests of Hypotheses 
##               Calculations with the t-distribution 
# Plus or minus 1.96 SE gives the central 95% limits for the normal distribution, e.g. 
pnorm(1.96) - pnorm(-1.96) 
## [1] 0.95
# Plus or minus 2.31 SE gives the central 95% limits for the t distribution (8 d.f.), e.g. 
pt(2.31, 8) - pt(-2.31,8)  # 2.31 SEs either side 
## [1] 0.9503
qnorm(0.975)          # normal distribution 
## [1] 1.96
qt(0.975, 8)          # t-distribution with 8 d.f. 
## [1] 2.306
##                Confidence intervals of 95% or 99% 
## Footnote Code
## Detailed calculations; 95% CI for mean of heated-ambient 
pair65.diff <- with(pair65, heated-ambient)  
pair65.n <- length(pair65.diff)  
pair65.se <- sd(pair65.diff)/sqrt(pair65.n)  
mean(pair65.diff) + qt(c(.025,.975), pair65.n-1)*pair65.se   # 95% CI; d.f. = n-1 = 8 
## [1]  1.642 11.025
## 95% CI for mean of heated-ambient: data frame pair65 (DAAG) 
with(pair65, t.test(heated, ambient, paired=TRUE,  
                    conf.level=0.95)$conf.int) 
## [1]  1.642 11.025
## attr(,"conf.level")
## [1] 0.95
##                        Tests of hypotheses 
1-pt(6.33/2.03, 8)   # Equals pt(-6.33/2.03, 8) 
## [1] 0.007133
##                     What is a small p-value? 
##          t-distribution versus the normal distribution 
##            How good is the normal theory approximation? 
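## Rough numerical check (sketch, not in the original code): compare the 97.5%
## points of the t distribution with the normal value 1.96, for a range of
## degrees of freedom.  The choice of d.f. values is arbitrary.
rbind(df=c(5, 8, 15, 30, 100), t=round(qt(0.975, c(5, 8, 15, 30, 100)), 3))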
## ss 4.2.1: A summary of one- and two-sample calculations 
## Footnote Code
## t-test and confidence interval calculations 
heated <- c(254, 252, 239, 240, 250, 256, 267, 249, 259, 269) 
ambient <- c(233, 252, 237, 246, 255, 244, 248, 242, 217, 257, 254) 
t.test(heated, ambient, var.equal=TRUE) 
## 
##  Two Sample t-test
## 
## data:  heated and ambient
## t = 1.973, df = 19, p-value = 0.06322
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5723 19.3905
## sample estimates:
## mean of x mean of y 
##     253.5     244.1
##                      When is pairing helpful? 
## Footnote Code
## heated vs ambient: pair65 (DAAG); and cross vs self: mignonette (DAAG) 
par(mfrow=c(1,2), pty="s") 
plot(heated ~ ambient, data=pair65); abline(0, 1)    # left panel 
with(pair65, abline(mean(heated-ambient), 1, lty=2)) 
plot(cross ~ self, data=mignonette); abline(0, 1)    # right panel 
with(mignonette, abline(mean(cross-self), 1, lty=2)) 

par(mfrow = c(1,1), pty="m") 

##            What if the standard deviations are unequal? 
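## Sketch (not in the original code): the Welch test, which does not assume
## equal variances, is the default for t.test().  Compare with the pooled
## (var.equal=TRUE) analysis above; heated and ambient are as defined above.
t.test(heated, ambient)   # Welch two-sample t-test; unequal variances allowed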
##                  Different ways to report results 
## Footnote Code
## Different ways to report results: calculations 
pair65.diff <- with(pair65, heated-ambient)  
n <- length(pair65.diff) 
av <- mean(pair65.diff); sdev <- sd(pair65.diff); se <- sdev/sqrt(n) 
print(c(mean=av, SED=se, "mean/SED"=av/se))   # Items 1 and 2 
##     mean      SED mean/SED 
##    6.333    2.034    3.113
t.test(pair65.diff)                           # Items 3 and 4 
## 
##  One Sample t-test
## 
## data:  pair65.diff
## t = 3.113, df = 8, p-value = 0.01438
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##   1.642 11.025
## sample estimates:
## mean of x 
##     6.333
## ss 4.2.2: Confidence intervals and tests for proportions 
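## Illustrative sketch (not in the original code): a confidence interval and
## test for a single proportion, here 61 "successes" out of 100 trials.
## The numbers are made up, purely for illustration.
prop.test(x=61, n=100, p=0.5)    # score test, with continuity correction
binom.test(x=61, n=100, p=0.5)   # exact binomial test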
## ss 4.2.3: Confidence intervals for the correlation 
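## Illustrative sketch (not in the original code): cor.test() gives a
## confidence interval for the correlation, based on the Fisher z-transformation.
## Here it is applied to the paired elastic band data (pair65, DAAG).
with(pair65, cor.test(heated, ambient))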
## ss 4.2.4: Confidence intervals versus hypothesis tests 
##                Guidelines for significance testing 

## Sec 4.3: Contingency Tables 
## Footnote Code
## Compare number with a high school qualification, between `untreated' rows  
## from data frame psid3 and `treated' rows from nswdemo 
library(DAAG)   # Data are from DAAG 
nswpsid3 <- rbind(psid3, subset(nswdemo, trt==1)) 
table(nswpsid3$trt, nswpsid3$nodeg) 
##    
##       0   1
##   0  63  65
##   1  80 217
  # PSID3 males are coded 0; NSW male trainees are coded 1. 

# To agree with hand calculation below, specify correct=FALSE 
chisq.test(with(nswpsid3, table(trt, nodeg)), correct=FALSE) 
## 
##  Pearson's Chi-squared test
## 
## data:  with(nswpsid3, table(trt, nodeg))
## X-squared = 19.89, df = 1, p-value = 8.189e-06
##               The mechanics of the chi-squared test 
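## Hand calculation sketch (not in the original code), using the 2 x 2 table
## above: expected counts come from the row and column totals, and the
## chi-squared statistic sums (observed - expected)^2/expected.
tab <- with(nswpsid3, table(trt, nodeg))
expected <- outer(rowSums(tab), colSums(tab))/sum(tab)
sum((tab - expected)^2/expected)   # agrees with chisq.test(tab, correct=FALSE)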
##        An example where a chi-squared test may not be valid 
## Footnote Code
## Engine man data 
engineman <- matrix(c(5,3,17,85), 2,2) 
chisq.test(engineman) 
## Warning: Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  engineman
## X-squared = 7.086, df = 1, p-value = 0.00777
## ss 4.3.1: Rare and endangered plant species 
## Footnote Code
## Enter the data thus: 
rareplants <- matrix(c(37,190,94,23,59,23,10,141,28,15,58,16), ncol=3, 
     byrow=TRUE, dimnames=list(c("CC","CR","RC","RR"), c("D","W","WD"))) 

(x2 <- chisq.test(rareplants)) 
## 
##  Pearson's Chi-squared test
## 
## data:  rareplants
## X-squared = 34.99, df = 6, p-value = 4.336e-06
## Footnote Code
## Expected number of species, by habitat (calculate x2 as above) 
x2E <- stack(data.frame(t(x2$expected))) 
habitat <- rep(c(1,2,3), 4) 
plot(x2E$values ~ habitat, axes=FALSE, xlim=c(0.5, 3.5), pch=16, 
     xlab="habitat", ylab="Expected Number of Species") 
text(x2E$values ~ habitat, labels=x2E$ind, pos=rep(c(4,4,2,2),3)) 
axis(1, at=seq(1,3), labels=c("D", "W", "WD")) 
axis(2); box() 

##  Examination of departures from a consistent overall row pattern 
x2 <- chisq.test(rareplants) 
## Pearson residuals, (observed - expected)/sqrt(expected) 
residuals(x2) 
##          D        W      WD
## CC -0.3693 -1.19598  2.2634
## CR  2.8275 -1.06657 -0.2753
## RC -2.5466  2.36753 -2.0990
## RR  1.2416  0.07224 -1.0227
x2$expected 
##        D      W    WD
## CC 39.32 207.22 74.47
## CR 12.86  67.78 24.36
## RC 21.92 115.55 41.53
## RR 10.90  57.45 20.65
## ss 4.3.2: Additional notes 
##                       Interpretation issues 
##                        Modeling approaches 

## Sec 4.4: One-Way Unstructured Comparisons 
tomato <- 
  data.frame(weight= 
             c(1.5, 1.9, 1.3, 1.5, 2.4, 1.5,   # water 
               1.5, 1.2, 1.2, 2.1, 2.9, 1.6,   # Nutrient 
               1.9, 1.6, 0.8, 1.15, 0.9, 1.6), # Nutrient+24D 
             trt = rep(c("water", "Nutrient", "Nutrient+24D"), 
                       c(6, 6, 6))) 
## Make water the first level of trt.  It will then appear as 
## the initial level in the graphs. In aov or lm calculations, 
## it will be the baseline or reference level. 
tomato$trt <- relevel(factor(tomato$trt), ref="water")  # factor() needed under R >= 4.0 

stripplot(trt~weight, aspect=0.6, data=tomato) 

##                The tomato data -- a visual summary 
## Do analysis of variance calculations 
tomato.aov <- aov(weight ~ trt, data=tomato) 
## Summarize results graphically 
oneway.plot(obj=tomato.aov) 

## [1] 1 1 1 1
##                   The analysis of variance table 
anova(tomato.aov) 
## Analysis of Variance Table
## 
## Response: weight
##           Df Sum Sq Mean Sq F value Pr(>F)
## trt        2   0.63   0.314     1.2   0.33
## Residuals 15   3.91   0.261
## ss 4.4.1: Multiple comparisons 
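## Sketch (not in the original code): Tukey's honest significant difference
## gives simultaneous 95% intervals for the pairwise treatment differences.
TukeyHSD(tomato.aov)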
##              *Microarray data -- severe multiplicity 
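## Sketch (not in the original code): with very many tests, p.adjust() can
## control the false discovery rate.  The p-values below are made up,
## purely to show the mechanics.
pvals <- c(0.0002, 0.004, 0.019, 0.095, 0.13, 0.35, 0.64, 0.78)
p.adjust(pvals, method="BH")    # Benjamini-Hochberg adjusted p-values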
## ss 4.4.2: Data with a two-way structure, i.e. two factors 
## Footnote Code
## Simplified version of code 
library(lattice) 
## Panel A 
dotplot(trt ~ ShootDryMass, data=rice, aspect=1,  
        panel=function(x,y,...){panel.dotplot(x, y, pch=1, col="gray40") 
                                panel.average(x, y, type="p", col="black", 
                                              pch=3, cex=1.25)}, 
        xlab="Shoot dry mass (g)") 

## Panel B 
with(rice, interaction.plot(fert, variety, ShootDryMass, 
                            xlab="Level of first factor")) 
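## Sketch (not in the original code): a two-way analysis of variance for these
## data, with fert and variety as the two factors.  Column names are as in the
## rice data frame (DAAG), used in the plots above.
rice.aov <- aov(ShootDryMass ~ fert*variety, data=rice)
anova(rice.aov)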

## ss 4.4.3: Presentation issues 

## Sec 4.5: Response Curves 
## Footnote Code
## Data frame modelcars (DAAG) 
plot(distance.traveled ~ starting.point, data=modelcars, 
       xlim=c(0,12.5), xaxs="i", xlab = "Distance up ramp (cm)",  
       ylab="Distance traveled (cm)")   
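## Sketch (not in the original code): fit polynomial response curves to the
## model car data and compare, e.g., linear against quadratic.
cars.lm1 <- lm(distance.traveled ~ starting.point, data=modelcars)
cars.lm2 <- lm(distance.traveled ~ poly(starting.point, 2), data=modelcars)
anova(cars.lm1, cars.lm2)    # does the quadratic term improve the fit?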

## Sec 4.6: Data with a Nested Variation Structure 
## ss 4.6.1: Degrees of freedom considerations 
## ss 4.6.2: General multi-way analysis of variance designs 
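## Sketch (not in the original code): for data with a nested error structure,
## an Error() term in aov() identifies the levels of variation.  This uses the
## kiwishade data (DAAG), with plots nested within blocks; the column names
## block, plot, shade and yield are assumed.
kiwishade.aov <- aov(yield ~ shade + Error(block/plot), data=kiwishade)
summary(kiwishade.aov)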

## Sec 4.7: Resampling Methods for Standard Errors, Tests and Confidence Intervals 
## ss 4.7.1: The one-sample permutation test 
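## Sketch (not in the original code): a one-sample permutation test for the
## paired elastic band differences attaches a random sign to each difference.
## The number of permutations (2000) is an arbitrary choice.
d <- with(pair65, heated-ambient)
set.seed(83)
zperm <- replicate(2000, mean(d * sample(c(-1, 1), length(d), replace=TRUE)))
mean(abs(zperm) >= abs(mean(d)))    # two-sided permutation p-value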
## ss 4.7.2: The two-sample permutation test 
## Footnote Code
## Draw one curve only; permutation distribution of difference in means 
x1 <- two65$ambient; x2 <- two65$heated; x <- c(x1, x2) 
n1 <- length(x1); n2 <- length(x2); n <- n1+n2 
dbar <- mean(x2) - mean(x1) 
z <- numeric(2000) 
for(i in 1:2000){ 
    mn <- sample(n, n2, replace=FALSE) 
    dbardash <- mean(x[mn]) - mean(x[-mn]) 
    z[i]<- dbardash 
    } 
plot(density(z), yaxs="i") 
abline(v = dbar) 
abline(v = -dbar, lty=2)
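## Sketch (not in the original code): the two-sided permutation p-value is the
## proportion of permuted differences at least as extreme as that observed.
mean(abs(z) >= abs(dbar))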