## Chapter 4: A Review of Inference Concepts
## Sec 4.1: Basic Concepts of Estimation
## ss 4.1.1: Population parameters and sample statistics
## ss 4.1.2: Sampling distributions
## Reliance on the central limit theorem
## ss 4.1.3: Assessing accuracy -- the standard error
## Footnote Code
## Calculate heated-ambient; take heated & ambient from columns of pair65
library(DAAG)
## Loading required package: lattice
test <- with(pair65, heated-ambient)
c(mean = mean(test), SD = sd(test), SEM = sd(test)/sqrt(length(test)))
## mean SD SEM
## 6.333 6.103 2.034
## ss 4.1.4: The standard error for the difference of means
## Footnote Code
## Heated vs ambient; unpaired elastic band data
heated <- c(254, 252, 239, 240, 250, 256, 267, 249, 259, 269)
ambient <- c(233, 252, 237, 246, 255, 244, 248, 242, 217, 257, 254)
v1 <- var(heated) # 10 numbers; 10-1 = 9 d.f.
v2 <- var(ambient) # 11 numbers; 11-1 = 10 d.f.
v <- (9*v1 + 10*v2)/(9+10) # Pooled estimate of variance
# Estimate SED; variances may not be equal
c(sem1 = sqrt(v1/10), sem2 = sqrt(v2/11), sed = sqrt(v1/10 + v2/11))
## sem1 sem2 sed
## 3.138 3.538 4.729
# Estimate SED; use pooled estimate
c(sd = sqrt(v), sed = sqrt(v1/10 + v2/11))
## sd sed
## 10.915 4.729
## ss 4.1.5: \kern-4pt* The standard error of the median
## Footnote Code
## median and SD for length, by species: data frame cuckoos (DAAG)
wren <- split(cuckoos$length, cuckoos$species)$wren
median(wren)
## [1] 21
n <- length(wren)
sqrt(pi/2)*sd(wren)/sqrt(n) # this SE computation assumes normality
## [1] 0.2441
## ss 4.1.6: The sampling distribution of the $t$ statistic
## Sec 4.2: Confidence Intervals and Tests of Hypotheses
## Calculations with the $t$-distribution
# Plus or minus 1.96SE normal distribution limits, e.g.
pnorm(1.96) - pnorm(-1.96)
## [1] 0.95
# Plus or minus 2.31SE t distribution (8 df) limits, e.g.
pt(2.31, 8) - pt(-2.31,8) # 2.31 SEs either side
## [1] 0.9503
qnorm(0.975) # normal distribution
## [1] 1.96
qt(0.975, 8) # t-distribution with 8 d.f.
## [1] 2.306
## Confidence intervals of 95\% or 99\%
## Footnote Code
## Detailed calculations; 95% CI for mean of heated-ambient
pair65.diff <- with(pair65, heated-ambient)
pair65.n <- length(pair65.diff)
pair65.se <- sd(pair65.diff)/sqrt(pair65.n)
mean(pair65.diff) + qt(c(.025,.975),8)*pair65.se
## [1] 1.642 11.025
## 95% CI for mean of heated-ambient: data frame pair65 (DAAG)
with(pair65, t.test(heated, ambient, paired=TRUE,
conf.level=0.95)$conf.int)
## [1] 1.642 11.025
## attr(,"conf.level")
## [1] 0.95
## Tests of hypotheses
1-pt(6.33/2.03, 8) # Equals pt(-6.33/2.03, 8)
## [1] 0.007133
## What is a small $p$-value?
## $t$-distribution versus the normal distribution
## How good is the normal theory approximation?
## ss 4.2.1: A summary of one- and two-sample calculations
## Footnote Code
## t-test and confidence interval calculations
heated <- c(254, 252, 239, 240, 250, 256, 267, 249, 259, 269)
ambient <- c(233, 252, 237, 246, 255, 244, 248, 242, 217, 257, 254)
t.test(heated, ambient, var.equal=TRUE)
##
## Two Sample t-test
##
## data: heated and ambient
## t = 1.973, df = 19, p-value = 0.06322
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5723 19.3905
## sample estimates:
## mean of x mean of y
## 253.5 244.1
## When is pairing helpful?
## Footnote Code
## heated vs ambient: pair65 (DAAG); and cross vs self: mignonette (DAAG)
par(mfrow=c(1,2), pty="s")
plot(heated ~ ambient, data=pair65); abline(0, 1) # left panel
with(pair65, abline(mean(heated-ambient), 1, lty=2))
plot(cross ~ self, data=mignonette); abline(0, 1) # right panel
with(mignonette, abline(mean(cross-self), 1, lty=2))
par(mfrow = c(1,1), pty="m")
## What if the standard deviations are unequal?
## Different ways to report results
## Footnote Code
## Different ways to report results: calculations
pair65.diff <- with(pair65, heated-ambient)
n <- length(pair65.diff)
av <- mean(pair65.diff); sd <- sqrt(var(pair65.diff)); se <- sd/sqrt(n)
print(c(mean=av, SED=se, "mean/SED"=av/se)) # Items 1 and 2
## mean SED mean/SED
## 6.333 2.034 3.113
t.test(pair65.diff) # Items 3 and 4
##
## One Sample t-test
##
## data: pair65.diff
## t = 3.113, df = 8, p-value = 0.01438
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 1.642 11.025
## sample estimates:
## mean of x
## 6.333
## ss 4.2.2: Confidence intervals and tests for proportions
## ss 4.2.3: Confidence intervals for the correlation
## ss 4.2.4: Confidence intervals versus hypothesis tests
## Guidelines for significance testing
## Sec 4.3: Contingency Tables
## Footnote Code
## Compare number with a high school qualification, between `untreated' rows
## from data frame psid3 and `treated' rows from nswdemo
library(DAAG) # Data are from DAAG
nswpsid3 <- rbind(psid3, subset(nswdemo, trt==1))
table(nswpsid3$trt, nswpsid3$nodeg)
##
## 0 1
## 0 63 65
## 1 80 217
# PSID3 males are coded 0; NSW male trainees are coded 1.
# To agree with hand calculation below, specify correct=FALSE
chisq.test(with(nswpsid3, table(trt, nodeg)), correct=FALSE)
##
## Pearson's Chi-squared test
##
## data: with(nswpsid3, table(trt, nodeg))
## X-squared = 19.89, df = 1, p-value = 8.189e-06
## The mechanics of the chi-squared test
## An example where a chi-squared test may not be valid
## Footnote Code
## Engine man data
engineman <- matrix(c(5,3,17,85), 2,2)
chisq.test(engineman)
## Warning: Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: engineman
## X-squared = 7.086, df = 1, p-value = 0.00777
## ss 4.3.1: Rare and endangered plant species
## Footnote Code
## Enter the data thus:
rareplants <- matrix(c(37,190,94,23,59,23,10,141,28,15,58,16), ncol=3,
byrow=TRUE, dimnames=list(c("CC","CR","RC","RR"), c("D","W","WD")))
(x2 <- chisq.test(rareplants))
##
## Pearson's Chi-squared test
##
## data: rareplants
## X-squared = 34.99, df = 6, p-value = 4.336e-06
## Footnote Code
## Expected number of species, by habitat (calculate x2 as above)
x2E <- stack(data.frame(t(x2$expected)))
habitat <- rep(c(1,2,3), 4)
plot(x2E$values ~ habitat, axes=FALSE, xlim=c(0.5, 3.5), pch=16,
xlab="habitat", ylab="Expected Number of Species")
text(x2E$values ~ habitat, labels=x2E$ind, pos=rep(c(4,4,2,2),3))
axis(1, at=seq(1,3), labels=c("D", "W", "WD"))
axis(2); box()
## Examination of departures from a consistent overall row pattern
x2 <- chisq.test(rareplants)
## Standardized residuals
residuals(x2)
## D W WD
## CC -0.3693 -1.19598 2.2634
## CR 2.8275 -1.06657 -0.2753
## RC -2.5466 2.36753 -2.0990
## RR 1.2416 0.07224 -1.0227
x2$expected
## D W WD
## CC 39.32 207.22 74.47
## CR 12.86 67.78 24.36
## RC 21.92 115.55 41.53
## RR 10.90 57.45 20.65
## ss 4.3.2: Additional notes
## Interpretation issues
## Modeling approaches
## Sec 4.4: One-Way Unstructured Comparisons
tomato <-
data.frame(weight=
c(1.5, 1.9, 1.3, 1.5, 2.4, 1.5, # water
1.5, 1.2, 1.2, 2.1, 2.9, 1.6, # Nutrient
1.9, 1.6, 0.8, 1.15, 0.9, 1.6), # Nutrient+24D
trt = rep(c("water", "Nutrient", "Nutrient+24D"),
c(6, 6, 6)))
## Make water the first level of trt. It will then appear as
## the initial level in the graphs. In aov or lm calculations,
## it will be the baseline or reference level.
tomato$trt <- relevel(tomato$trt, ref="water")
stripplot(trt~weight, aspect=0.6, data=tomato)
## The tomato data -- a visual summary
## Do analysis of variance calculations
tomato.aov <- aov(weight ~ trt, data=tomato)
## Summarize results graphically
oneway.plot(obj=tomato.aov)
## [1] 1 1 1 1
## The analysis of variance table
anova(tomato.aov)
## Analysis of Variance Table
##
## Response: weight
## Df Sum Sq Mean Sq F value Pr(>F)
## trt 2 0.63 0.314 1.2 0.33
## Residuals 15 3.91 0.261
## ss 4.4.1: Multiple comparisons
## *Microarray data -- severe multiplicity
## ss 4.4.2: Data with a two-way structure, i.e.\ two factors
## Footnote Code
## Simplified version of code
library(lattice)
## Panel A
dotplot(trt ~ ShootDryMass, data=rice, aspect=1,
panel=function(x,y,...){panel.dotplot(x, y, pch=1, col="gray40")
panel.average(x, y, type="p", col="black",
pch=3, cex=1.25)},
xlab="Shoot dry mass (g)")
## Panel B
with(rice, interaction.plot(fert, variety, ShootDryMass,
xlab="Level of first factor"))
## ss 4.4.3: Presentation issues
## Sec 4.5: Response Curves
## Footnote Code
## Data frame modelcars (DAAG)
plot(distance.traveled ~ starting.point, data=modelcars,
xlim=c(0,12.5), xaxs="i", xlab = "Distance up ramp (cm)",
ylab="Distance traveled (cm)")
## Sec 4.6: Data with a Nested Variation Structure
## ss 4.6.1: Degrees of freedom considerations
## ss 4.6.2: General multi-way analysis of variance designs
## Sec 4.7: Resampling Methods for Standard Errors, Tests and Confidence Intervals
## ss 4.7.1: The one-sample permutation test
## ss 4.7.2: The two-sample permutation test
## Footnote Code
## Draw one curve only; permutation distribution of difference in means
x1 <- two65$ambient; x2 <- two65$heated; x <- c(x1, x2)
n1 <- length(x1); n2 <- length(x2); n <- n1+n2
dbar <- mean(x2) - mean(x1)
z <- numeric(2000)
for(i in 1:2000){
mn <- sample(n, n2, replace=FALSE)
dbardash <- mean(x[mn]) - mean(x[-mn])
z[i]<- dbardash
}
plot(density(z), yaxs="i")
abline(v = dbar)
abline(v = -dbar, lty=2)
signif((sum(z > abs(dbar)) + sum (z< -abs(dbar)))/length(z), 3)
## [1] 0.059
## ss 4.7.3: \kern-4pt* Estimating the standard error of the median: bootstrapping
## bootstrap estimate of median of wren length: data frame cuckoos (DAAG)
wren <- split(cuckoos$length, cuckoos$species)$wren
library(boot)
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:lattice':
##
## melanoma
## First define median.fun(), with two required arguments:
## data specifies the data vector,
## indices selects vector elements for a each resample
median.fun <- function(data, indices){median(data[indices])}
## Call boot(), with statistic=median.fun, R = # of resamples
(wren.boot <- boot(data = wren, statistic = median.fun, R = 999))
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = wren, statistic = median.fun, R = 999)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21 0.04805 0.2068
## ss 4.7.4: Bootstrap estimates of confidence intervals
## Bootstrap 95\% confidence intervals for the median
median.fun <- function(data, indices){median(data[indices])}
## Call the boot() function, with statistic=median.fun
wren <- cuckoos[cuckoos$species=="wren", "length"]
(wren.boot <- boot(data=wren, statistic=median.fun, R=9999))
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = wren, statistic = median.fun, R = 9999)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21 0.05434 0.2204
## The correlation coefficient
## Footnote Code
## Bootstrap estimate of 95% CI of cor(chest, belly): data frame possum (DAAG)
possum.fun <- function(data, indices) {
chest <- data$chest[indices]
belly <- data$belly[indices]
cor(belly, chest)
}
possum.boot <- boot(possum, possum.fun, R=9999)
boot.ci(possum.boot, type=c("perc", "bca"))
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 9999 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = possum.boot, type = c("perc", "bca"))
##
## Intervals :
## Level Percentile BCa
## 95% ( 0.4773, 0.7095 ) ( 0.4763, 0.7089 )
## Calculations and Intervals on Original Scale
## The bootstrap -- parting comments
## Sec 4.8: * Theories of Inference
## ss 4.8.1: Maximum likelihood estimation
funlik <- function(mu, sigma, x=with(pair65, heated-ambient))
prod(dnorm(x, mu, sigma))
log(funlik(6.3, 6.1)) # Close to estimated mean and SD
## [1] -28.55
log(funlik(6.33, 5.75)) # Close to the actual mle's
## [1] -28.52
log(funlik(7, 5.75))
## [1] -28.58
## ss 4.8.2: Bayesian estimation
## Bayesian estimation with normal priors and likelihood
## ss 4.8.3: If there is strong prior information, use it!
## Sec 4.9: Recap
## Dos and don'ts
## Sec 4.10: Further Reading
oldpar <- par(mfrow=c(2,2))
tenfold1000 <- rep(1:10, rep(1000,10))
boxplot(split(rnorm(1000*10), tenfold1000), ylab="normal - 1000")
boxplot(split(rt(1000*10, 7), tenfold1000),
ylab=expression(t[7]*" - 1000"))
tenfold100 <- rep(1:10, rep(100, 10))
boxplot(split(rnorm(100*10), tenfold100), ylab="normal - 100")
boxplot(split(rt(100*10, 7), tenfold100),
ylab=expression(t[7]*" - 100"))
par(oldpar)
y1 <- rnorm(51)
y <- y1[-1] + y1[-51]
acf(y1) # acf is `autocorrelation function'
# (see Chapter 9)
acf(y)
## UCBAdmissions is in the datasets package
## For each combination of margins 1 and 2, calculate the sum
UCBtotal <- apply(UCBAdmissions, c(1,2), sum)
apply(UCBAdmissions, 3, function(x)
(x[1,1]*x[2,2])/(x[1,2]*x[2,1]))
## A B C D E F
## 0.3492 0.8025 1.1331 0.9213 1.2216 0.8279
admissions <- array(c(30,30,10,10,15,5,30,10),
dim=c(2,2,2))
## Compare densities for ambient & heated: list two65 (DAAG)
with(two65, overlapDensity(ambient, heated))
## [1] 228.7 269.0
# Do overlapDensity(ambient, heated) with ambient and heated
# taken, if not found elsewhere, from the columns of two65
z.transform <- function(r) .5*log((1+r)/(1-r))
z.inverse <- function(z) (exp(2*z)-1)/(exp(2*z)+1)
possum.fun <- function(data, indices) {
chest <- data$chest[indices]
belly <- data$belly[indices]
z.transform(cor(belly, chest))}
possum.boot <- boot(possum, possum.fun, R=999)
z.inverse(boot.ci(possum.boot, type="perc")$percent[4:5])
## [1] 0.4767 0.7061
# See help(bootci.object). The 4th and 5th elements of
# the percent list element hold the interval endpoints.