## Chapter 7: Exploiting the Linear Model Framework
## Sec 7.1: Levels of a Factor -- Using Indicator Variables
## ss 7.1.1: Example -- sugar weight
## Footnote Code
options()$contrasts # Check the default factor contrasts
## unordered ordered
## "contr.treatment" "contr.poly"
## If your output does not agree with the above, then enter
options(contrasts=c("contr.treatment", "contr.poly"))
library(DAAG)
## Loading required package: lattice
levels(sugar$trt) # Note the order of the levels
## [1] "Control" "A" "B" "C"
sugar.aov <- aov(weight ~ trt, data=sugar)
model.matrix(sugar.aov)
## (Intercept) trtA trtB trtC
## 1 1 0 0 0
## 2 1 0 0 0
## 3 1 0 0 0
## 4 1 1 0 0
## 5 1 1 0 0
## 6 1 1 0 0
## 7 1 0 1 0
## 8 1 0 1 0
## 9 1 0 1 0
## 10 1 0 0 1
## 11 1 0 0 1
## 12 1 0 0 1
## attr(,"assign")
## [1] 0 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$trt
## [1] "contr.treatment"
summary.lm(sugar.aov) # NB: summary.lm(),
##
## Call:
## aov(formula = weight ~ trt, data = sugar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.333 -2.783 -0.617 2.175 14.567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.23 4.47 18.61 7.2e-08
## trtA -21.40 6.33 -3.38 0.00960
## trtB -15.73 6.33 -2.49 0.03768
## trtC -34.33 6.33 -5.43 0.00062
##
## Residual standard error: 7.75 on 8 degrees of freedom
## Multiple R-squared: 0.791, Adjusted R-squared: 0.713
## F-statistic: 10.1 on 3 and 8 DF, p-value: 0.00425
# not summary() or summary.aov()
sem <- summary.lm(sugar.aov)$sigma/sqrt(3) # 3 results/trt
# Alternatively, sem <- 6.33/sqrt(2)
qtukey(p=.95, nmeans=4, df=8) * sem
## [1] 20.26
## ss 7.1.2: Different choices for the model matrix when there are factors
oldoptions <- options(contrasts=c("contr.sum", "contr.poly"))
# The mean over all treatment levels is now the baseline.
# (The second setting ("contr.poly") is for ordered factors.)
summary.lm(aov(weight ~ trt, data = sugar))
##
## Call:
## aov(formula = weight ~ trt, data = sugar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.333 -2.783 -0.617 2.175 14.567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.37 2.24 29.23 2e-09
## trt1 17.87 3.87 4.61 0.0017
## trt2 -3.53 3.87 -0.91 0.3883
## trt3 2.13 3.87 0.55 0.5968
##
## Residual standard error: 7.75 on 8 degrees of freedom
## Multiple R-squared: 0.791, Adjusted R-squared: 0.713
## F-statistic: 10.1 on 3 and 8 DF, p-value: 0.00425
options(oldoptions) # Restore default treatment contrasts
## Sec 7.2: Block Designs and Balanced Incomplete Block Designs
## ss 7.2.1: Analysis of the rice data, allowing for block effects
ricebl.aov <- aov(ShootDryMass ~ Block + variety * fert,
data=rice)
summary(ricebl.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Block 1 3528 3528 10.9 0.0016
## variety 1 22685 22685 70.1 6.4e-12
## fert 2 7019 3509 10.8 8.6e-05
## variety:fert 2 38622 19311 59.7 1.9e-15
## Residuals 65 21034 324
summary.lm(ricebl.aov)
##
## Call:
## aov(formula = ShootDryMass ~ Block + variety * fert, data = rice)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64.33 -9.81 -0.29 11.69 48.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 129.33 8.21 15.75 < 2e-16
## Block -14.00 4.24 -3.30 0.0016
## varietyANU843 -101.00 7.34 -13.75 < 2e-16
## fertNH4Cl -58.08 7.34 -7.91 4.2e-11
## fertNH4NO3 -35.00 7.34 -4.77 1.1e-05
## varietyANU843:fertNH4Cl 97.33 10.39 9.37 1.1e-13
## varietyANU843:fertNH4NO3 99.17 10.39 9.55 5.4e-14
##
## Residual standard error: 18 on 65 degrees of freedom
## Multiple R-squared: 0.774, Adjusted R-squared: 0.753
## F-statistic: 37 on 6 and 65 DF, p-value: <2e-16
## Footnote Code
## AOV calculations, ignoring block effects
rice.aov <- aov(ShootDryMass ~ variety * fert, data=rice)
summary.lm(rice.aov)$sigma
## [1] 19.29
## ss 7.2.2: A balanced incomplete block design
table(appletaste$product, appletaste$panelist)
##
## a b c d e f g h i j k l m n o p q r s t
## 298 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
## 493 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1
## 649 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 937 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0
sapply(appletaste, is.factor) # panelist & product are factors
## aftertaste panelist product
## FALSE TRUE TRUE
summary(appletaste.aov <- aov(aftertaste ~ panelist + product,
data=appletaste))
## Df Sum Sq Mean Sq F value Pr(>F)
## panelist 19 30461 1603 2.21 0.019
## product 3 34014 11338 15.60 1e-06
## Residuals 37 26892 727
## Sec 7.3: Fitting Multiple Lines\label{sec:xlines}
## Fit various models to columns of data frame leaftemp (DAAG)
leaf.lm1 <- lm(tempDiff ~ 1 , data = leaftemp)
leaf.lm2 <- lm(tempDiff ~ vapPress, data = leaftemp)
leaf.lm3 <- lm(tempDiff ~ CO2level + vapPress, data = leaftemp)
leaf.lm4 <- lm(tempDiff ~ CO2level + vapPress
+ vapPress:CO2level, data = leaftemp)
anova(leaf.lm1, leaf.lm2, leaf.lm3, leaf.lm4)
## Analysis of Variance Table
##
## Model 1: tempDiff ~ 1
## Model 2: tempDiff ~ vapPress
## Model 3: tempDiff ~ CO2level + vapPress
## Model 4: tempDiff ~ CO2level + vapPress + vapPress:CO2level
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 61 40.0
## 2 60 34.7 1 5.27 11.33 0.0014
## 3 58 28.2 2 6.54 7.03 0.0019
## 4 56 26.1 2 2.13 2.28 0.1112
summary(leaf.lm3)
##
## Call:
## lm(formula = tempDiff ~ CO2level + vapPress, data = leaftemp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6968 -0.5430 0.0608 0.4637 1.3585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.685 0.560 4.80 1.2e-05
## CO2levelmedium 0.320 0.219 1.46 0.14861
## CO2levelhigh 0.793 0.218 3.64 0.00058
## vapPress -0.839 0.261 -3.22 0.00213
##
## Residual standard error: 0.697 on 58 degrees of freedom
## Multiple R-squared: 0.295, Adjusted R-squared: 0.259
## F-statistic: 8.11 on 3 and 58 DF, p-value: 0.000135
## Sec 7.4: Polynomial Regression
## Fit quadratic curve: data frame seedrates (DAAG)
seedrates.lm2 <- lm(grain ~ rate + I(rate^2), data = seedrates)
# The wrapper function I() ensures that the result from
# calculating rate^2 is treated as a variable in its own right.
plot(grain ~ rate, data = seedrates, pch = 16,
xlim = c(50, 160), cex=1.4)
new.df <- data.frame(rate = (1:14) * 12.5) # for plotting the fitted curve
hat2 <- predict(seedrates.lm2, newdata = new.df, interval="predict",
coverage = 0.95)
lines(new.df$rate, hat2[, "fit"], lty = 2, lwd=2)

summary(seedrates.lm2, corr=TRUE)
##
## Call:
## lm(formula = grain ~ rate + I(rate^2), data = seedrates)
##
## Residuals:
## 1 2 3 4 5
## 0.04571 -0.12286 0.09429 -0.00286 -0.01429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.060000 0.455694 52.80 0.00036
## rate -0.066686 0.009911 -6.73 0.02138
## I(rate^2) 0.000171 0.000049 3.50 0.07294
##
## Residual standard error: 0.115 on 2 degrees of freedom
## Multiple R-squared: 0.996, Adjusted R-squared: 0.992
## F-statistic: 256 on 2 and 2 DF, p-value: 0.0039
##
## Correlation of Coefficients:
## (Intercept) rate
## rate -0.98
## I(rate^2) 0.94 -0.99
## ss 7.4.1: Issues in the choice of model
## Footnote Code
## Fit line, fit curve, determine pointwise bounds, and create the plots
CIcurves <-
function(form=grain~rate, data=seedrates, lty=1, col=3,
newdata=data.frame(rate=seq(from=50, to=175, by=25))){
seedrates.lm <- lm(form, data=data)
x <- newdata[, all.vars(form)[2]]
hat <- predict(seedrates.lm, newdata=newdata, interval="confidence")
lines(spline(x, hat[, "fit"]))
lines(spline(x, hat[, "lwr"]), lty=lty, col=col)
lines(spline(x, hat[, "upr"]), lty=lty, col=col)
}
plot(grain ~ rate, data=seedrates, xlim=c(50,175), ylim=c(15.5,22))
CIcurves()
CIcurves(form=grain~rate+I(rate^2), lty=2)

## Sec 7.5: \kern-4pt$^{*}$ Methods for Passing Smooth Curves through Data
## ss 7.5.1: Scatterplot smoothing -- regression splines
## Footnote Code
## Fit various models to columns of data frame fruitohms (DAAG)
library(splines)
## Panel A
plot(ohms ~ juice, cex=0.8, xlab="Apparent juice content (%)",
ylab="Resistance (ohms)", data=fruitohms)
CIcurves(form=ohms ~ ns(juice, 2), data=fruitohms,
newdata=data.frame(juice=pretty(fruitohms$juice,20)))

## For panels B, C, D replace form = ohms ~ ns(juice,2) by:
## form = ohms ~ ns(juice,3) # panel B: nspline, df = 4
## ohms ~ poly(juice,2) # panel C: polynomial, df = 3
## ohms ~ poly(juice,3) # panel D: polynomial, df = 4
# For more information on poly(), see help(poly) and Exercise 15.
## Footnote Code
## Fit degree 2 normal spline, plot diagnostics
opar <- par(mfrow = c(2,2), ask=FALSE)
fruit.lm2 <- lm(ohms ~ ns(juice,2), data=fruitohms)
# for panel B: ns(juice,3)
plot(fruit.lm2)

par(opar)
summary(fruit.lm2)
##
## Call:
## lm(formula = ohms ~ ns(juice, 2), data = fruitohms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3577 -620 9 448 3066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8041 260 30.9 < 2e-16
## ns(juice, 2)1 -8596 578 -14.9 < 2e-16
## ns(juice, 2)2 -2105 357 -5.9 3.3e-08
##
## Residual standard error: 1030 on 125 degrees of freedom
## Multiple R-squared: 0.701, Adjusted R-squared: 0.696
## F-statistic: 146 on 2 and 125 DF, p-value: <2e-16
## Footnote Code
## Display the basis curves as in panel A
mm2 <- model.matrix(fruit.lm2)
ylim <- range(mm2[, -1])
plot(mm2[, 2] ~ juice, ylim=ylim, xlab="Apparent Juice Content (%)",
ylab="Spline basis functions", type="l", data=fruitohms)
lines(fruitohms$juice, mm2[, 3], col="gray")

# NB: Values of juice are already ordered
## For panel B basis curves: mm3 <- model.matrix(fruit.lm3), etc.
## ss 7.5.2: *Roughness penalty methods, and generalized additive models(GAMs)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-2. For overview type 'help("mgcv-package")'.
## Thin plate regression splines (bs="tp")
fruit.tp <- gam(ohms ~ s(juice, bs="tp"), data=fruitohms)
## Plot points, fitted curve, and +/- 1SE limits
plot(fruit.tp, residuals=TRUE)

## Cubic regression splines (bs="cr")
fruit.cr <- gam(ohms ~ s(juice, bs="cr"), data=fruitohms)
## Plot points, fitted curve, and +/- 1SE limits
plot(fruit.cr, residuals=TRUE)

## ss 7.5.3: Distributional assumptions for automatic choice of roughness penalty
## ss 7.5.4: Other smoothing methods
## *Monotone curves
## The monoProc package is not available for R_3.0.0 or higher.
## The following uses the 'MonoPoly' package to fit a monotone cubic.
if(!require(MonoPoly))return('Package "MonoPoly" must be installed')
## Loading required package: MonoPoly
## Loading required package: quadprog
u <- monpol(ohms~juice, data=fruitohms, degree=3)
plot(ohms ~ juice, data=fruitohms, xlab="Apparent juice content (%)",
ylab="Resistance (ohms)", col="gray40")
ord <- with(fruitohms, order(juice))
lines(fitted(u)[ord] ~ juice[ord], data=fruitohms, col=2)

# library(monoProc)
# fit.mono <- monoproc(loess(ohms~juice, data=fruitohms),
# bandwidth=0.1, mono1="decreasing",
# gridsize=30)
# plot(ohms ~ juice, data=fruitohms,
# xlab="Apparent juice content (%)", ylab="Resistance (ohms)")
# lines(fit.mono)
## Sec 7.6: Smoothing with Multiple Explanatory Variables
## ss 7.6.1: An additive model with two smooth terms
## Regression of dewpt vs maxtemp: data frame dewpoint (DAAG)
library(mgcv)
ds.gam <- gam(dewpt ~ s(mintemp) + s(maxtemp), data=dewpoint)
oldpar <- par(mfrow = c(1,2), pty="s")
plot(ds.gam, se=2) # se=2: Show 2SE limits

## Try also: plot(ds.gam, se=2, residuals=TRUE, pch=1, cex=0.4)
par(oldpar)
## Footnote Code
## Residuals vs maxtemp, for different mintemp ranges
library(lattice)
mintempRange <- equal.count(dewpoint$mintemp, number=3)
xyplot(residuals(ds.gam) ~ maxtemp | mintempRange, data=dewpoint, aspect=1,
layout=c(3,1), type=c("p","smooth"),
xlab="Maximum temperature", ylab="Residual")

## ss 7.6.2: *A smooth surface
## Sec 7.7: Further Reading
roller.lm <- lm(depression~weight, data=roller)
roller.lm2 <- lm(depression~weight+I(weight^2), data=roller)
seedrates.lm <- lm(grain ~ rate + I(rate^2), data=seedrates)
seedrates.pol <- lm(grain ~ poly(rate,2), data=seedrates)
library(mgcv)
xy <- data.frame(x=1:200, y=arima.sim(list(ar=0.75), n=200))
df.gam <- gam(y ~ s(x), data=xy)
plot(df.gam, residuals=TRUE)
