## Chapter 7: Exploiting the Linear Model Framework

## Sec 7.1: Levels of a Factor -- Using Indicator Variables
## ss 7.1.1: Example -- sugar weight
## Footnote Code
options()$contrasts # Check the default factor contrasts  ## unordered ordered ## "contr.treatment" "contr.poly" ## If your output does not agree with the above, then enter options(contrasts=c("contr.treatment", "contr.poly")) library(DAAG) ## Loading required package: lattice levels(sugar$trt)  #  Note the order of the levels 
## [1] "Control" "A"       "B"       "C"
sugar.aov <- aov(weight ~ trt, data=sugar)
model.matrix(sugar.aov) 
##    (Intercept) trtA trtB trtC
## 1            1    0    0    0
## 2            1    0    0    0
## 3            1    0    0    0
## 4            1    1    0    0
## 5            1    1    0    0
## 6            1    1    0    0
## 7            1    0    1    0
## 8            1    0    1    0
## 9            1    0    1    0
## 10           1    0    0    1
## 11           1    0    0    1
## 12           1    0    0    1
## attr(,"assign")
## [1] 0 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$trt ## [1] "contr.treatment" summary.lm(sugar.aov) # NB: summary.lm(),  ## ## Call: ## aov(formula = weight ~ trt, data = sugar) ## ## Residuals: ## Min 1Q Median 3Q Max ## -13.333 -2.783 -0.617 2.175 14.567 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 83.23 4.47 18.61 7.2e-08 ## trtA -21.40 6.33 -3.38 0.00960 ## trtB -15.73 6.33 -2.49 0.03768 ## trtC -34.33 6.33 -5.43 0.00062 ## ## Residual standard error: 7.75 on 8 degrees of freedom ## Multiple R-squared: 0.791, Adjusted R-squared: 0.713 ## F-statistic: 10.1 on 3 and 8 DF, p-value: 0.00425  # not summary() or summary.aov() sem <- summary.lm(sugar.aov)$sigma/sqrt(3)  # 3 results/trt
# Alternatively, sem <- 6.33/sqrt(2)
qtukey(p=.95, nmeans=4, df=8) * sem 
## [1] 20.26
## ss 7.1.2: Different choices for the model matrix when there are  factors
oldoptions <- options(contrasts=c("contr.sum", "contr.poly"))
# The mean over all treatment levels is now the baseline.
# (The second setting ("contr.poly") is for ordered factors.)
summary.lm(aov(weight ~ trt, data = sugar)) 
##
## Call:
## aov(formula = weight ~ trt, data = sugar)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -13.333  -2.783  -0.617   2.175  14.567
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    65.37       2.24   29.23    2e-09
## trt1           17.87       3.87    4.61   0.0017
## trt2           -3.53       3.87   -0.91   0.3883
## trt3            2.13       3.87    0.55   0.5968
##
## Residual standard error: 7.75 on 8 degrees of freedom
## Multiple R-squared:  0.791,  Adjusted R-squared:  0.713
## F-statistic: 10.1 on 3 and 8 DF,  p-value: 0.00425
options(oldoptions)  # Restore default treatment contrasts

## Sec 7.2: Block Designs and Balanced Incomplete Block Designs
## ss 7.2.1: Analysis of the rice data, allowing for block effects
ricebl.aov <- aov(ShootDryMass ~ Block + variety * fert,
data=rice)
summary(ricebl.aov) 
##              Df Sum Sq Mean Sq F value  Pr(>F)
## Block         1   3528    3528    10.9  0.0016
## variety       1  22685   22685    70.1 6.4e-12
## fert          2   7019    3509    10.8 8.6e-05
## variety:fert  2  38622   19311    59.7 1.9e-15
## Residuals    65  21034     324
summary.lm(ricebl.aov) 
##
## Call:
## aov(formula = ShootDryMass ~ Block + variety * fert, data = rice)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -64.33  -9.81  -0.29  11.69  48.67
##
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)                129.33       8.21   15.75  < 2e-16
## Block                      -14.00       4.24   -3.30   0.0016
## varietyANU843             -101.00       7.34  -13.75  < 2e-16
## fertNH4Cl                  -58.08       7.34   -7.91  4.2e-11
## fertNH4NO3                 -35.00       7.34   -4.77  1.1e-05
## varietyANU843:fertNH4Cl     97.33      10.39    9.37  1.1e-13
## varietyANU843:fertNH4NO3    99.17      10.39    9.55  5.4e-14
##
## Residual standard error: 18 on 65 degrees of freedom
## Multiple R-squared:  0.774,  Adjusted R-squared:  0.753
## F-statistic:   37 on 6 and 65 DF,  p-value: <2e-16
## Footnote Code
## AOV calculations, ignoring block effects
rice.aov <- aov(ShootDryMass ~ variety * fert, data=rice)
summary.lm(rice.aov)$sigma  ## [1] 19.29 ## ss 7.2.2: A balanced incomplete block design table(appletaste$product, appletaste$panelist)  ## ## a b c d e f g h i j k l m n o p q r s t ## 298 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 ## 493 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 ## 649 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ## 937 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 sapply(appletaste, is.factor) # panelist & product are factors  ## aftertaste panelist product ## FALSE TRUE TRUE summary(appletaste.aov <- aov(aftertaste ~ panelist + product, data=appletaste))  ## Df Sum Sq Mean Sq F value Pr(>F) ## panelist 19 30461 1603 2.21 0.019 ## product 3 34014 11338 15.60 1e-06 ## Residuals 37 26892 727 ## Sec 7.3: Fitting Multiple Lines\label{sec:xlines} ## Fit various models to columns of data frame leaftemp (DAAG) leaf.lm1 <- lm(tempDiff ~ 1 , data = leaftemp) leaf.lm2 <- lm(tempDiff ~ vapPress, data = leaftemp) leaf.lm3 <- lm(tempDiff ~ CO2level + vapPress, data = leaftemp) leaf.lm4 <- lm(tempDiff ~ CO2level + vapPress + vapPress:CO2level, data = leaftemp) anova(leaf.lm1, leaf.lm2, leaf.lm3, leaf.lm4)  ## Analysis of Variance Table ## ## Model 1: tempDiff ~ 1 ## Model 2: tempDiff ~ vapPress ## Model 3: tempDiff ~ CO2level + vapPress ## Model 4: tempDiff ~ CO2level + vapPress + vapPress:CO2level ## Res.Df RSS Df Sum of Sq F Pr(>F) ## 1 61 40.0 ## 2 60 34.7 1 5.27 11.33 0.0014 ## 3 58 28.2 2 6.54 7.03 0.0019 ## 4 56 26.1 2 2.13 2.28 0.1112 summary(leaf.lm3)  ## ## Call: ## lm(formula = tempDiff ~ CO2level + vapPress, data = leaftemp) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.6968 -0.5430 0.0608 0.4637 1.3585 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 2.685 0.560 4.80 1.2e-05 ## CO2levelmedium 0.320 0.219 1.46 0.14861 ## CO2levelhigh 0.793 0.218 3.64 0.00058 ## vapPress -0.839 0.261 -3.22 0.00213 ## ## Residual standard error: 0.697 on 58 degrees of freedom ## Multiple R-squared: 0.295, Adjusted R-squared: 0.259 ## F-statistic: 8.11 on 3 and 58 DF, p-value: 0.000135 ## Sec 7.4: Polynomial Regression ## Fit quadratic curve: data frame seedrates (DAAG) seedrates.lm2 <- lm(grain ~ rate + I(rate^2), data = seedrates) # The wrapper function I() ensures that the result from # calculating rate^2 is treated as a variable in its own right. plot(grain ~ rate, data = seedrates, pch = 16, xlim = c(50, 160), cex=1.4) new.df <- data.frame(rate = (1:14) * 12.5) # for plotting the fitted curve hat2 <- predict(seedrates.lm2, newdata = new.df, interval="predict", coverage = 0.95) lines(new.df$rate, hat2[, "fit"], lty = 2, lwd=2) 

summary(seedrates.lm2, corr=TRUE) 
##
## Call:
## lm(formula = grain ~ rate + I(rate^2), data = seedrates)
##
## Residuals:
##        1        2        3        4        5
##  0.04571 -0.12286  0.09429 -0.00286 -0.01429
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.060000   0.455694   52.80  0.00036
## rate        -0.066686   0.009911   -6.73  0.02138
## I(rate^2)    0.000171   0.000049    3.50  0.07294
##
## Residual standard error: 0.115 on 2 degrees of freedom
## Multiple R-squared:  0.996,  Adjusted R-squared:  0.992
## F-statistic:  256 on 2 and 2 DF,  p-value: 0.0039
##
## Correlation of Coefficients:
##           (Intercept) rate
## rate      -0.98
## I(rate^2)  0.94       -0.99
## ss 7.4.1: Issues in the choice of model
## Footnote Code
## Fit line, fit curve, determine pointwise bounds, and create the plots
CIcurves <-
function(form=grain~rate, data=seedrates, lty=1, col=3,
newdata=data.frame(rate=seq(from=50, to=175, by=25))){
seedrates.lm <- lm(form, data=data)
x <- newdata[, all.vars(form)[2]]
hat <- predict(seedrates.lm, newdata=newdata, interval="confidence")
lines(spline(x, hat[, "fit"]))
lines(spline(x, hat[, "lwr"]), lty=lty, col=col)
lines(spline(x, hat[, "upr"]), lty=lty, col=col)
}
plot(grain ~ rate, data=seedrates, xlim=c(50,175), ylim=c(15.5,22))
CIcurves()
CIcurves(form=grain~rate+I(rate^2), lty=2) 

## Sec 7.5: \kern-4pt$^{*}$ Methods for Passing Smooth Curves through Data
## ss 7.5.1: Scatterplot smoothing -- regression splines
## Footnote Code
## Fit various models to columns of data frame fruitohms (DAAG)
library(splines)
## Panel A
plot(ohms ~ juice, cex=0.8, xlab="Apparent juice content (%)",
ylab="Resistance (ohms)", data=fruitohms)
CIcurves(form=ohms ~ ns(juice, 2), data=fruitohms,
newdata=data.frame(juice=pretty(fruitohms$juice,20)))  ## For panels B, C, D replace form = ohms ~ ns(juice,2) by: ## form = ohms ~ ns(juice,3) # panel B: nspline, df = 4 ## ohms ~ poly(juice,2) # panel C: polynomial, df = 3 ## ohms ~ poly(juice,3) # panel D: polynomial, df = 4 # For more information on poly(), see help(poly) and Exercise 15. ## Footnote Code ## Fit degree 2 normal spline, plot diagnostics opar <- par(mfrow = c(2,2), ask=FALSE) fruit.lm2 <- lm(ohms ~ ns(juice,2), data=fruitohms) # for panel B: ns(juice,3) plot(fruit.lm2)  par(opar) summary(fruit.lm2)  ## ## Call: ## lm(formula = ohms ~ ns(juice, 2), data = fruitohms) ## ## Residuals: ## Min 1Q Median 3Q Max ## -3577 -620 9 448 3066 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 8041 260 30.9 < 2e-16 ## ns(juice, 2)1 -8596 578 -14.9 < 2e-16 ## ns(juice, 2)2 -2105 357 -5.9 3.3e-08 ## ## Residual standard error: 1030 on 125 degrees of freedom ## Multiple R-squared: 0.701, Adjusted R-squared: 0.696 ## F-statistic: 146 on 2 and 125 DF, p-value: <2e-16 ## Footnote Code ## Display the basis curves as in panel A mm2 <- model.matrix(fruit.lm2) ylim <- range(mm2[, -1]) plot(mm2[, 2] ~ juice, ylim=ylim, xlab="Apparent Juice Content (%)", ylab="Spline basis functions", type="l", data=fruitohms) lines(fruitohms$juice, mm2[, 3], col="gray")