## Chapter 7: Exploiting the Linear Model Framework 

## Sec 7.1: Levels of a Factor -- Using Indicator Variables 
## ss 7.1.1: Example -- sugar weight 
## Footnote Code
options()$contrasts  # Check the default factor contrasts 
##         unordered           ordered 
## "contr.treatment"      "contr.poly"
## If your output does not agree with the above, then enter 
options(contrasts=c("contr.treatment", "contr.poly")) 

library(DAAG)
## Loading required package: lattice
levels(sugar$trt)  #  Note the order of the levels 
## [1] "Control" "A"       "B"       "C"
sugar.aov <- aov(weight ~ trt, data=sugar) 
model.matrix(sugar.aov) 
##    (Intercept) trtA trtB trtC
## 1            1    0    0    0
## 2            1    0    0    0
## 3            1    0    0    0
## 4            1    1    0    0
## 5            1    1    0    0
## 6            1    1    0    0
## 7            1    0    1    0
## 8            1    0    1    0
## 9            1    0    1    0
## 10           1    0    0    1
## 11           1    0    0    1
## 12           1    0    0    1
## attr(,"assign")
## [1] 0 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$trt
## [1] "contr.treatment"
summary.lm(sugar.aov)   # NB: summary.lm(),  
## 
## Call:
## aov(formula = weight ~ trt, data = sugar)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.333  -2.783  -0.617   2.175  14.567 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    83.23       4.47   18.61  7.2e-08
## trtA          -21.40       6.33   -3.38  0.00960
## trtB          -15.73       6.33   -2.49  0.03768
## trtC          -34.33       6.33   -5.43  0.00062
## 
## Residual standard error: 7.75 on 8 degrees of freedom
## Multiple R-squared:  0.791,  Adjusted R-squared:  0.713 
## F-statistic: 10.1 on 3 and 8 DF,  p-value: 0.00425
                        # not summary() or summary.aov() 

sem <- summary.lm(sugar.aov)$sigma/sqrt(3)  # 3 results/trt 
# Alternatively, sem <- 6.33/sqrt(2) 
qtukey(p=.95, nmeans=4, df=8) * sem 
## [1] 20.26
## ss 7.1.2: Different choices for the model matrix when there are  factors 
oldoptions <- options(contrasts=c("contr.sum", "contr.poly")) 
 # The mean over all treatment levels is now the baseline. 
 # (The second setting ("contr.poly") is for ordered factors.) 
summary.lm(aov(weight ~ trt, data = sugar)) 
## 
## Call:
## aov(formula = weight ~ trt, data = sugar)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.333  -2.783  -0.617   2.175  14.567 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    65.37       2.24   29.23    2e-09
## trt1           17.87       3.87    4.61   0.0017
## trt2           -3.53       3.87   -0.91   0.3883
## trt3            2.13       3.87    0.55   0.5968
## 
## Residual standard error: 7.75 on 8 degrees of freedom
## Multiple R-squared:  0.791,  Adjusted R-squared:  0.713 
## F-statistic: 10.1 on 3 and 8 DF,  p-value: 0.00425
options(oldoptions)  # Restore default treatment contrasts 


## Sec 7.2: Block Designs and Balanced Incomplete Block Designs 
## ss 7.2.1: Analysis of the rice data, allowing for block effects 
ricebl.aov <- aov(ShootDryMass ~ Block + variety * fert,  
                  data=rice)
summary(ricebl.aov) 
##              Df Sum Sq Mean Sq F value  Pr(>F)
## Block         1   3528    3528    10.9  0.0016
## variety       1  22685   22685    70.1 6.4e-12
## fert          2   7019    3509    10.8 8.6e-05
## variety:fert  2  38622   19311    59.7 1.9e-15
## Residuals    65  21034     324
summary.lm(ricebl.aov) 
## 
## Call:
## aov(formula = ShootDryMass ~ Block + variety * fert, data = rice)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -64.33  -9.81  -0.29  11.69  48.67 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)                129.33       8.21   15.75  < 2e-16
## Block                      -14.00       4.24   -3.30   0.0016
## varietyANU843             -101.00       7.34  -13.75  < 2e-16
## fertNH4Cl                  -58.08       7.34   -7.91  4.2e-11
## fertNH4NO3                 -35.00       7.34   -4.77  1.1e-05
## varietyANU843:fertNH4Cl     97.33      10.39    9.37  1.1e-13
## varietyANU843:fertNH4NO3    99.17      10.39    9.55  5.4e-14
## 
## Residual standard error: 18 on 65 degrees of freedom
## Multiple R-squared:  0.774,  Adjusted R-squared:  0.753 
## F-statistic:   37 on 6 and 65 DF,  p-value: <2e-16
## Footnote Code
## AOV calculations, ignoring block effects 
rice.aov <- aov(ShootDryMass ~ variety * fert, data=rice) 
summary.lm(rice.aov)$sigma 
## [1] 19.29
## ss 7.2.2: A balanced incomplete block design 
table(appletaste$product, appletaste$panelist) 
##      
##       a b c d e f g h i j k l m n o p q r s t
##   298 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
##   493 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1
##   649 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##   937 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0
sapply(appletaste, is.factor)  # panelist & product are factors 
## aftertaste   panelist    product 
##      FALSE       TRUE       TRUE
summary(appletaste.aov <- aov(aftertaste ~ panelist + product,  
                              data=appletaste)) 
##             Df Sum Sq Mean Sq F value Pr(>F)
## panelist    19  30461    1603    2.21  0.019
## product      3  34014   11338   15.60  1e-06
## Residuals   37  26892     727
## Sec 7.3: Fitting Multiple Lines\label{sec:xlines} 
## Fit various models to columns of data frame leaftemp (DAAG) 
leaf.lm1 <- lm(tempDiff ~ 1 , data = leaftemp) 
leaf.lm2 <- lm(tempDiff ~ vapPress, data = leaftemp) 
leaf.lm3 <- lm(tempDiff ~ CO2level + vapPress, data = leaftemp) 
leaf.lm4 <- lm(tempDiff ~ CO2level + vapPress  
               + vapPress:CO2level, data = leaftemp)            

anova(leaf.lm1, leaf.lm2, leaf.lm3, leaf.lm4) 
## Analysis of Variance Table
## 
## Model 1: tempDiff ~ 1
## Model 2: tempDiff ~ vapPress
## Model 3: tempDiff ~ CO2level + vapPress
## Model 4: tempDiff ~ CO2level + vapPress + vapPress:CO2level
##   Res.Df  RSS Df Sum of Sq     F Pr(>F)
## 1     61 40.0                          
## 2     60 34.7  1      5.27 11.33 0.0014
## 3     58 28.2  2      6.54  7.03 0.0019
## 4     56 26.1  2      2.13  2.28 0.1112
summary(leaf.lm3) 
## 
## Call:
## lm(formula = tempDiff ~ CO2level + vapPress, data = leaftemp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6968 -0.5430  0.0608  0.4637  1.3585 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)       2.685      0.560    4.80  1.2e-05
## CO2levelmedium    0.320      0.219    1.46  0.14861
## CO2levelhigh      0.793      0.218    3.64  0.00058
## vapPress         -0.839      0.261   -3.22  0.00213
## 
## Residual standard error: 0.697 on 58 degrees of freedom
## Multiple R-squared:  0.295,  Adjusted R-squared:  0.259 
## F-statistic: 8.11 on 3 and 58 DF,  p-value: 0.000135
## Sec 7.4: Polynomial Regression 
## Fit quadratic curve: data frame seedrates (DAAG) 
seedrates.lm2 <- lm(grain ~ rate + I(rate^2), data = seedrates) 
  # The wrapper function I() ensures that the result from 
  # calculating rate^2 is treated as a variable in its own right. 
plot(grain ~ rate, data = seedrates, pch = 16, 
     xlim = c(50, 160),  cex=1.4) 
new.df <- data.frame(rate = (1:14) * 12.5) # for plotting the fitted curve 
hat2 <- predict(seedrates.lm2, newdata = new.df, interval="predict", 
                coverage = 0.95) 
lines(new.df$rate, hat2[, "fit"], lty = 2, lwd=2) 

plot of chunk unnamed-chunk-1

summary(seedrates.lm2, corr=TRUE) 
## 
## Call:
## lm(formula = grain ~ rate + I(rate^2), data = seedrates)
## 
## Residuals:
##        1        2        3        4        5 
##  0.04571 -0.12286  0.09429 -0.00286 -0.01429 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.060000   0.455694   52.80  0.00036
## rate        -0.066686   0.009911   -6.73  0.02138
## I(rate^2)    0.000171   0.000049    3.50  0.07294
## 
## Residual standard error: 0.115 on 2 degrees of freedom
## Multiple R-squared:  0.996,  Adjusted R-squared:  0.992 
## F-statistic:  256 on 2 and 2 DF,  p-value: 0.0039
## 
## Correlation of Coefficients:
##           (Intercept) rate 
## rate      -0.98            
## I(rate^2)  0.94       -0.99
## ss 7.4.1: Issues in the choice of model 
## Footnote Code
## Fit line, fit curve, determine pointwise bounds, and create the plots 
CIcurves <- 
  function(form=grain~rate, data=seedrates, lty=1, col=3, 
           newdata=data.frame(rate=seq(from=50, to=175, by=25))){ 
    seedrates.lm <- lm(form, data=data) 
    x <- newdata[, all.vars(form)[2]] 
    hat <- predict(seedrates.lm, newdata=newdata, interval="confidence") 
    lines(spline(x, hat[, "fit"])) 
    lines(spline(x, hat[, "lwr"]), lty=lty, col=col) 
    lines(spline(x, hat[, "upr"]), lty=lty, col=col) 
  } 
plot(grain ~ rate, data=seedrates, xlim=c(50,175), ylim=c(15.5,22))  
CIcurves() 
CIcurves(form=grain~rate+I(rate^2), lty=2) 

plot of chunk unnamed-chunk-1

## Sec 7.5: \kern-4pt$^{*}$ Methods for Passing Smooth Curves through Data 
## ss 7.5.1: Scatterplot smoothing -- regression splines 
## Footnote Code
## Fit various models to columns of data frame fruitohms (DAAG) 
library(splines) 
## Panel A 
plot(ohms ~ juice, cex=0.8, xlab="Apparent juice content (%)", 
     ylab="Resistance (ohms)", data=fruitohms) 
CIcurves(form=ohms ~ ns(juice, 2), data=fruitohms, 
        newdata=data.frame(juice=pretty(fruitohms$juice,20))) 

plot of chunk unnamed-chunk-1

## For panels B, C, D replace form = ohms ~ ns(juice,2) by: 
## form = ohms ~ ns(juice,3)   # panel B: nspline, df = 4  
## ohms ~ poly(juice,2)        # panel C: polynomial, df = 3  
## ohms ~ poly(juice,3)        # panel D: polynomial, df = 4  
  # For more information on poly(), see help(poly) and Exercise 15.  

## Footnote Code
## Fit degree 2 normal spline, plot diagnostics 
opar <- par(mfrow = c(2,2), ask=FALSE) 
fruit.lm2 <- lm(ohms ~ ns(juice,2), data=fruitohms)    
  # for panel B: ns(juice,3) 
plot(fruit.lm2) 

plot of chunk unnamed-chunk-1

par(opar) 

summary(fruit.lm2) 
## 
## Call:
## lm(formula = ohms ~ ns(juice, 2), data = fruitohms)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -3577   -620      9    448   3066 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)       8041        260    30.9  < 2e-16
## ns(juice, 2)1    -8596        578   -14.9  < 2e-16
## ns(juice, 2)2    -2105        357    -5.9  3.3e-08
## 
## Residual standard error: 1030 on 125 degrees of freedom
## Multiple R-squared:  0.701,  Adjusted R-squared:  0.696 
## F-statistic:  146 on 2 and 125 DF,  p-value: <2e-16
## Footnote Code
## Display the basis curves as in panel A 
mm2 <- model.matrix(fruit.lm2) 
ylim <- range(mm2[, -1]) 
plot(mm2[, 2] ~ juice, ylim=ylim, xlab="Apparent Juice Content (%)",  
     ylab="Spline basis functions", type="l", data=fruitohms) 
lines(fruitohms$juice, mm2[, 3], col="gray") 

plot of chunk unnamed-chunk-1

  # NB: Values of juice are already ordered 
## For panel B basis curves: mm3 <- model.matrix(fruit.lm3), etc. 

## ss 7.5.2: *Roughness penalty methods, and generalized additive  models(GAMs) 
library(mgcv) 
## Loading required package: nlme
## This is mgcv 1.8-2. For overview type 'help("mgcv-package")'.
## Thin plate regression splines (bs="tp") 
fruit.tp <- gam(ohms ~ s(juice, bs="tp"), data=fruitohms)  
## Plot points, fitted curve, and +/- 1SE limits 
plot(fruit.tp, residuals=TRUE)   

plot of chunk unnamed-chunk-1

## Cubic regression splines (bs="cr") 
fruit.cr <- gam(ohms ~ s(juice, bs="cr"), data=fruitohms)  
## Plot points, fitted curve, and +/- 1SE limits 
plot(fruit.cr, residuals=TRUE)  

plot of chunk unnamed-chunk-1

## ss 7.5.3: Distributional assumptions for automatic choice of  roughness penalty 
## ss 7.5.4: Other smoothing methods 
##                          *Monotone curves 
## The monoProc package is not available for R_3.0.0 or higher. 
## The following uses the 'MonoPoly' package to fit a monotone cubic.
    if(!require(MonoPoly))return('Package "MonoPoly" must be installed')
## Loading required package: MonoPoly
## Loading required package: quadprog
    u <- monpol(ohms~juice, data=fruitohms, degree=3)
    plot(ohms ~ juice, data=fruitohms, xlab="Apparent juice content (%)",
         ylab="Resistance (ohms)", col="gray40")
    ord <- with(fruitohms, order(juice))
    lines(fitted(u)[ord] ~ juice[ord], data=fruitohms, col=2)

plot of chunk unnamed-chunk-1

# library(monoProc) 
# fit.mono <- monoproc(loess(ohms~juice, data=fruitohms),  
#                      bandwidth=0.1, mono1="decreasing",  
#                      gridsize=30) 
# plot(ohms ~ juice, data=fruitohms,  
#      xlab="Apparent juice content (%)", ylab="Resistance (ohms)") 
# lines(fit.mono) 


## Sec 7.6: Smoothing with Multiple Explanatory Variables 
## ss 7.6.1: An additive model with two smooth terms 
## Regression of dewpt vs maxtemp: data frame dewpoint (DAAG) 
library(mgcv) 
ds.gam <- gam(dewpt ~ s(mintemp) + s(maxtemp), data=dewpoint)  
oldpar <- par(mfrow = c(1,2), pty="s")  
plot(ds.gam, se=2)   # se=2: Show 2SE limits 

plot of chunk unnamed-chunk-1

## Try also: plot(ds.gam, se=2, residuals=TRUE, pch=1, cex=0.4) 
par(oldpar)  

## Footnote Code
## Residuals vs maxtemp, for different mintemp ranges 
library(lattice)  
mintempRange <- equal.count(dewpoint$mintemp, number=3) 
xyplot(residuals(ds.gam) ~ maxtemp | mintempRange, data=dewpoint, aspect=1, 
       layout=c(3,1), type=c("p","smooth"),  
       xlab="Maximum temperature", ylab="Residual") 

plot of chunk unnamed-chunk-1

## ss 7.6.2: *A smooth surface 

## Sec 7.7: Further Reading 
roller.lm <- lm(depression~weight, data=roller) 
roller.lm2 <- lm(depression~weight+I(weight^2), data=roller) 

seedrates.lm <- lm(grain ~ rate + I(rate^2), data=seedrates)  
seedrates.pol <- lm(grain ~ poly(rate,2), data=seedrates) 

library(mgcv) 
xy <- data.frame(x=1:200, y=arima.sim(list(ar=0.75), n=200)) 
df.gam <- gam(y ~ s(x), data=xy) 
plot(df.gam, residuals=TRUE) 

plot of chunk unnamed-chunk-1