## Chapter 5: Regression with a Single Predictor 

## Sec 5.1: Fitting a Line to Data 
##                     How accurate is the line? 
## ss 5.1.1: Summary information -- lawn roller example 
## Footnote Code
## Global options used for this and most later output 
options(show.signif.stars=FALSE, digits=4) 
  # show.signif.stars=FALSE suppresses interpretive features that, 
  # for our use of the output, are unhelpful. 

library(DAAG) 
## Loading required package: lattice
## Fit lm model: data from roller (DAAG); output in roller.lm 
roller.lm <- lm(depression ~ weight, data = roller) 
## Use the extractor function summary() to summarize results  
summary(roller.lm) 
## 
## Call:
## lm(formula = depression ~ weight, data = roller)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.18  -5.58  -1.35   5.92   8.02 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    -2.09       4.75   -0.44   0.6723
## weight          2.67       0.70    3.81   0.0052
## 
## Residual standard error: 6.74 on 8 degrees of freedom
## Multiple R-squared:  0.644,  Adjusted R-squared:   0.6 
## F-statistic: 14.5 on 1 and 8 DF,  p-value: 0.00518
## Footnote Code
## Fit model that omits intercept term; i.e., y = bx 
lm(depression ~ -1 + weight, data=roller) 
## 
## Call:
## lm(formula = depression ~ -1 + weight, data = roller)
## 
## Coefficients:
## weight  
##   2.39
## ss 5.1.2: Residual plots 
## A: Plot residuals vs fitted values; B: normal probability plot 
plot(roller.lm, which = 1:2) 

## Footnote Code
## Alternatively, use the following code: 
test <- residuals(roller.lm); n <- length(test) 
av <- mean(test); sdev <- sd(test)  
y <- c(test, rnorm(7*n, av, sdev)) 
fac <- c(rep("residuals(roller.lm)",n), paste("reference", rep(1:7, rep(n,7)))) 
fac <- factor(fac, levels=unique(fac)) 
library(lattice) 
qqmath(~ y|fac, aspect=1, layout=c(4,2)) 

## Normal probability plot, plus 7 reference plots 
qreference(residuals(roller.lm), nrep=8, nrows=2) 

## ss 5.1.3: Iron slag example: is there a pattern in the residuals? 
## Footnote Code
## Panel A: chemical vs magnetic (Data frame ironslag from DAAG) 
plot(chemical ~ magnetic, data=ironslag) 
ironslag.lm <- lm(chemical ~ magnetic, data=ironslag) 
abline(ironslag.lm) 
with(ironslag, lines(lowess(chemical ~ magnetic, f=.9), lty=2)) 

## Footnote Code
## Panel B: Residuals from straight line fit, vs magnetic 
res <- residuals(ironslag.lm) 
plot(res ~ magnetic, data=ironslag, ylab="Residual") 
with(ironslag, lines(lowess(res ~ magnetic, f=.9), lty=2)) 

## Footnote Code
## Panel C: Observed vs predicted 
yhat <- fitted(ironslag.lm) 
plot(chemical ~ yhat, data=ironslag, xlab="Predicted chemical", ylab="Chemical") 
with(ironslag, lines(lowess(chemical ~ yhat, f=.9), lty=2)) 

## Footnote Code
## Panel D: Check whether error variance seems constant 
sqrtabs <- sqrt(abs(res)) 
plot(sqrtabs ~ yhat, data=ironslag, xlab = "Predicted chemical",  
     ylab =  expression(sqrt(abs(residual))), type = "n") 
panel.smooth(yhat, sqrtabs, span = 0.95) 

## ss 5.1.4: The analysis of variance table 
anova(roller.lm) 
## Analysis of Variance Table
## 
## Response: depression
##           Df Sum Sq Mean Sq F value Pr(>F)
## weight     1    658     658    14.5 0.0052
## Residuals  8    363      45
## Sec 5.2: Outliers, Influence and Robust Regression 
softbacks.lm <- lm(weight ~ volume, data=softbacks) 
summary(softbacks.lm) 
## 
## Call:
## lm(formula = weight ~ volume, data = softbacks)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89.67 -39.89 -25.00   9.07 215.91 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   41.372     97.559    0.42  0.68629
## volume         0.686      0.106    6.47  0.00064
## 
## Residual standard error: 102 on 6 degrees of freedom
## Multiple R-squared:  0.875,  Adjusted R-squared:  0.854 
## F-statistic: 41.9 on 1 and 6 DF,  p-value: 0.000644
par(mfrow=c(2,2))    # Use par(mfrow=c(1,4)) for layout in figure 
plot(softbacks.lm, which=1:4) 

 # By default, plots 1:3 and 5 [which=c(1:3,5)] are given 
par(mfrow=c(1,1)) 

##                         Robust regression 
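## A minimal robust-regression sketch: one option is rlm() from the MASS 
## package, whose M-estimation downweights points with large residuals 
## (here, the outlying softback) 
library(MASS) 
softbacks.rlm <- rlm(weight ~ volume, data=softbacks) 
coef(softbacks.rlm)    # compare with coef(softbacks.lm) 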

## Sec 5.3: Standard Errors and Confidence Intervals 
## ss 5.3.1: Confidence intervals and tests for the slope 
## Footnote Code
## Code for confidence interval calculations 
SEb <- summary(roller.lm)$coefficients[2, 2] 
coef(roller.lm)[2] + qt(c(0.025,.975), 8)*SEb 
## [1] 1.052 4.282
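## The same interval, directly from the extractor function confint() 
confint(roller.lm, "weight") 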
## ss 5.3.2: SEs and confidence intervals for predicted values 
## Footnote Code
## Code to obtain fitted values and standard errors (SE, then SE.OBS) 
fit.with.se <- predict(roller.lm, se.fit=TRUE) 
fit.with.se$se.fit                                       # SE 
##  [1] 3.614 2.977 2.881 2.308 2.197 2.130 2.142 2.384 3.370 4.918
sqrt(fit.with.se$se.fit^2+fit.with.se$residual.scale^2)  # SE.OBS 
##  [1] 7.644 7.364 7.326 7.120 7.085 7.064 7.068 7.145 7.532 8.340
## Footnote Code
## Plot depression vs weight, with 95% pointwise bounds for the fitted line 
plot(depression ~ weight, data=roller, xlab = "Weight of Roller (tonnes)", 
     ylab = "Depression in Lawn (mm)", pch = 16) 
roller.lm <- lm(depression ~ weight, data = roller) 
abline(roller.lm$coef, lty = 1) 
xy <- data.frame(weight = pretty(roller$weight, 20)) 
yhat <- predict(roller.lm, newdata = xy, interval="confidence") 
ci <- data.frame(lower=yhat[, "lwr"], upper=yhat[, "upr"]) 
lines(xy$weight, ci$lower, lty = 2, lwd=2, col="grey") 
lines(xy$weight, ci$upper, lty = 2, lwd=2, col="grey") 
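## The bounds above are for the fitted line; bounds for a new observation 
## ("prediction" intervals) are wider -- a brief sketch: 
pred <- predict(roller.lm, newdata=xy, interval="prediction") 
lines(xy$weight, pred[, "lwr"], lty=3, col="grey40") 
lines(xy$weight, pred[, "upr"], lty=3, col="grey40") 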

## ss 5.3.3: *Implications for design 
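## A sketch of the point at issue for design: the SE of the fitted value is 
## smallest near mean(weight) and increases towards the extremes of the range 
wt <- seq(min(roller$weight), max(roller$weight), length.out=5) 
predict(roller.lm, newdata=data.frame(weight=wt), se.fit=TRUE)$se.fit 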

## Sec 5.4: Assessing Predictive Accuracy 
## ss 5.4.1: Training/test sets, and cross-validation 
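## A minimal training/test split sketch (data frame houseprices from DAAG): 
## hold out a random 7 of the 15 houses as a test set 
testrows <- sample(1:15, 7) 
train.lm <- lm(sale.price ~ area, data=houseprices[-testrows, ]) 
testpred <- predict(train.lm, newdata=houseprices[testrows, ]) 
mean((houseprices$sale.price[testrows] - testpred)^2)   # test-set mean square error 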
## ss 5.4.2: Cross-validation -- an example 
## Footnote Code
## Split row numbers randomly into 3 groups 
rand <- sample(1:15)%%3 + 1 
  # a%%3 is the remainder when a is divided by 3 
  # (i.e., what is left after subtracting from a the largest multiple of 3 that is <= a) 
(1:15)[rand == 1]    # Observation numbers for the first group 
## [1]  4  6 12 13 14
(1:15)[rand == 2]    # Observation numbers for the second group 
## [1]  5  8  9 10 11
(1:15)[rand == 3]    # Observation numbers for the third group. 
## [1]  1  2  3  7 15
## Cross-validate lm calculations: data frame houseprices (DAAG) 
houseprices.lm <- lm(sale.price ~ area, data=houseprices) 
CVlm(houseprices, houseprices.lm, plotit=TRUE) 
## Analysis of Variance Table
## 
## Response: sale.price
##           Df Sum Sq Mean Sq F value Pr(>F)
## area       1  18566   18566       8  0.014
## Residuals 13  30179    2321

## 
## fold 1 
## Observations in test set: 5 
##              11  20    21     22   23
## area        802 696 771.0 1006.0 1191
## cvpred      204 188 199.3  234.7  262
## sale.price  215 255 260.0  293.0  375
## CV residual  11  67  60.7   58.3  113
## 
## Sum of squares = 24351    Mean square = 4870    n = 5 
## 
## fold 2 
## Observations in test set: 5 
##              10   13    14      17     18
## area        905  716 963.0 1018.00 887.00
## cvpred      255  224 264.4  273.38 252.06
## sale.price  215  113 185.0  276.00 260.00
## CV residual -40 -112 -79.4    2.62   7.94
## 
## Sum of squares = 20416    Mean square = 4083    n = 5 
## 
## fold 3 
## Observations in test set: 5 
##                 9   12     15    16     19
## area        694.0 1366 821.00 714.0 790.00
## cvpred      183.2  388 221.94 189.3 212.49
## sale.price  192.0  274 212.00 220.0 221.50
## CV residual   8.8 -114  -9.94  30.7   9.01
## 
## Sum of squares = 14241    Mean square = 2848    n = 5 
## 
## Overall (Sum over all 5 folds) 
##   ms 
## 3934
## Footnote Code
## Estimate of sigma^2 from regression output 
summary(houseprices.lm)$sigma^2 
## [1] 2321
## ss 5.4.3: *Bootstrapping 
houseprices.lm <- lm(sale.price ~ area, data=houseprices) 
summary(houseprices.lm)$coef 
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   70.750    60.3477    1.17   0.2621
## area           0.188     0.0664    2.83   0.0142
houseprices.fn <- function (houseprices, index){ 
    house.resample <- houseprices[index, ] 
    house.lm <- lm(sale.price ~ area, data=house.resample) 
    coef(house.lm)[2]    # slope estimate for resampled data 
    } 

set.seed(1028)     # use to replicate the exact results below 
library(boot)      # ensure that the boot package is loaded 
## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:lattice':
## 
##     melanoma
## requires the data frame houseprices (DAAG) 
(houseprices.boot <- boot(houseprices, R=999, statistic=houseprices.fn)) 
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = houseprices, statistic = houseprices.fn, R = 999)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    0.188  0.0126        0.09
housepred.fn <- function(houseprices, index){ 
    house.resample <- houseprices[index, ] 
    house.lm <- lm(sale.price ~ area, data=house.resample) 
    predict(house.lm, newdata=data.frame(area=1200)) 
    } 

## Footnote Code
## 95% CI for predicted price of 1200 square foot house 
housepred.boot <- boot(houseprices, R=999, statistic=housepred.fn) 
boot.ci(housepred.boot, type="perc") # "basic" is an alternative to "perc" 
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 999 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = housepred.boot, type = "perc")
## 
## Intervals : 
## Level     Percentile     
## 95%   (241, 368 )  
## Calculations and Intervals on Original Scale
## Footnote Code
## Bootstrap estimates of prediction errors of house prices 
houseprices2.fn <- function (houseprices, index) 
{ 
  house.resample <- houseprices[index, ] 
  house.lm <- lm(sale.price ~ area, data=house.resample) 
  houseprices$sale.price - predict(house.lm, houseprices)  # resampled prediction  
                                                           # errors 
} 
n <- length(houseprices$area);  R <- 200 
houseprices2.boot <- boot(houseprices, R=R, statistic=houseprices2.fn) 
house.fac <- factor(rep(1:n, rep(R, n))) 
plot(house.fac, as.vector(houseprices2.boot$t), ylab="Prediction Errors",  
     xlab="House") 

## Footnote Code
## Ratios of bootstrap to model-based standard errors 
bootse <- apply(houseprices2.boot$t, 2, sd) 
usualse <- predict.lm(houseprices.lm, se.fit=TRUE)$se.fit 
plot(bootse/usualse, ylab="Ratio of Bootstrap SE's to Model-Based SE's",  
     xlab="House", pch=16) 
abline(1, 0) 

##                             Commentary 

## Sec 5.5: Regression versus Qualitative anova Comparisons -- Issues of Power 
##                       The pattern of change 
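## A small simulated sketch of the comparison at issue: with a quantitative 
## x, the linear trend uses 1 df, whereas treating the x-values as an 
## unordered factor spreads the same comparison over several df 
xsim <- rep(1:4, each=3) 
ysim <- 2 + 0.8*xsim + rnorm(length(xsim)) 
anova(lm(ysim ~ xsim))           # linear trend: 1 df for the predictor 
anova(lm(ysim ~ factor(xsim)))   # qualitative comparison: 3 df for the predictor 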

## Sec 5.6: Logarithmic and Other Transformations 
## ss 5.6.1: *A Note on Power Transformations 
##                   *General power transformations 
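## A minimal sketch of choosing a power transformation for the response: 
## boxcox() from MASS plots the profile likelihood over the power lambda 
## (lambda = 0 corresponds to the log transformation used below) 
library(MASS) 
boxcox(depression ~ weight, data=roller) 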
## ss 5.6.2: Size and shape data -- allometric growth 
##                  The  allometric growth equation 
summary(cfseal.lm <- lm(log(heart) ~ log(weight), data=cfseal)) 
## 
## Call:
## lm(formula = log(heart) ~ log(weight), data = cfseal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3149 -0.0918  0.0000  0.1168  0.3205 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   1.2043     0.2113     5.7  4.1e-06
## log(weight)   1.1261     0.0547    20.6  < 2e-16
## 
## Residual standard error: 0.18 on 28 degrees of freedom
## Multiple R-squared:  0.938,  Adjusted R-squared:  0.936 
## F-statistic:  424 on 1 and 28 DF,  p-value: <2e-16
## Sec 5.7: There are two regression lines! 
##                An alternative to a regression line 
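## A brief sketch of the point of this section, using ironslag (DAAG): 
## the regression of y on x and the regression of x on y give different lines 
b.yx <- coef(lm(chemical ~ magnetic, data=ironslag))[2]   # slope, y on x 
b.xy <- coef(lm(magnetic ~ chemical, data=ironslag))[2]   # slope, x on y 
c(b.yx=unname(b.yx), inverse.b.xy=1/unname(b.xy))   # equal only if r^2 = 1 
## One alternative line is the functional relationship (principal components) 
## line; see the funRel() function near the end of this script 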

## Sec 5.8: The Model Matrix in Regression  
model.matrix(roller.lm) 
##    (Intercept) weight
## 1            1    1.9
## 2            1    3.1
## 3            1    3.3
## 4            1    4.8
## 5            1    5.3
## 6            1    6.1
## 7            1    6.4
## 8            1    7.6
## 9            1    9.8
## 10           1   12.4
## attr(,"assign")
## [1] 0 1
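## The fitted values are the model matrix postmultiplied by the coefficients: 
as.vector(model.matrix(roller.lm) %*% coef(roller.lm))   # same as fitted(roller.lm) 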
## Sec 5.9: *Bayesian regression estimation using the MCMCpack package 
library(MCMCpack) 
## Loading required package: coda
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:DAAG':
## 
##     hills
## 
## ##
## ## Markov Chain Monte Carlo Package (MCMCpack)
## ## Copyright (C) 2003-2014 Andrew D. Martin, Kevin M. Quinn, and Jong Hee Park
## ##
## ## Support provided by the U.S. National Science Foundation
## ## (Grants SES-0350646 and SES-0350613)
## ##
roller.mcmc <- MCMCregress(depression ~ weight, data=roller) 
summary(roller.mcmc) 
## 
## Iterations = 1001:11000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##              Mean     SD Naive SE Time-series SE
## (Intercept) -2.00  5.486  0.05486        0.05316
## weight       2.65  0.812  0.00812        0.00765
## sigma2      60.47 40.610  0.40610        0.52264
## 
## 2. Quantiles for each variable:
## 
##               2.5%   25%   50%   75%  97.5%
## (Intercept) -12.80 -5.38 -1.99  1.29   9.25
## weight        1.01  2.17  2.66  3.16   4.26
## sigma2       21.01 35.40 49.39 71.42 166.23
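## Quick check: posterior means for the coefficients are close to the 
## least-squares estimates (the summary statistics come from the coda package) 
rbind(lm=coef(roller.lm), 
      mcmc=summary(roller.mcmc)$statistics[1:2, "Mean"]) 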
mat <- matrix(c(1:6), byrow=TRUE, ncol=2)  
  # panels are 1, then 2, ... 6. Layout=dim(mat), i.e., 3 by 2 
layout(mat, widths=rep(c(2,1),3), heights=rep(1,6))   
  # NB: widths & heights are relative 
plot(roller.mcmc, auto.layout=FALSE, ask=FALSE, col="gray40")         

  # The method is plot.mcmc() 


## Sec 5.10: Recap 

## Sec 5.11: Methodological References 
## requires the data frame ironslag (DAAG) 
xy <- with(ironslag, lowess(chemical ~ magnetic)) 
chemfit <- approx(xy$x, xy$y, xout=ironslag$magnetic, ties="ordered")$y 
res2 <- with(ironslag,  chemical - chemfit) 
plot(res2 ~ magnetic, data=ironslag)  # Panel A 
abline(h=0, lty=2)   # reference line at zero residual 
sqrtabs2 <- sqrt(abs(res2)) 
plot(sqrtabs2 ~ chemfit, type="n")    # Panel B 
panel.smooth(chemfit, sqrtabs2) 

## Simulate new distance values from the line fitted to elastic1 (DAAG), 
## adding normal errors with SD equal to the estimated residual SD 
e1.lm <- lm(distance ~ stretch, data=elastic1) 

elastic1$newdistance <-  
   cbind(rep(1, 7), elastic1$stretch)%*%coef(e1.lm) +  
         rnorm(7, sd=summary(e1.lm)$sigma) 

"funRel" <- 
  function(x=leafshape$logpet, y=leafshape$loglen, scale=c(1,1)){ 
## Find principal components rotation; see Section 11.1 
## Here (unlike 11.1) the interest is in the final component 
    xy.prc <- prcomp(cbind(x,y), scale=scale) 
    b <- xy.prc$rotation[,2]/scale 
    bxy <- -b[1]/b[2]          # slope of the functional relationship line 
    c(bxy = bxy) 
  } 
## Try the following: 
funRel(scale=c(1,1))    # Take x and y errors as equally important 
## bxy.x 
## 0.449
funRel(scale=c(1,10))   # Error is mostly in y; structural relation 
## bxy.x 
## 0.387
                        # line is close to regression line of y on x 
funRel(scale=c(10,1))   # Error is mostly in x ... 
## bxy.x 
## 0.796
## Note that all lines pass through (xbar, ybar) 
