## Chapter 5: Regression with a Single Predictor
## Sec 5.1: Fitting a Line to Data
## How accurate is the line?
## ss 5.1.1: Summary information -- lawn roller example
## Footnote Code
## Global options used for this and most later output
options(show.signif.stars=FALSE, digits=4)
# show.signif.stars=FALSE suppresses interpretive features that,
# for our use of the output, are unhelpful.
library(DAAG)
## Loading required package: lattice
## Fit lm model: data from roller (DAAG); output in roller.lm
roller.lm <- lm(depression ~ weight, data = roller)
## Use the extractor function summary() to summarize results
summary(roller.lm)
##
## Call:
## lm(formula = depression ~ weight, data = roller)
##
## Residuals:
##    Min     1Q Median     3Q    Max
##  -8.18  -5.58  -1.35   5.92   8.02
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    -2.09       4.75   -0.44   0.6723
## weight          2.67       0.70    3.81   0.0052
##
## Residual standard error: 6.74 on 8 degrees of freedom
## Multiple R-squared: 0.644, Adjusted R-squared: 0.6
## F-statistic: 14.5 on 1 and 8 DF, p-value: 0.00518
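## Added check: the residual standard error reported above is the square
## root of the residual sum of squares divided by its degrees of freedom
sqrt(sum(residuals(roller.lm)^2)/df.residual(roller.lm))   # approx 6.74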
## Footnote Code
## Fit model that omits intercept term; i.e., y = bx
lm(depression ~ -1 + weight, data=roller)
##
## Call:
## lm(formula = depression ~ -1 + weight, data = roller)
##
## Coefficients:
## weight
## 2.39
## ss 5.1.2: Residual plots
## A: Plot residuals vs fitted values; B: normal probability plot
plot(roller.lm, which = 1:2)


## Footnote Code
## Alternatively, use the following code:
test <- residuals(roller.lm); n <- length(test)
av <- mean(test); sdev <- sd(test)
y <- c(test, rnorm(7*n, av, sdev))
fac <- c(rep("residuals(roller.lm)",n), paste("reference", rep(1:7, rep(n,7))))
fac <- factor(fac, levels=unique(fac))
library(lattice)
qqmath(~ y|fac, aspect=1, layout=c(4,2))

## Normal probability plot, plus 7 reference plots
qreference(residuals(roller.lm), nrep=8, nrows=2)

## ss 5.1.3: Iron slag example: is there a pattern in the residuals?
## Footnote Code
## Panel A: chemical vs magnetic (Data frame ironslag from DAAG)
plot(chemical ~ magnetic, data=ironslag)
ironslag.lm <- lm(chemical ~ magnetic, data=ironslag)
abline(ironslag.lm)
with(ironslag, lines(lowess(chemical ~ magnetic, f=.9), lty=2))

## Footnote Code
## Panel B: Residuals from straight line fit, vs magnetic
res <- residuals(ironslag.lm)
plot(res ~ magnetic, ylab="Residual", data=ironslag)
with(ironslag, lines(lowess(res ~ magnetic, f=.9), lty=2))

## Footnote Code
## Panel C: Observed vs predicted
yhat <- fitted(ironslag.lm)
plot(chemical ~ yhat, data=ironslag, xlab="Predicted chemical", ylab="Chemical")
with(ironslag, lines(lowess(chemical ~ yhat, f=.9), lty=2))

## Footnote Code
## Panel D: Check whether error variance seems constant
sqrtabs <- sqrt(abs(res))
plot(sqrtabs ~ yhat, data=ironslag, xlab = "Predicted chemical",
ylab = expression(sqrt(abs(residual))), type = "n")
panel.smooth(yhat, sqrtabs, span = 0.95)

## ss 5.1.4: The analysis of variance table
anova(roller.lm)
## Analysis of Variance Table
##
## Response: depression
##          Df Sum Sq Mean Sq F value Pr(>F)
## weight    1    658     658    14.5 0.0052
## Residuals 8    363      45
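## Added check: the F statistic is the square of the slope's t statistic;
## the sums of squares can be recovered from the fitted model
sum((fitted(roller.lm) - mean(roller$depression))^2)   # SS for weight, approx 658
sum(residuals(roller.lm)^2)                            # residual SS, approx 363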
## Sec 5.2: Outliers, Influence and Robust Regression
softbacks.lm <- lm(weight ~ volume, data=softbacks)
summary(softbacks.lm)
##
## Call:
## lm(formula = weight ~ volume, data = softbacks)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -89.67 -39.89 -25.00   9.07 215.91
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   41.372     97.559    0.42  0.68629
## volume         0.686      0.106    6.47  0.00064
##
## Residual standard error: 102 on 6 degrees of freedom
## Multiple R-squared: 0.875, Adjusted R-squared: 0.854
## F-statistic: 41.9 on 1 and 6 DF, p-value: 0.000644
par(mfrow=c(2,2)) # Use par(mfrow=c(1,4)) for layout in figure
plot(softbacks.lm, which=1:4)

# By default, plots 1:3 and 5 [which=c(1:3,5)] are given
par(mfrow=c(1,1))
## Robust regression
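## Added sketch (not part of the original code): one robust alternative is
## rlm() from the MASS package; comparing its coefficients with those of
## softbacks.lm indicates how much the large residual noted above matters
library(MASS)
softbacks.rlm <- rlm(weight ~ volume, data=softbacks)
coef(softbacks.rlm)
coef(softbacks.lm)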
## Sec 5.3: Standard Errors and Confidence Intervals
## ss 5.3.1: Confidence intervals and tests for the slope
## Footnote Code
## Code for confidence interval calculations
SEb <- summary(roller.lm)$coefficients[2, 2]
coef(roller.lm)[2] + qt(c(0.025,.975), 8)*SEb
## [1] 1.052 4.282
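## Added check: confint() returns the same 95% interval for the slope
confint(roller.lm, parm="weight")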
## ss 5.3.2: SEs and confidence intervals for predicted values
## Footnote Code
## Code to obtain fitted values and standard errors (SE, then SE.OBS)
fit.with.se <- predict(roller.lm, se.fit=TRUE)
fit.with.se$se.fit # SE
## [1] 3.614 2.977 2.881 2.308 2.197 2.130 2.142 2.384 3.370 4.918
sqrt(fit.with.se$se.fit^2+fit.with.se$residual.scale^2) # SE.OBS
## [1] 7.644 7.364 7.326 7.120 7.085 7.064 7.068 7.145 7.532 8.340
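## Added: predict() with interval="prediction" gives 95% bounds for new
## observations directly (half-width = t quantile times SE.OBS); expect a
## warning that predictions on current data refer to future responses
predict(roller.lm, interval="prediction")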
## Footnote Code
## Plot depression vs weight, with 95% pointwise bounds for the fitted line
plot(depression ~ weight, data=roller, xlab = "Weight of Roller (tonnes)",
ylab = "Depression in Lawn (mm)", pch = 16)
roller.lm <- lm(depression ~ weight, data = roller)
abline(roller.lm$coef, lty = 1)
xy <- data.frame(weight = pretty(roller$weight, 20))
yhat <- predict(roller.lm, newdata = xy, interval="confidence")
ci <- data.frame(lower=yhat[, "lwr"], upper=yhat[, "upr"])
lines(xy$weight, ci$lower, lty = 2, lwd=2, col="grey")
lines(xy$weight, ci$upper, lty = 2, lwd=2, col="grey")

## ss 5.3.3: *Implications for design
## Sec 5.4: Assessing Predictive Accuracy
## ss 5.4.1: Training/test sets, and cross-validation
## ss 5.4.2: Cross-validation -- an example
## Footnote Code
## Split row numbers randomly into 3 groups
rand <- sample(1:15) %% 3 + 1
# a %% 3 is the remainder when a is divided by 3, i.e., what is left
# after subtracting from a the largest multiple of 3 that is <= a
(1:15)[rand == 1] # Observation numbers for the first group
## [1] 4 6 12 13 14
(1:15)[rand == 2] # Observation numbers for the second group
## [1] 5 8 9 10 11
(1:15)[rand == 3] # Observation numbers for the third group.
## [1] 1 2 3 7 15
## Cross-validate lm calculations: data frame houseprices (DAAG)
houseprices.lm <- lm(sale.price ~ area, data=houseprices)
CVlm(houseprices, houseprices.lm, plotit=TRUE)
## Analysis of Variance Table
##
## Response: sale.price
##           Df Sum Sq Mean Sq F value Pr(>F)
## area       1  18566   18566       8  0.014
## Residuals 13  30179    2321

##
## fold 1
## Observations in test set: 5
##                   11      20      21      22      23
## area             802     696   771.0  1006.0    1191
## cvpred           204     188   199.3   234.7     262
## sale.price       215     255   260.0   293.0     375
## CV residual       11      67    60.7    58.3     113
##
## Sum of squares = 24351 Mean square = 4870 n = 5
##
## fold 2
## Observations in test set: 5
##                   10      13      14      17      18
## area             905     716   963.0 1018.00  887.00
## cvpred           255     224   264.4  273.38  252.06
## sale.price       215     113   185.0  276.00  260.00
## CV residual      -40    -112   -79.4    2.62    7.94
##
## Sum of squares = 20416 Mean square = 4083 n = 5
##
## fold 3
## Observations in test set: 5
##                    9      12      15      16      19
## area           694.0    1366  821.00   714.0  790.00
## cvpred         183.2     388  221.94   189.3  212.49
## sale.price     192.0     274  212.00   220.0  221.50
## CV residual      8.8    -114   -9.94    30.7    9.01
##
## Sum of squares = 14241 Mean square = 2848 n = 5
##
## Overall (Sum over all 3 folds)
## ms
## 3934
## Footnote Code
## Estimate of sigma^2 from regression output
summary(houseprices.lm)$sigma^2
## [1] 2321
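## Added check: the overall cross-validation mean square is the total of the
## three fold sums of squares divided by the number of observations, and is
## larger than the model-based estimate of sigma^2 above
(24351 + 20416 + 14241)/15   # approx 3934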
## ss 5.4.3: *Bootstrapping
houseprices.lm <- lm(sale.price ~ area, data=houseprices)
summary(houseprices.lm)$coef
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   70.750    60.3477    1.17   0.2621
## area           0.188     0.0664    2.83   0.0142
houseprices.fn <- function (houseprices, index){
    house.resample <- houseprices[index, ]
    house.lm <- lm(sale.price ~ area, data=house.resample)
    coef(house.lm)[2]    # slope estimate for resampled data
}
set.seed(1028) # use to replicate the exact results below
library(boot) # ensure that the boot package is loaded
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:lattice':
##
## melanoma
## requires the data frame houseprices (DAAG)
(houseprices.boot <- boot(houseprices, R=999, statistic=houseprices.fn))
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = houseprices, statistic = houseprices.fn, R = 999)
##
##
## Bootstrap Statistics :
##     original    bias std. error
## t1*    0.188  0.0126       0.09
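## Added: a percentile bootstrap confidence interval for the slope,
## analogous to the interval obtained below for the predicted price
boot.ci(houseprices.boot, type="perc")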
housepred.fn <- function(houseprices, index){
    house.resample <- houseprices[index, ]
    house.lm <- lm(sale.price ~ area, data=house.resample)
    predict(house.lm, newdata=data.frame(area=1200))
}
## Footnote Code
## 95% CI for predicted price of 1200 square foot house
housepred.boot <- boot(houseprices, R=999, statistic=housepred.fn)
boot.ci(housepred.boot, type="perc") # "basic" is an alternative to "perc"
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 999 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = housepred.boot, type = "perc")
##
## Intervals :
## Level Percentile
## 95% (241, 368 )
## Calculations and Intervals on Original Scale
## Footnote Code
## Bootstrap estimates of prediction errors of house prices
houseprices2.fn <- function (houseprices, index)
{
    house.resample <- houseprices[index, ]
    house.lm <- lm(sale.price ~ area, data=house.resample)
    ## resampled prediction errors
    houseprices$sale.price - predict(house.lm, houseprices)
}
n <- length(houseprices$area); R <- 200
houseprices2.boot <- boot(houseprices, R=R, statistic=houseprices2.fn)
house.fac <- factor(rep(1:n, rep(R, n)))
plot(house.fac, as.vector(houseprices2.boot$t), ylab="Prediction Errors",
xlab="House")

## Footnote Code
## Ratios of bootstrap to model-based standard errors
bootse <- apply(houseprices2.boot$t, 2, sd)
usualse <- predict.lm(houseprices.lm, se.fit=TRUE)$se.fit
plot(bootse/usualse, ylab="Ratio of Bootstrap SE's to Model-Based SE's",
xlab="House", pch=16)
abline(1, 0)

## Commentary
## Sec 5.5: Regression versus Qualitative anova Comparisons -- Issues of Power
## The pattern of change
## Sec 5.6: Logarithmic and Other Transformations
## ss 5.6.1: *A Note on Power Transformations
## *General power transformations
## ss 5.6.2: Size and shape data -- allometric growth
## The allometric growth equation
summary(cfseal.lm <- lm(log(heart) ~ log(weight), data=cfseal))
##
## Call:
## lm(formula = log(heart) ~ log(weight), data = cfseal)
##
## Residuals:
##      Min      1Q  Median      3Q     Max
##  -0.3149 -0.0918  0.0000  0.1168  0.3205
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   1.2043     0.2113     5.7  4.1e-06
## log(weight)   1.1261     0.0547    20.6  < 2e-16
##
## Residual standard error: 0.18 on 28 degrees of freedom
## Multiple R-squared: 0.938, Adjusted R-squared: 0.936
## F-statistic: 424 on 1 and 28 DF, p-value: <2e-16
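## Added: back-transforming gives the fitted allometric equation
## heart = exp(intercept) * weight^slope
exp(coef(cfseal.lm)[1])   # multiplier, approx 3.33
coef(cfseal.lm)[2]        # exponent, approx 1.13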
## Sec 5.7: There are two regression lines!
## An alternative to a regression line
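## Added sketch (not part of the original code): the regression of y on x
## and the regression of x on y give two different lines, both passing
## through (xbar, ybar).  Illustrated here with the roller data; funRel()
## at the end of this script gives the functional-relation alternative.
plot(depression ~ weight, data=roller)
abline(coef(roller.lm))                            # depression on weight
bwd <- coef(lm(weight ~ depression, data=roller))
abline(-bwd[1]/bwd[2], 1/bwd[2], lty=2)            # weight on depression, rearranged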
## Sec 5.8: The Model Matrix in Regression
model.matrix(roller.lm)
##    (Intercept) weight
## 1            1    1.9
## 2            1    3.1
## 3            1    3.3
## 4            1    4.8
## 5            1    5.3
## 6            1    6.1
## 7            1    6.4
## 8            1    7.6
## 9            1    9.8
## 10           1   12.4
## attr(,"assign")
## [1] 0 1
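## Added check: fitted values are the model matrix multiplied by the
## estimated coefficients
as.vector(model.matrix(roller.lm) %*% coef(roller.lm))   # same as fitted(roller.lm)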
## Sec 5.9: *Bayesian regression estimation using the MCMCpack package
library(MCMCpack)
## Loading required package: coda
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:DAAG':
##
## hills
##
## ##
## ## Markov Chain Monte Carlo Package (MCMCpack)
## ## Copyright (C) 2003-2014 Andrew D. Martin, Kevin M. Quinn, and Jong Hee Park
## ##
## ## Support provided by the U.S. National Science Foundation
## ## (Grants SES-0350646 and SES-0350613)
## ##
roller.mcmc <- MCMCregress(depression ~ weight, data=roller)
summary(roller.mcmc)
##
## Iterations = 1001:11000
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
##              Mean     SD Naive SE Time-series SE
## (Intercept) -2.00  5.486  0.05486        0.05316
## weight       2.65  0.812  0.00812        0.00765
## sigma2      60.47 40.610  0.40610        0.52264
##
## 2. Quantiles for each variable:
##
##                2.5%     25%     50%     75%   97.5%
## (Intercept)  -12.80   -5.38   -1.99    1.29    9.25
## weight         1.01    2.17    2.66    3.16    4.26
## sigma2        21.01   35.40   49.39   71.42  166.23
mat <- matrix(c(1:6), byrow=TRUE, ncol=2)
# panels are numbered 1, 2, ..., 6 and filled row by row; the layout is dim(mat), i.e., 3 by 2
layout(mat, widths=rep(c(2,1),3), heights=rep(1,6))
# NB: widths & heights are relative
plot(roller.mcmc, auto.layout=FALSE, ask=FALSE, col="gray40")

# The method is plot.mcmc()
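## Added: coda's HPDinterval() gives 95% highest posterior density intervals,
## for comparison with the quantile-based intervals above
HPDinterval(roller.mcmc)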
## Sec 5.10: Recap
## Sec 5.11: Methodological References
## requires the data frame ironslag (DAAG)
xy <- with(ironslag, lowess(chemical ~ magnetic))
chemfit <- approx(xy$x, xy$y, xout=ironslag$magnetic, ties="ordered")$y
res2 <- with(ironslag, chemical - chemfit)
plot(res2 ~ magnetic, data=ironslag) # Panel A
abline(h=0, lty=2)
sqrtabs2 <- sqrt(abs(res2))
plot(sqrtabs2 ~ chemfit, type="n") # Panel B
panel.smooth(chemfit, sqrtabs2)
e1.lm <- lm(distance ~ stretch, data=elastic1)
elastic1$newdistance <-
cbind(rep(1, 7), elastic1$stretch)%*%coef(e1.lm) +
rnorm(7, sd=summary(e1.lm)$sigma)
"funRel" <-
function(x=leafshape$logpet, y=leafshape$loglen, scale=c(1,1)){
    ## Find principal components rotation; see Section 11.1
    ## Here (unlike 11.1) the interest is in the final component
    xy.prc <- prcomp(cbind(x,y), scale.=scale)
    b <- xy.prc$rotation[,2]/scale
    bxy <- -b[1]/b[2]    # slope of functional eqn line
    c(bxy = bxy)
}
## Try the following:
funRel(scale=c(1,1)) # Take x and y errors as equally important
## bxy.x
## 0.449
funRel(scale=c(1,10)) # Error is mostly in y; structural relation
## bxy.x
## 0.387
# line is close to regression line of y on x
funRel(scale=c(10,1)) # Error is mostly in x ...
## bxy.x
## 0.796
## Note that all lines pass through (xbar, ybar)
