## Chapter 6: Multiple Linear Regression
## Sec 6.1: Basic Ideas: a Book Weight Example
## Plot weight vs volume: data frame allbacks (DAAG)
par(pty="s")
library(DAAG)
## Loading required package: lattice
plot(weight ~ volume, data=allbacks, pch=c(16,1)[unclass(cover)])
# unclass(cover) gives the integer codes that identify levels
with(allbacks, text(weight ~ volume, labels=paste(1:15),
pos=c(2,4)[unclass(cover)]))

summary(allbacks.lm <- lm(weight ~ volume+area, data=allbacks))
##
## Call:
## lm(formula = weight ~ volume + area, data = allbacks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104.1 -30.0 -15.5 16.8 212.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.4134 58.4025 0.38 0.70786
## volume 0.7082 0.0611 11.60 7.1e-08
## area 0.4684 0.1019 4.59 0.00062
##
## Residual standard error: 77.7 on 12 degrees of freedom
## Multiple R-squared: 0.928, Adjusted R-squared: 0.917
## F-statistic: 77.9 on 2 and 12 DF, p-value: 1.34e-07
## coefficient estimates and SEs only: summary(allbacks.lm)$coef
par(pty="m")
## Footnote Code
## 5% critical value; t-statistic with 12 d.f.
qt(0.975, 12)
## [1] 2.179
anova(allbacks.lm)
## Analysis of Variance Table
##
## Response: weight
## Df Sum Sq Mean Sq F value Pr(>F)
## volume 1 812132 812132 134.7 7e-08
## area 1 127328 127328 21.1 0.00062
## Residuals 12 72373 6031
## Footnote Code
## Correlation of volume with area
with(allbacks, cor(volume,area))
## [1] 0.001535
model.matrix(allbacks.lm)
## (Intercept) volume area
## 1 1 885 382
## 2 1 1016 468
## 3 1 1125 387
## 4 1 239 371
## 5 1 701 371
## 6 1 641 367
## 7 1 1228 396
## 8 1 412 0
## 9 1 953 0
## 10 1 929 0
## 11 1 1492 0
## 12 1 419 0
## 13 1 1010 0
## 14 1 595 0
## 15 1 1034 0
## attr(,"assign")
## [1] 0 1 2
## ss 6.1.1: Omission of the intercept term
allbacks.lm0 <- lm(weight ~ -1+volume+area, data=allbacks)
summary(allbacks.lm0)
##
## Call:
## lm(formula = weight ~ -1 + volume + area, data = allbacks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -112.5 -28.7 -10.5 24.6 213.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## volume 0.7289 0.0277 26.34 1.1e-12
## area 0.4809 0.0934 5.15 0.00019
##
## Residual standard error: 75.1 on 13 degrees of freedom
## Multiple R-squared: 0.991, Adjusted R-squared: 0.99
## F-statistic: 748 on 2 and 13 DF, p-value: 3.8e-14
## Display correlations between estimates of model coefficients
summary(allbacks.lm, corr=TRUE)
##
## Call:
## lm(formula = weight ~ volume + area, data = allbacks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104.1 -30.0 -15.5 16.8 212.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.4134 58.4025 0.38 0.70786
## volume 0.7082 0.0611 11.60 7.1e-08
## area 0.4684 0.1019 4.59 0.00062
##
## Residual standard error: 77.7 on 12 degrees of freedom
## Multiple R-squared: 0.928, Adjusted R-squared: 0.917
## F-statistic: 77.9 on 2 and 12 DF, p-value: 1.34e-07
##
## Correlation of Coefficients:
## (Intercept) volume
## volume -0.88
## area -0.32 0.00
## ss 6.1.2: Diagnostic plots
par(mfrow=c(2,2), pty="s") # Get all 4 plots on one page
plot(allbacks.lm0)

par(mfrow=c(1,1), pty="m")
allbacks.lm13 <- lm(weight ~ -1+volume+area, data=allbacks[-13, ])
summary(allbacks.lm13)
##
## Call:
## lm(formula = weight ~ -1 + volume + area, data = allbacks[-13,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.72 -25.29 3.43 31.24 58.86
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## volume 0.6949 0.0163 42.6 1.8e-14
## area 0.5539 0.0527 10.5 2.1e-07
##
## Residual standard error: 41 on 12 degrees of freedom
## Multiple R-squared: 0.997, Adjusted R-squared: 0.997
## F-statistic: 2.25e+03 on 2 and 12 DF, p-value: 3.52e-16
## Sec 6.2: The Interpretation of Model Coefficients
## ss 6.2.1: Times for Northern Irish hill races
## Footnote Code
## : data frame nihills (DAAG)
## Panel A: Scatterplot matrix, untransformed data, data frame nihills (DAAG)
library(lattice); library(DAAG)
splom(~ nihills[, c("dist","climb","time")], cex.labels=1.2,
varnames=c("dist\n(miles)","climb\n(feet)", "time\n(hours)"))

## Panel B: log transformed data
splom(~ log(nihills[, c("dist","climb","time")]), cex.labels=1.2,
varnames=c("dist\n(log miles)", "climb\n(log feet)", "time\n(log hours)"))

nihills.lm <- lm(log(time) ~ log(dist) + log(climb), data = nihills)
par(mfrow=c(2,2), pty="s")
plot(nihills.lm)

par(mfrow=c(1,1), pty="m")
summary(nihills.lm)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.9611 0.27387 -18.11 7.085e-14
## log(dist) 0.6814 0.05518 12.35 8.186e-11
## log(climb) 0.4658 0.04530 10.28 1.981e-09
## Interpreting the coefficients
## A meaningful coefficient for {logdist}
lognihills <- log(nihills)
names(lognihills) <- paste("log", names(nihills), sep="")
lognihills$logGrad <- with(nihills, log(climb/dist))
nihillsG.lm <- lm(logtime ~ logdist + logGrad, data=lognihills)
summary(nihillsG.lm)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.9611 0.2739 -18.11 7.085e-14
## logdist 1.1471 0.0346 33.15 5.896e-19
## logGrad 0.4658 0.0453 10.28 1.981e-09
## Footnote Code
## Correlations of logGrad and logclimb with logdist
with(lognihills, cor(cbind(logGrad, logclimb), logdist))
## [,1]
## logGrad -0.06529
## logclimb 0.78007
## ss 6.2.2: Plots that show the contribution of individual terms
yterms <- predict(nihills.lm, type="terms")
lognihills <- log(nihills)
names(lognihills) <- paste("log", names(nihills), sep="")
nihills.lm <- lm(logtime ~ logdist + logclimb, data = lognihills)
## To show points, specify partial.resid=TRUE
## For a smooth curve, specify smooth=panel.smooth
par(mfrow=c(1,2), pty="s")
termplot(nihills.lm, partial.resid=TRUE, smooth=panel.smooth,
col.res="gray30")

par(mfrow=c(1,1), pty="m")
## ss 6.2.3: Mouse brain weight example
## Footnote Code
## Scatterplot matrix for data frame litters (DAAG); labels as in figure
library(lattice)
splom(~litters, varnames=c("lsize\n\n(litter size)", "bodywt\n\n(Body Weight)",
"brainwt\n\n(Brain Weight)"))

par(pty="s")
pairs(litters) # For improved labeling, see the footnote.

par(pty="m")
## Regression of brainwt on lsize
summary(lm(brainwt ~ lsize, data = litters))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.447000 0.009625 46.443 3.391e-20
## lsize -0.004033 0.001198 -3.366 3.445e-03
## Regression of brainwt on lsize and bodywt
summary(lm(brainwt ~ lsize + bodywt, data = litters))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.17825 0.075323 2.366 0.030097
## lsize 0.00669 0.003132 2.136 0.047513
## bodywt 0.02431 0.006779 3.586 0.002278
## ss 6.2.4: Book dimensions, density and book weight
## Footnote Code
## Code for Panel A
splom(~log(oddbooks), varnames=c("thick\n\nlog(mm)", "breadth\n\nlog(cm)",
"height\n\nlog(cm)", "weight\n\nlog(g)"), pscales=0)

## Code for Panel B
oddothers <-
with(oddbooks,
data.frame(density = weight/(breadth*height*thick),
area = breadth*height, thick=thick, weight=weight))
splom(~log(oddothers), pscales=0,
varnames=c("log(density)", "log(area)", "log(thick)", "log(weight)"))

## Footnote Code
## Details of calculations
volume <- apply(oddbooks[, 1:3], 1, prod)
area <- apply(oddbooks[, 2:3], 1, prod)
lob1.lm <- lm(log(weight) ~ log(volume), data=oddbooks)
lob2.lm <- lm(log(weight) ~ log(thick)+log(area), data=oddbooks)
lob3.lm <- lm(log(weight) ~ log(thick)+log(breadth)+log(height), data=oddbooks)
coef(summary(lob1.lm))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.942 2.731 -3.274 0.008372
## log(volume) 1.697 0.305 5.562 0.000240
## Similarly for coefficients and SEs for other models
coef(summary(lm(log(weight) ~ log(thick), data=oddbooks)))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.692 0.7076 13.697 8.345e-08
## log(thick) -1.073 0.2190 -4.897 6.263e-04
book.density <- oddbooks$weight/volume
bookDensity.lm <- lm(log(book.density) ~ log(area), data=oddbooks)
coef(summary(bookDensity.lm))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.1091 0.55138 -9.266 3.182e-06
## log(area) 0.4187 0.09576 4.372 1.394e-03
## Sec 6.3: Multiple Regression Assumptions, Diagnostics and Efficacy Measures
## ss 6.3.1: Outliers, leverage, influence and Cook's distance
## Detection of outliers
## $^{*}$ Leverage and the hat matrix
as.vector(hatvalues(nihills.lm)) # as.vector() strips off names
## [1] 0.11927 0.07420 0.13351 0.10948 0.08532 0.14633 0.05176 0.14617
## [9] 0.24651 0.21781 0.09012 0.05667 0.05621 0.12748 0.04945 0.10281
## [17] 0.17548 0.21393 0.44414 0.09012 0.13870 0.04801 0.07650
## Residuals versus leverages
par(pty="s")
plot(nihills.lm, which=5, add.smooth=FALSE)

par(pty="m")
## The points can alternatively be plotted using
## plot(hatvalues(model.matrix(nihills.lm)), residuals(nihills.lm))
## Influential points and Cook's distance
## Influence on the regression coefficients
## Outliers, influential or not, should be taken seriously
## \!\!$^{*}$Additional diagnostic plots
library(car)
##
## Attaching package: 'car'
##
## The following object is masked from 'package:DAAG':
##
## vif
influencePlot(allbacks.lm)

## StudRes Hat CookD
## 11 -1.877 0.3833 0.7765
## 13 5.386 0.1411 0.6903
## ss 6.3.2: Assessment and Comparison of Regression Models
## $R^2$ and adjusted $R^2$
## AIC and related statistics
## Calculation of\, { R}'s version of the AIC
## ss 6.3.3: How accurately does the equation predict?
lognihills <- log(nihills)
names(lognihills) <- paste("log", names(nihills), sep="")
nihills.lm <- lm(logtime ~ logdist + logclimb, data = lognihills)
exp(predict(nihills.lm, interval="confidence"))[1:5, ]
## fit lwr upr
## Binevenagh 0.8932 0.8452 0.9439
## Slieve Gullion 0.4880 0.4672 0.5097
## Glenariff Mountain 0.6404 0.6041 0.6789
## Donard & Commedagh 1.1257 1.0677 1.1868
## McVeigh Classic 0.5699 0.5439 0.5971
## Footnote Code
## Model that is quadratic in logdist
nihills2.lm <- lm(logtime ~ poly(logdist,2) + logclimb, data = lognihills)
citimes2 <- exp(predict(nihills2.lm, interval="confidence"))
## Sec 6.4: A Strategy for Fitting Multiple Regression Models
## Why are linear relationships between explanatory variablespreferable?
## ss 6.4.1: Suggested steps
## ss 6.4.2: Diagnostic checks
## ss 6.4.3: An example -- Scottish hill race data
## Footnote Code
## Scatterplot matrix: data frame hills2000 (DAAG), log scales
splom(~ log(hills2000[, c("dist","climb","time")]), cex.labels=1.2,
varnames=c("dist\n(log miles)", "climb\n(log feet)", "time\n(log hours)"))

## Panel A
lhills2k.lm <- lm(log(time) ~ log(climb) + log(dist),
data = hills2000)
plot(lhills2k.lm, caption="", which=1)

## Panel B
library(MASS) # lqs() is in the MASS package
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:DAAG':
##
## hills
lhills2k.lqs <- lqs(log(time) ~ log(climb) + log(dist),
data = hills2000)
reres <- residuals(lhills2k.lqs)
refit <- fitted(lhills2k.lqs)
big3 <- which(abs(reres) >= sort(abs(reres), decreasing=TRUE)[3])
plot(reres ~ refit, xlab="Fitted values (resistant fit)",
ylab="Residuals (resistant fit)")
lines(lowess(reres ~ refit), col=2)
text(reres[big3] ~ refit[big3], labels=rownames(hills2000)[big3],
pos=4-2*(refit[big3] > mean(refit)), cex=0.8)

plot(lhills2k.lm, which=2)

## Footnote Code
## Probability of a value <= -3 or >= 3
2 * pnorm(-3)
## [1] 0.0027
## The contribution of the separate terms
## Create data frame that has logs of time, dist & climb
lhills2k <- log(hills2000[, c("dist", "climb", "time")])
names(lhills2k) <- paste("log", names(lhills2k), sep="")
lhills2k.lm <- lm(logtime ~ logdist+logclimb, data=lhills2k)
termplot(lhills2k.lm, partial.resid=TRUE, smooth=panel.smooth,
col.res="gray30")


## A resistant fit that has a polynomial term in {logdist}
reres2 <- residuals(lqs(logtime ~ poly(logdist,2)+logclimb, data=lhills2k))
reres2[order(abs(reres2), decreasing=TRUE)[1:4]]
## Caerketton Beinn Lee Yetholm Goatfell
## -0.4170 0.2751 0.2004 0.1886
## Refining the model
add1(lhills2k.lm, ~ logdist+I(logdist^2)+logclimb+logdist:logclimb,
test="F")
## Single term additions
##
## Model:
## logtime ~ logdist + logclimb
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 0.772 -234
## I(logdist^2) 1 0.1668 0.605 -246 14.33 0.0004
## logdist:logclimb 1 0.0634 0.709 -237 4.65 0.0357
lhills2k.lm2 <- lm(logtime ~ poly(logdist,2)+logclimb, data=lhills2k[-42, ])
plot(lhills2k.lm2)




## Footnote Code
## Check addition of interaction term
add1(lhills2k.lm2, ~ poly(logdist,2)+logclimb+logdist:logclimb)
## Single term additions
##
## Model:
## logtime ~ poly(logdist, 2) + logclimb
## Df Sum of Sq RSS AIC
## <none> 0.429 -259
## logclimb:logdist 1 0.012 0.417 -258
## The model without the interaction term
lhills2k.lm <- lm(logtime ~ logdist+logclimb, data=lhills2k[-42, ])
summary(lhills2k.lm)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.0229 0.18651 -21.57 1.332e-27
## logdist 0.7785 0.03349 23.25 3.885e-29
## logclimb 0.3169 0.03019 10.50 1.898e-14
## Are "errors in x" an issue?
## What happens if we do not transform?
## Sec 6.5: Problems with Many Explanatory Variables
## ss 6.5.1: Variable selection issues
## Variable selection -- a simulation with random data
## Footnote Code
## Generate a 100 by 40 matrix of random normal data
y <- rnorm(100)
xx <- matrix(rnorm(4000), ncol = 40)
dimnames(xx)<- list(NULL, paste("X",1:40, sep=""))
## Footnote Code
## Find the best fitting model
library(leaps)
xx.subsets <- regsubsets(xx, y, method = "exhaustive", nvmax = 3, nbest = 1)
subvar <- summary(xx.subsets)$which[3,-1]
best3.lm <- lm(y ~ -1+xx[, subvar])
print(summary(best3.lm, corr = FALSE))
##
## Call:
## lm(formula = y ~ -1 + xx[, subvar])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1537 -0.6167 0.0293 0.8082 2.4697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## xx[, subvar]X5 -0.2737 0.0995 -2.75 0.0071
## xx[, subvar]X6 -0.1893 0.1065 -1.78 0.0786
## xx[, subvar]X12 -0.2300 0.1093 -2.10 0.0379
##
## Residual standard error: 0.97 on 97 degrees of freedom
## Multiple R-squared: 0.119, Adjusted R-squared: 0.0915
## F-statistic: 4.36 on 3 and 97 DF, p-value: 0.00635
## The following call to bestsetNoise() (from DAAG) achieves the same purpose
bestsetNoise(m=100, n=40)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7596 -0.4958 -0.0494 0.5621 1.6118
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1204 0.0784 -1.54 0.128
## xV12 -0.1619 0.0691 -2.34 0.021
## xV19 0.1972 0.0780 2.53 0.013
## xV25 0.1778 0.0811 2.19 0.031
##
## Residual standard error: 0.76 on 96 degrees of freedom
## Multiple R-squared: 0.145, Adjusted R-squared: 0.118
## F-statistic: 5.42 on 3 and 96 DF, p-value: 0.00173
## Cross-validation that accounts for the variableselection process
## Regression on principal components
## Sec 6.6: Multicollinearity
## An example -- compositional data
library(compositions)
## Loading required package: tensorA
##
## Attaching package: 'tensorA'
##
## The following object is masked from 'package:base':
##
## norm
##
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
##
## The following object is masked from 'package:DAAG':
##
## milk
##
## Loading required package: energy
## Loading required package: bayesm
## Welcome to compositions, a package for compositional data analysis.
## Find an intro with "? compositions"
##
##
## Attaching package: 'compositions'
##
## The following objects are masked from 'package:stats':
##
## cor, cov, dist, var
##
## The following objects are masked from 'package:base':
##
## %*%, scale, scale.default
data(Coxite) # For pkg compositions, needed to access data
coxite <- as.data.frame(Coxite) # From matrix, create data frame
## Scatterplot matrix for data frame coxite
par(pty="s")
pairs(coxite)

par(pty="m")
coxiteAll.lm <- lm(porosity ~ A+B+C+D+E+depth, data=coxite)
summary(coxiteAll.lm)
##
## Call:
## lm(formula = porosity ~ A + B + C + D + E + depth, data = coxite)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9304 -0.4698 0.0242 0.3522 1.1822
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -217.7466 253.4439 -0.86 0.40
## A 2.6486 2.4825 1.07 0.30
## B 2.1915 2.6015 0.84 0.41
## C 0.2113 2.2271 0.09 0.93
## D 4.9492 4.6720 1.06 0.30
## E NA NA NA NA
## depth 0.0145 0.0333 0.44 0.67
##
## Residual standard error: 0.649 on 19 degrees of freedom
## Multiple R-squared: 0.936, Adjusted R-squared: 0.919
## F-statistic: 55.1 on 5 and 19 DF, p-value: 1.18e-10
hat <- predict(coxiteAll.lm, interval="confidence", level=0.95)
## ss 6.6.1: The variance inflation factor (VIF)
vif(lm(porosity ~ A+B+C+D+depth, data=coxite))
## A B C D depth
## 2717.815 2484.976 192.589 566.144 3.417
cor(coxite$porosity, coxite)
## A B C D E depth porosity
## [1,] 0.869 -0.5511 -0.7233 -0.3199 -0.4076 -0.1468 1
summary(coxiteABC.lm <- lm(porosity ~ A+B+C, data=coxite))
##
## Call:
## lm(formula = porosity ~ A + B + C, data = coxite)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9814 -0.3745 0.0229 0.4174 1.2727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.3046 13.2219 4.03 0.00060
## A -0.0125 0.1558 -0.08 0.93700
## B -0.5867 0.1513 -3.88 0.00087
## C -2.2188 0.3389 -6.55 1.7e-06
##
## Residual standard error: 0.642 on 21 degrees of freedom
## Multiple R-squared: 0.93, Adjusted R-squared: 0.92
## F-statistic: 93.3 on 3 and 21 DF, p-value: 2.64e-12
vif(coxiteABC.lm)
## A B C
## 10.936 8.592 4.555
summary(coxiteBC.lm <- lm(porosity ~ B+C, data=coxite))
##
## Call:
## lm(formula = porosity ~ B + C, data = coxite)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9835 -0.3785 -0.0035 0.4178 1.2645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.2571 1.7831 29.3 < 2e-16
## B -0.5753 0.0508 -11.3 1.2e-10
## C -2.1949 0.1562 -14.1 1.8e-12
##
## Residual standard error: 0.628 on 22 degrees of freedom
## Multiple R-squared: 0.93, Adjusted R-squared: 0.924
## F-statistic: 147 on 2 and 22 DF, p-value: 1.91e-13
vif(coxiteBC.lm)
## B C
## 1.013 1.013
AIC(coxiteAll.lm, coxiteBC.lm)
## df AIC
## coxiteAll.lm 7 56.50
## coxiteBC.lm 4 52.47
## Numbers that do not quite add up
coxiteR <- coxite
coxiteR[, 1:5] <- round(coxiteR[, 1:5])
summary(lm(porosity ~ ., data=coxiteR))
##
## Call:
## lm(formula = porosity ~ ., data = coxiteR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7056 -0.3369 0.0598 0.4361 0.9831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.4251 23.4043 -0.06 0.95212
## A 0.5597 0.2457 2.28 0.03515
## B 0.0157 0.2403 0.07 0.94865
## C -1.3180 0.2940 -4.48 0.00029
## D 0.9748 0.4222 2.31 0.03305
## E -0.5530 0.5242 -1.05 0.30543
## depth -0.0149 0.0223 -0.67 0.51310
##
## Residual standard error: 0.708 on 18 degrees of freedom
## Multiple R-squared: 0.927, Adjusted R-squared: 0.903
## F-statistic: 38.3 on 6 and 18 DF, p-value: 2.69e-09
vif(lm(porosity ~ .-E, data=coxiteR))
## A B C D depth
## 16.964 16.487 3.281 4.661 1.252
## ss 6.6.2: Remedies for multicollinearity
## Sec 6.7: Errors in $x$
## Measurement of dietary intake
## Simulations of the effect of measurement error
## *Two explanatory variables
## Two explanatory variables, one measured without error -- a simulation
errorsINseveral()
## Intercept b1 b2 SE(Int) SE(b1) SE(b2)
## Values for simulation 2.481 1.500 0.000 NA NA NA
## Estimates: no error in x1 2.309 1.508 0.006 0.193 0.009 0.009
## LS Estimates: error in x1 23.155 0.402 -0.563 0.591 0.021 0.037
## Theoretical attenuation of b1 Theoretical b2
## 0.2500 -0.5625

## An arbitrary number ot variables
## *The classical error model versus the Berkson error model
## Sec 6.8: Multiple Regression Models -- Additional Points
## ss 6.8.1: Confusion between explanatory and response variables
coef(lm(area ~ volume + weight, data=allbacks))
## (Intercept) volume weight
## 35.4587 -0.9636 1.3611
b <- as.vector(coef(lm(weight ~ volume + area, data=allbacks)))
c("_Intercept_"=-b[1]/b[3], volume=-b[2]/b[3], weight=1/b[3])
## _Intercept_ volume weight
## -47.847 -1.512 2.135
## Unintended correlations
## ss 6.8.2: Missing explanatory variables
## Strategies
## ss 6.8.3: \!\!$^{*}$ The use of transformations
## ss 6.8.4: \!\!$^{*}$ Non-linear methods -- an alternative to transformation?
nihills$climb.mi <- nihills$climb/5280
nihills.nls0 <- nls(time ~ (dist^alpha)*(climb.mi^beta), start =
c(alpha = 0.68, beta = 0.465), data = nihills)
plot(residuals(nihills.nls0) ~ log(predict(nihills.nls0)))

nihills.nls <- nls(time ~ gamma + delta1*dist^alpha +
delta2*climb.mi^beta,
start=c(gamma = .045, delta1 = .09, alpha = 1,
delta2=.9, beta = 1.65), data=nihills)
plot(residuals(nihills.nls) ~ log(predict(nihills.nls)))

## Sec 6.9: Recap
## Sec 6.10: Further Reading
## Set up factor that identifies the `have' cities
## Data frame cities (DAAG)
cities$have <- factor((cities$REGION=="ON")|
(cities$REGION=="WEST"))
par(pty="s")
plot(POP1996 ~ POP1992, data=cities,
col=as.integer(cities$have))

plot(log(POP1996) ~ log(POP1992), data=cities,
col=as.integer(cities$have))

par(pty="m")
cities.lm1 <- lm(POP1996 ~ have+POP1992, data=cities)
cities.lm2 <- lm(log(POP1996) ~ have+log(POP1992),
data=cities)
nihills.lm <- lm(time ~ dist+climb, data=nihills)
nihills2.lm <- lm(time ~ dist+climb+dist:climb, data=nihills)
anova(nihills.lm, nihills2.lm)
## Analysis of Variance Table
##
## Model 1: time ~ dist + climb
## Model 2: time ~ dist + climb + dist:climb
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 20 0.1894
## 2 19 0.0394 1 0.15 72.4 6.6e-08
x1 <- runif(10) # predictor which will be missing
x2 <- rbinom(10, 1, 1-x1) # observed predictor which depends
# on missing predictor
y <- 5*x1 + x2 + rnorm(10,sd=.1) # simulated model; coef
# of x2 is positive
y.lm <- lm(y ~ factor(x2)) # model fitted to observed data
coef(y.lm)
## (Intercept) factor(x2)1
## 3.9609 -0.8824
y.lm2 <- lm(y ~ x1 + factor(x2)) # correct model
coef(y.lm2)
## (Intercept) x1 factor(x2)1
## -0.08243 5.04536 1.15339
library(MASS)
nraw.lqs <- lqs(northRain ~ SOI + CO2, data=bomregions)
north.lqs <- lqs(I(northRain^(1/3)) ~ SOI + CO2, data=bomregions)
par(mfrow=c(2,1))
plot(residuals(nraw.lqs) ~ Year, data=bomregions)
plot(residuals(north.lqs) ~ Year, data=bomregions)

par(mfrow=c(1,1))