multilr.R

## Chapter 6: Multiple Linear Regression 

## Sec 6.1: Basic Ideas: a Book Weight Example 
## Plot weight vs volume: data frame allbacks (DAAG) 
par(pty="s")
library(DAAG)

## Loading required package: lattice

plot(weight ~ volume, data=allbacks, pch=c(16,1)[unclass(cover)]) 
 # unclass(cover) gives the integer codes that identify levels 
with(allbacks, text(weight ~ volume, labels=paste(1:15),  
                    pos=c(2,4)[unclass(cover)]))

plot of chunk unnamed-chunk-1

summary(allbacks.lm <- lm(weight ~ volume+area, data=allbacks))

## 
## Call:
## lm(formula = weight ~ volume + area, data = allbacks)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -104.1  -30.0  -15.5   16.8  212.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  22.4134    58.4025    0.38  0.70786
## volume        0.7082     0.0611   11.60  7.1e-08
## area          0.4684     0.1019    4.59  0.00062
## 
## Residual standard error: 77.7 on 12 degrees of freedom
## Multiple R-squared:  0.928,  Adjusted R-squared:  0.917 
## F-statistic: 77.9 on 2 and 12 DF,  p-value: 1.34e-07

## coefficient estimates and SEs only: summary(allbacks.lm)$coef 
par(pty="m")

## Footnote Code
## 5% critical value; t-statistic with 12 d.f.  
qt(0.975, 12)

## [1] 2.179

anova(allbacks.lm)

## Analysis of Variance Table
## 
## Response: weight
##           Df Sum Sq Mean Sq F value  Pr(>F)
## volume     1 812132  812132   134.7   7e-08
## area       1 127328  127328    21.1 0.00062
## Residuals 12  72373    6031

## Footnote Code
## Correlation of volume with area  
with(allbacks, cor(volume,area))

## [1] 0.001535

model.matrix(allbacks.lm)

##    (Intercept) volume area
## 1            1    885  382
## 2            1   1016  468
## 3            1   1125  387
## 4            1    239  371
## 5            1    701  371
## 6            1    641  367
## 7            1   1228  396
## 8            1    412    0
## 9            1    953    0
## 10           1    929    0
## 11           1   1492    0
## 12           1    419    0
## 13           1   1010    0
## 14           1    595    0
## 15           1   1034    0
## attr(,"assign")
## [1] 0 1 2

## ss 6.1.1: Omission of the intercept term 
allbacks.lm0 <- lm(weight ~ -1+volume+area, data=allbacks) 
summary(allbacks.lm0)

## 
## Call:
## lm(formula = weight ~ -1 + volume + area, data = allbacks)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -112.5  -28.7  -10.5   24.6  213.8 
## 
## Coefficients:
##        Estimate Std. Error t value Pr(>|t|)
## volume   0.7289     0.0277   26.34  1.1e-12
## area     0.4809     0.0934    5.15  0.00019
## 
## Residual standard error: 75.1 on 13 degrees of freedom
## Multiple R-squared:  0.991,  Adjusted R-squared:  0.99 
## F-statistic:  748 on 2 and 13 DF,  p-value: 3.8e-14

## Display correlations between estimates of model coefficients 
summary(allbacks.lm, corr=TRUE)

## 
## Call:
## lm(formula = weight ~ volume + area, data = allbacks)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -104.1  -30.0  -15.5   16.8  212.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  22.4134    58.4025    0.38  0.70786
## volume        0.7082     0.0611   11.60  7.1e-08
## area          0.4684     0.1019    4.59  0.00062
## 
## Residual standard error: 77.7 on 12 degrees of freedom
## Multiple R-squared:  0.928,  Adjusted R-squared:  0.917 
## F-statistic: 77.9 on 2 and 12 DF,  p-value: 1.34e-07
## 
## Correlation of Coefficients:
##        (Intercept) volume
## volume -0.88             
## area   -0.32        0.00

## ss 6.1.2: Diagnostic plots 
par(mfrow=c(2,2), pty="s")    # Get all 4 plots on one page 
plot(allbacks.lm0)

plot of chunk unnamed-chunk-1

par(mfrow=c(1,1), pty="m") 

allbacks.lm13 <- lm(weight ~ -1+volume+area, data=allbacks[-13, ])  
summary(allbacks.lm13)

## 
## Call:
## lm(formula = weight ~ -1 + volume + area, data = allbacks[-13, 
##     ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -61.72 -25.29   3.43  31.24  58.86 
## 
## Coefficients:
##        Estimate Std. Error t value Pr(>|t|)
## volume   0.6949     0.0163    42.6  1.8e-14
## area     0.5539     0.0527    10.5  2.1e-07
## 
## Residual standard error: 41 on 12 degrees of freedom
## Multiple R-squared:  0.997,  Adjusted R-squared:  0.997 
## F-statistic: 2.25e+03 on 2 and 12 DF,  p-value: 3.52e-16

## Sec 6.2: The Interpretation of Model Coefficients 
## ss 6.2.1: Times for Northern Irish hill races 
## Footnote Code
## : data frame nihills (DAAG) 
## Panel A: Scatterplot matrix, untransformed data, data frame nihills (DAAG) 
library(lattice); library(DAAG) 
splom(~ nihills[, c("dist","climb","time")], cex.labels=1.2, 
      varnames=c("dist\n(miles)","climb\n(feet)", "time\n(hours)"))

plot of chunk unnamed-chunk-1

## Panel B: log transformed data 
splom(~ log(nihills[, c("dist","climb","time")]), cex.labels=1.2, 
      varnames=c("dist\n(log miles)", "climb\n(log feet)", "time\n(log hours)"))

plot of chunk unnamed-chunk-1

nihills.lm <- lm(log(time) ~ log(dist) + log(climb), data = nihills) 
par(mfrow=c(2,2), pty="s") 
plot(nihills.lm)

plot of chunk unnamed-chunk-1

par(mfrow=c(1,1), pty="m") 

summary(nihills.lm)$coef

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)  -4.9611    0.27387  -18.11 7.085e-14
## log(dist)     0.6814    0.05518   12.35 8.186e-11
## log(climb)    0.4658    0.04530   10.28 1.981e-09

##                   Interpreting the coefficients 
##               A meaningful coefficient for {logdist} 
lognihills <- log(nihills) 
names(lognihills) <- paste("log", names(nihills), sep="") 
lognihills$logGrad <- with(nihills, log(climb/dist)) 
nihillsG.lm <- lm(logtime ~ logdist + logGrad, data=lognihills) 
summary(nihillsG.lm)$coef

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)  -4.9611     0.2739  -18.11 7.085e-14
## logdist       1.1471     0.0346   33.15 5.896e-19
## logGrad       0.4658     0.0453   10.28 1.981e-09

## Footnote Code
## Correlations of logGrad and logclimb with logdist 
with(lognihills, cor(cbind(logGrad, logclimb), logdist))

##              [,1]
## logGrad  -0.06529
## logclimb  0.78007

## ss 6.2.2: Plots that show the contribution of individual terms 
yterms <- predict(nihills.lm, type="terms")  

lognihills <- log(nihills) 
names(lognihills) <- paste("log", names(nihills), sep="") 
nihills.lm <- lm(logtime ~ logdist + logclimb, data = lognihills) 
## To show points, specify partial.resid=TRUE 
## For a smooth curve, specify smooth=panel.smooth
par(mfrow=c(1,2), pty="s") 
termplot(nihills.lm, partial.resid=TRUE, smooth=panel.smooth,  
         col.res="gray30")

plot of chunk unnamed-chunk-1

par(mfrow=c(1,1), pty="m") 

## ss 6.2.3: Mouse brain weight example 
## Footnote Code
## Scatterplot matrix for data frame litters (DAAG); labels as in figure 
library(lattice) 
splom(~litters, varnames=c("lsize\n\n(litter size)", "bodywt\n\n(Body Weight)", 
      "brainwt\n\n(Brain Weight)"))

plot of chunk unnamed-chunk-1

par(pty="s") 
pairs(litters)       # For improved labeling, see the footnote.

plot of chunk unnamed-chunk-1

par(pty="m") 

## Regression of brainwt on lsize 
summary(lm(brainwt ~ lsize, data = litters))$coef

##              Estimate Std. Error t value  Pr(>|t|)
## (Intercept)  0.447000   0.009625  46.443 3.391e-20
## lsize       -0.004033   0.001198  -3.366 3.445e-03

## Regression of brainwt on lsize and bodywt 
summary(lm(brainwt ~ lsize + bodywt, data = litters))$coef

##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.17825   0.075323   2.366 0.030097
## lsize        0.00669   0.003132   2.136 0.047513
## bodywt       0.02431   0.006779   3.586 0.002278

## ss 6.2.4: Book dimensions, density and book weight 
## Footnote Code
## Code for Panel A 
splom(~log(oddbooks), varnames=c("thick\n\nlog(mm)", "breadth\n\nlog(cm)", 
      "height\n\nlog(cm)", "weight\n\nlog(g)"), pscales=0)

plot of chunk unnamed-chunk-1

## Code for Panel B 
oddothers <-  
  with(oddbooks, 
       data.frame(density = weight/(breadth*height*thick),  
                  area = breadth*height, thick=thick, weight=weight)) 
splom(~log(oddothers), pscales=0, 
      varnames=c("log(density)", "log(area)", "log(thick)", "log(weight)"))

plot of chunk unnamed-chunk-1

## Footnote Code
## Details of calculations 
volume <- apply(oddbooks[, 1:3], 1, prod)              
area <- apply(oddbooks[, 2:3], 1, prod)                
lob1.lm <- lm(log(weight) ~ log(volume), data=oddbooks)      
lob2.lm <- lm(log(weight) ~ log(thick)+log(area), data=oddbooks) 
lob3.lm <- lm(log(weight) ~ log(thick)+log(breadth)+log(height), data=oddbooks) 
coef(summary(lob1.lm))

##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -8.942      2.731  -3.274 0.008372
## log(volume)    1.697      0.305   5.562 0.000240

## Similarly for coefficients and SEs for other models 

coef(summary(lm(log(weight) ~ log(thick), data=oddbooks)))

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)    9.692     0.7076  13.697 8.345e-08
## log(thick)    -1.073     0.2190  -4.897 6.263e-04

book.density <- oddbooks$weight/volume 
bookDensity.lm <- lm(log(book.density) ~ log(area), data=oddbooks) 
coef(summary(bookDensity.lm))

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)  -5.1091    0.55138  -9.266 3.182e-06
## log(area)     0.4187    0.09576   4.372 1.394e-03

## Sec 6.3: Multiple Regression Assumptions, Diagnostics and Efficacy Measures 
## ss 6.3.1: Outliers, leverage, influence and Cook's distance 
##                       Detection of outliers 
##                 $^{*}$ Leverage and the hat matrix 
as.vector(hatvalues(nihills.lm))  # as.vector() strips off names

##  [1] 0.11927 0.07420 0.13351 0.10948 0.08532 0.14633 0.05176 0.14617
##  [9] 0.24651 0.21781 0.09012 0.05667 0.05621 0.12748 0.04945 0.10281
## [17] 0.17548 0.21393 0.44414 0.09012 0.13870 0.04801 0.07650

## Residuals versus leverages 
par(pty="s") 
plot(nihills.lm, which=5, add.smooth=FALSE)

plot of chunk unnamed-chunk-1

par(pty="m") 
## The points can alternatively be plotted using 
## plot(hatvalues(model.matrix(nihills.lm)), residuals(nihills.lm)) 

##               Influential points and Cook's distance 
##              Influence on the regression coefficients 
##      Outliers, influential or not, should be taken seriously 
##               \!\!$^{*}$Additional diagnostic plots 
library(car)

## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:DAAG':
## 
##     vif

influencePlot(allbacks.lm)

plot of chunk unnamed-chunk-1

##    StudRes    Hat  CookD
## 11  -1.877 0.3833 0.7765
## 13   5.386 0.1411 0.6903

## ss 6.3.2: Assessment and Comparison of Regression Models 
##                      $R^2$ and adjusted $R^2$ 
##                     AIC and related statistics 
##             Calculation of\, { R}'s version of the AIC 
## ss 6.3.3: How accurately does the equation predict? 
lognihills <- log(nihills) 
names(lognihills) <- paste("log", names(nihills), sep="") 
nihills.lm <- lm(logtime ~ logdist + logclimb, data = lognihills) 
exp(predict(nihills.lm, interval="confidence"))[1:5, ]

##                       fit    lwr    upr
## Binevenagh         0.8932 0.8452 0.9439
## Slieve Gullion     0.4880 0.4672 0.5097
## Glenariff Mountain 0.6404 0.6041 0.6789
## Donard & Commedagh 1.1257 1.0677 1.1868
## McVeigh Classic    0.5699 0.5439 0.5971

## Footnote Code
## Model that is quadratic in logdist  
nihills2.lm <- lm(logtime ~ poly(logdist,2) + logclimb, data = lognihills)  
citimes2 <- exp(predict(nihills2.lm, interval="confidence")) 


## Sec 6.4: A Strategy for Fitting Multiple Regression Models 
##  Why are linear relationships between explanatory variablespreferable? 
## ss 6.4.1: Suggested steps 
## ss 6.4.2: Diagnostic checks 
## ss 6.4.3: An example -- Scottish hill race data 
## Footnote Code
## Scatterplot matrix: data frame hills2000 (DAAG), log scales 
splom(~ log(hills2000[, c("dist","climb","time")]), cex.labels=1.2, 
      varnames=c("dist\n(log miles)", "climb\n(log feet)", "time\n(log hours)"))

plot of chunk unnamed-chunk-1

## Panel A 
lhills2k.lm <- lm(log(time) ~ log(climb) + log(dist), 
                  data = hills2000) 
plot(lhills2k.lm, caption="", which=1)

plot of chunk unnamed-chunk-1

## Panel B 
library(MASS)        # lqs() is in the MASS package

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:DAAG':
## 
##     hills

lhills2k.lqs <- lqs(log(time) ~ log(climb) + log(dist),  
                    data = hills2000) 
reres <- residuals(lhills2k.lqs) 
refit <- fitted(lhills2k.lqs) 
big3 <- which(abs(reres) >= sort(abs(reres), decreasing=TRUE)[3]) 
plot(reres ~ refit, xlab="Fitted values (resistant fit)", 
     ylab="Residuals (resistant fit)") 
lines(lowess(reres ~ refit), col=2) 
text(reres[big3] ~ refit[big3], labels=rownames(hills2000)[big3], 
     pos=4-2*(refit[big3] > mean(refit)), cex=0.8)

plot of chunk unnamed-chunk-1

plot(lhills2k.lm, which=2)

plot of chunk unnamed-chunk-1

## Footnote Code
## Probability of a value <= -3 or >= 3  
2 * pnorm(-3)

## [1] 0.0027

##               The contribution of the separate terms 
## Create data frame that has logs of time, dist & climb 
lhills2k <- log(hills2000[, c("dist", "climb", "time")]) 
names(lhills2k) <- paste("log", names(lhills2k), sep="") 
lhills2k.lm <- lm(logtime ~ logdist+logclimb, data=lhills2k) 
termplot(lhills2k.lm, partial.resid=TRUE, smooth=panel.smooth,  
         col.res="gray30")

plot of chunk unnamed-chunk-1

##      A resistant fit that has a polynomial term in {logdist} 
reres2 <- residuals(lqs(logtime ~ poly(logdist,2)+logclimb, data=lhills2k)) 
reres2[order(abs(reres2), decreasing=TRUE)[1:4]]

## Caerketton  Beinn Lee    Yetholm   Goatfell 
##    -0.4170     0.2751     0.2004     0.1886

##                         Refining the model 
add1(lhills2k.lm, ~ logdist+I(logdist^2)+logclimb+logdist:logclimb,  
     test="F")

## Single term additions
## 
## Model:
## logtime ~ logdist + logclimb
##                  Df Sum of Sq   RSS  AIC F value Pr(>F)
## <none>                        0.772 -234               
## I(logdist^2)      1    0.1668 0.605 -246   14.33 0.0004
## logdist:logclimb  1    0.0634 0.709 -237    4.65 0.0357

lhills2k.lm2 <- lm(logtime ~ poly(logdist,2)+logclimb, data=lhills2k[-42, ]) 
plot(lhills2k.lm2)

plot of chunk unnamed-chunk-1

## Footnote Code
## Check addition of interaction term 
add1(lhills2k.lm2, ~ poly(logdist,2)+logclimb+logdist:logclimb)

## Single term additions
## 
## Model:
## logtime ~ poly(logdist, 2) + logclimb
##                  Df Sum of Sq   RSS  AIC
## <none>                        0.429 -259
## logclimb:logdist  1     0.012 0.417 -258

##               The model without the interaction term 
lhills2k.lm <- lm(logtime ~ logdist+logclimb, data=lhills2k[-42, ]) 
summary(lhills2k.lm)$coef

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)  -4.0229    0.18651  -21.57 1.332e-27
## logdist       0.7785    0.03349   23.25 3.885e-29
## logclimb      0.3169    0.03019   10.50 1.898e-14

##                    Are "errors in x" an issue? 
##                What happens if we do not transform? 

## Sec 6.5: Problems with Many Explanatory Variables 
## ss 6.5.1: Variable selection issues 
##        Variable selection -- a simulation with random data 
## Footnote Code
## Generate a 100 by 40 matrix of random normal data 
y <- rnorm(100) 
xx <- matrix(rnorm(4000), ncol = 40) 
dimnames(xx)<- list(NULL, paste("X",1:40, sep="")) 

## Footnote Code
## Find the best fitting model 
library(leaps) 
xx.subsets <- regsubsets(xx, y, method = "exhaustive", nvmax = 3, nbest = 1) 
subvar <- summary(xx.subsets)$which[3,-1] 
best3.lm <- lm(y ~ -1+xx[, subvar]) 
print(summary(best3.lm, corr = FALSE))

## 
## Call:
## lm(formula = y ~ -1 + xx[, subvar])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1537 -0.6167  0.0293  0.8082  2.4697 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)
## xx[, subvar]X5   -0.2737     0.0995   -2.75   0.0071
## xx[, subvar]X6   -0.1893     0.1065   -1.78   0.0786
## xx[, subvar]X12  -0.2300     0.1093   -2.10   0.0379
## 
## Residual standard error: 0.97 on 97 degrees of freedom
## Multiple R-squared:  0.119,  Adjusted R-squared:  0.0915 
## F-statistic: 4.36 on 3 and 97 DF,  p-value: 0.00635

## The following call to bestsetNoise() (from DAAG) achieves the same purpose 
bestsetNoise(m=100, n=40)

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7596 -0.4958 -0.0494  0.5621  1.6118 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.1204     0.0784   -1.54    0.128
## xV12         -0.1619     0.0691   -2.34    0.021
## xV19          0.1972     0.0780    2.53    0.013
## xV25          0.1778     0.0811    2.19    0.031
## 
## Residual standard error: 0.76 on 96 degrees of freedom
## Multiple R-squared:  0.145,  Adjusted R-squared:  0.118 
## F-statistic: 5.42 on 3 and 96 DF,  p-value: 0.00173

##  Cross-validation that accounts for the variableselection process 
##                 Regression on principal components 

## Sec 6.6: Multicollinearity 
##                  An example -- compositional data 
library(compositions)

## Loading required package: tensorA
## 
## Attaching package: 'tensorA'
## 
## The following object is masked from 'package:base':
## 
##     norm
## 
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## 
## The following object is masked from 'package:DAAG':
## 
##     milk
## 
## Loading required package: energy
## Loading required package: bayesm
## Welcome to compositions, a package for compositional data analysis.
## Find an intro with "? compositions"
## 
## 
## Attaching package: 'compositions'
## 
## The following objects are masked from 'package:stats':
## 
##     cor, cov, dist, var
## 
## The following objects are masked from 'package:base':
## 
##     %*%, scale, scale.default

data(Coxite)         # For pkg compositions, needed to access data 
coxite <- as.data.frame(Coxite)   # From matrix, create data frame 
## Scatterplot matrix for data frame coxite 
par(pty="s")
pairs(coxite)

plot of chunk unnamed-chunk-1

par(pty="m")

coxiteAll.lm <- lm(porosity ~ A+B+C+D+E+depth, data=coxite) 
summary(coxiteAll.lm)

## 
## Call:
## lm(formula = porosity ~ A + B + C + D + E + depth, data = coxite)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9304 -0.4698  0.0242  0.3522  1.1822 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -217.7466   253.4439   -0.86     0.40
## A              2.6486     2.4825    1.07     0.30
## B              2.1915     2.6015    0.84     0.41
## C              0.2113     2.2271    0.09     0.93
## D              4.9492     4.6720    1.06     0.30
## E                  NA         NA      NA       NA
## depth          0.0145     0.0333    0.44     0.67
## 
## Residual standard error: 0.649 on 19 degrees of freedom
## Multiple R-squared:  0.936,  Adjusted R-squared:  0.919 
## F-statistic: 55.1 on 5 and 19 DF,  p-value: 1.18e-10

hat <- predict(coxiteAll.lm, interval="confidence", level=0.95) 

## ss 6.6.1: The variance inflation factor (VIF) 
vif(lm(porosity ~ A+B+C+D+depth, data=coxite))

##        A        B        C        D    depth 
## 2717.815 2484.976  192.589  566.144    3.417

cor(coxite$porosity, coxite)

##          A       B       C       D       E   depth porosity
## [1,] 0.869 -0.5511 -0.7233 -0.3199 -0.4076 -0.1468        1

summary(coxiteABC.lm <- lm(porosity ~ A+B+C, data=coxite))

## 
## Call:
## lm(formula = porosity ~ A + B + C, data = coxite)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9814 -0.3745  0.0229  0.4174  1.2727 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  53.3046    13.2219    4.03  0.00060
## A            -0.0125     0.1558   -0.08  0.93700
## B            -0.5867     0.1513   -3.88  0.00087
## C            -2.2188     0.3389   -6.55  1.7e-06
## 
## Residual standard error: 0.642 on 21 degrees of freedom
## Multiple R-squared:  0.93,   Adjusted R-squared:  0.92 
## F-statistic: 93.3 on 3 and 21 DF,  p-value: 2.64e-12

vif(coxiteABC.lm)

##      A      B      C 
## 10.936  8.592  4.555

summary(coxiteBC.lm <- lm(porosity ~ B+C, data=coxite))

## 
## Call:
## lm(formula = porosity ~ B + C, data = coxite)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9835 -0.3785 -0.0035  0.4178  1.2645 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  52.2571     1.7831    29.3  < 2e-16
## B            -0.5753     0.0508   -11.3  1.2e-10
## C            -2.1949     0.1562   -14.1  1.8e-12
## 
## Residual standard error: 0.628 on 22 degrees of freedom
## Multiple R-squared:  0.93,   Adjusted R-squared:  0.924 
## F-statistic:  147 on 2 and 22 DF,  p-value: 1.91e-13

vif(coxiteBC.lm)

##     B     C 
## 1.013 1.013

AIC(coxiteAll.lm, coxiteBC.lm)

##              df   AIC
## coxiteAll.lm  7 56.50
## coxiteBC.lm   4 52.47

##                  Numbers that do not quite add up 
coxiteR <- coxite 
coxiteR[, 1:5] <- round(coxiteR[, 1:5]) 
summary(lm(porosity ~ ., data=coxiteR))

## 
## Call:
## lm(formula = porosity ~ ., data = coxiteR)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7056 -0.3369  0.0598  0.4361  0.9831 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -1.4251    23.4043   -0.06  0.95212
## A             0.5597     0.2457    2.28  0.03515
## B             0.0157     0.2403    0.07  0.94865
## C            -1.3180     0.2940   -4.48  0.00029
## D             0.9748     0.4222    2.31  0.03305
## E            -0.5530     0.5242   -1.05  0.30543
## depth        -0.0149     0.0223   -0.67  0.51310
## 
## Residual standard error: 0.708 on 18 degrees of freedom
## Multiple R-squared:  0.927,  Adjusted R-squared:  0.903 
## F-statistic: 38.3 on 6 and 18 DF,  p-value: 2.69e-09

vif(lm(porosity ~ .-E, data=coxiteR))

##      A      B      C      D  depth 
## 16.964 16.487  3.281  4.661  1.252

## ss 6.6.2: Remedies for  multicollinearity 

## Sec 6.7: Errors in $x$ 
##                   Measurement of dietary intake 
##           Simulations of the effect of measurement error 
##                     *Two explanatory variables 
##  Two explanatory variables, one measured without error -- a simulation 
errorsINseveral()

##                           Intercept    b1     b2 SE(Int) SE(b1) SE(b2)
## Values for simulation         2.481 1.500  0.000      NA     NA     NA
## Estimates: no error in x1     2.309 1.508  0.006   0.193  0.009  0.009
## LS Estimates: error in x1    23.155 0.402 -0.563   0.591  0.021  0.037
## Theoretical attenuation of b1                Theoretical b2 
##                        0.2500                       -0.5625

plot of chunk unnamed-chunk-1

##                  An arbitrary number ot variables 
##     *The classical error model versus the Berkson error model 

## Sec 6.8: Multiple Regression Models -- Additional Points 
## ss 6.8.1: Confusion between explanatory and response variables 
coef(lm(area ~ volume + weight, data=allbacks))

## (Intercept)      volume      weight 
##     35.4587     -0.9636      1.3611

b <- as.vector(coef(lm(weight ~ volume + area, data=allbacks))) 
c("_Intercept_"=-b[1]/b[3], volume=-b[2]/b[3], weight=1/b[3])

## _Intercept_      volume      weight 
##     -47.847      -1.512       2.135

##                      Unintended correlations 
## ss 6.8.2: Missing explanatory variables 
##                             Strategies 
## ss 6.8.3: \!\!$^{*}$ The use of transformations 
## ss 6.8.4: \!\!$^{*}$ Non-linear methods -- an alternative to transformation? 
nihills$climb.mi <- nihills$climb/5280 
nihills.nls0 <- nls(time ~ (dist^alpha)*(climb.mi^beta), start = 
                  c(alpha = 0.68, beta = 0.465), data = nihills) 
plot(residuals(nihills.nls0) ~ log(predict(nihills.nls0)))

plot of chunk unnamed-chunk-1

nihills.nls <- nls(time ~ gamma + delta1*dist^alpha +  
                   delta2*climb.mi^beta, 
                   start=c(gamma = .045, delta1 = .09, alpha = 1, 
                   delta2=.9, beta = 1.65), data=nihills) 
plot(residuals(nihills.nls) ~ log(predict(nihills.nls)))

plot of chunk unnamed-chunk-1

## Sec 6.9: Recap 

## Sec 6.10: Further Reading 
## Set up factor that identifies the `have' cities 
## Data frame cities (DAAG) 
cities$have <- factor((cities$REGION=="ON")| 
                      (cities$REGION=="WEST")) 

par(pty="s")
plot(POP1996 ~ POP1992, data=cities,  
     col=as.integer(cities$have))

plot of chunk unnamed-chunk-1

plot(log(POP1996) ~ log(POP1992), data=cities,  
                        col=as.integer(cities$have))

plot of chunk unnamed-chunk-1

par(pty="m")

cities.lm1 <- lm(POP1996 ~ have+POP1992, data=cities) 
cities.lm2 <- lm(log(POP1996) ~ have+log(POP1992), 
                 data=cities) 

nihills.lm <- lm(time ~ dist+climb, data=nihills) 
nihills2.lm <- lm(time ~ dist+climb+dist:climb, data=nihills) 
anova(nihills.lm, nihills2.lm)

## Analysis of Variance Table
## 
## Model 1: time ~ dist + climb
## Model 2: time ~ dist + climb + dist:climb
##   Res.Df    RSS Df Sum of Sq    F  Pr(>F)
## 1     20 0.1894                          
## 2     19 0.0394  1      0.15 72.4 6.6e-08

x1 <- runif(10)            # predictor which will be missing 
x2 <- rbinom(10, 1, 1-x1)  # observed predictor which depends  
                           # on missing predictor  
y <- 5*x1 + x2 + rnorm(10,sd=.1)   # simulated model; coef  
                                   # of x2 is positive  
y.lm <- lm(y ~ factor(x2)) # model fitted to observed data 
coef(y.lm)

## (Intercept) factor(x2)1 
##      3.9609     -0.8824

y.lm2 <- lm(y ~ x1 + factor(x2))   # correct model  
coef(y.lm2)

## (Intercept)          x1 factor(x2)1 
##    -0.08243     5.04536     1.15339

library(MASS)
nraw.lqs <- lqs(northRain ~ SOI + CO2, data=bomregions) 
north.lqs <- lqs(I(northRain^(1/3)) ~ SOI + CO2, data=bomregions) 
par(mfrow=c(2,1)) 
plot(residuals(nraw.lqs) ~ Year, data=bomregions) 
plot(residuals(north.lqs) ~ Year, data=bomregions)

par(mfrow=c(1,1))

multilr.R

johnm1

Tue Sep 2 21:22:32 2014