glm.R

## Chapter 8: {G}eneralizedLinear Models, and Survival Analysis 

## Sec 8.1: Generalized Linear Models 
## ss 8.1.1: Transformation of the expected value on the left 
## Footnote Code
## Simplified plot showing the logit link function 
p <- (1:999)/1000 
gitp <- log(p/(1 - p)) 
par(pty="s")
plot(p, gitp, xlab = "Proportion", ylab = "", type = "l", pch = 1)

plot of chunk unnamed-chunk-1

par(pty="m")

## ss 8.1.2: Noise terms need not be normal 
## ss 8.1.3: Log odds in contingency tables 
## ss 8.1.4: Logistic regression with a continuous explanatory variable 
library(DAAG)

## Loading required package: lattice

anestot <- aggregate(anesthetic[, c("move","nomove")],  
                     by=list(conc=anesthetic$conc), FUN=sum) 
## The column 'conc', because from the 'by' list, is then a factor. 
## The next line recovers the numeric values 
anestot$conc <- as.numeric(as.character(anestot$conc)) 
anestot$total <- apply(anestot[, c("move","nomove")], 1 , sum) 
anestot$prop <- anestot$nomove/anestot$total 

## Footnote Code
## Plot proportion moving vs conc: data frame anesthetic (DAAG) 
plot(prop ~ conc, data=anestot, xlab = "Concentration",  
     ylab = "Proportion", xlim = c(.5,2.5), ylim = c(0, 1), pch = 16) 
with(anestot, 
     {text(conc, prop, paste(total), pos=2) 
      abline(h=sum(nomove)/sum(total), lty=2)})

plot of chunk unnamed-chunk-1

## Fit model directly to the 0/1 data in nomove 
anes.logit <- glm(nomove ~ conc, family=binomial(link="logit"), 
                  data=anesthetic) 
## Fit model to the proportions; supply total numbers as weights 
anes1.logit <- glm(prop ~ conc, family=binomial(link="logit"), 
                   weights=total, data=anestot) 

## Footnote Code
## Graphical summary of logistic regression results 
anestot$emplogit <- with(anestot, log((nomove+0.5)/(move+0.5))) 
plot(emplogit ~ conc, data=anestot, 
     xlab = "Concentration", xlim = c(0, 2.75), xaxs="i", 
     ylab = "Empirical logit", ylim=c(-2, 2.4), cex=1.5, pch = 16) 
with(anestot, text(conc, emplogit, paste(round(prop,2)), pos=c(2,4,2,4,4,4))) 
abline(anes.logit)

plot of chunk unnamed-chunk-1

summary(anes.logit)

## 
## Call:
## glm(formula = nomove ~ conc, family = binomial(link = "logit"), 
##     data = anesthetic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7667  -0.7441   0.0341   0.6867   2.0690  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -6.47       2.42   -2.67   0.0075
## conc            5.57       2.04    2.72   0.0064
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41.455  on 29  degrees of freedom
## Residual deviance: 27.754  on 28  degrees of freedom
## AIC: 31.75
## 
## Number of Fisher Scoring iterations: 5

## Sec 8.2: Logistic Multiple Regression 
## Footnote Code
## Presence/absence information: data frame frogs (DAAGS) 
plot(northing ~ easting, data=frogs, pch=c(1,16)[frogs$pres.abs+1], 
     xlab="Meters east of reference point", ylab="Meters north", asp=1)

plot of chunk unnamed-chunk-1

## Footnote Code
## Pairs plot; frogs data 
pairs(frogs[, c(5:10,4)], oma=c(2,2,2,2), cex=0.5)

plot of chunk unnamed-chunk-1

## Footnote Code
## Here is code for the top row of plots 
par(mfrow=c(1,3), pty="s") 
for(nam in c("distance","NoOfPools","NoOfSites")){ 
    y <- frogs[, nam] 
    plot(density(y), main="", xlab=nam) 
    }

plot of chunk unnamed-chunk-1

  # The other rows can be obtained by replacing y by 
  # sqrt(y) or log(y) (or, for NoOfSites, by log(y+1)) 
par(mfrow=c(1,1), pty="m") 

## Footnote Code
## Correlation between altitude and meanmax 
with(frogs, cor(altitude,meanmax))

## [1] -0.9966

with(frogs, cor(cbind(altitude,  
                      "meanmax+meanmin" = meanmin+meanmax,  
                      "meanmax-meanmin" = meanmax-meanmin)))

##                 altitude meanmax+meanmin meanmax-meanmin
## altitude          1.0000         -0.9933         -0.9095
## meanmax+meanmin  -0.9933          1.0000          0.8730
## meanmax-meanmin  -0.9095          0.8730          1.0000

## Footnote Code
## ss 8.2.1: Selection of model terms, and fitting the model 
summary(frogs.glm0 <- glm(formula = pres.abs ~ log(distance) + 
                          log(NoOfPools) + NoOfSites + avrain +  
                          I(meanmax+meanmin)+I(meanmax-meanmin), 
                          family = binomial, data = frogs))

## 
## Call:
## glm(formula = pres.abs ~ log(distance) + log(NoOfPools) + NoOfSites + 
##     avrain + I(meanmax + meanmin) + I(meanmax - meanmin), family = binomial, 
##     data = frogs)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.976  -0.719  -0.279   0.797   2.575  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)          18.26890   16.13819    1.13   0.2576
## log(distance)        -0.75832    0.25581   -2.96   0.0030
## log(NoOfPools)        0.57090    0.21533    2.65   0.0080
## NoOfSites            -0.00362    0.10615   -0.03   0.9728
## avrain                0.00070    0.04117    0.02   0.9864
## I(meanmax + meanmin)  1.49581    0.31532    4.74  2.1e-06
## I(meanmax - meanmin) -3.85827    1.27839   -3.02   0.0025
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 279.99  on 211  degrees of freedom
## Residual deviance: 197.65  on 205  degrees of freedom
## AIC: 211.7
## 
## Number of Fisher Scoring iterations: 5

frogs.glm <- glm(pres.abs ~ log(distance) + log(NoOfPools) + 
                 I(meanmax+meanmin)+ I(meanmax-meanmin),  
                 family = binomial, data = frogs) 
summary(frogs.glm)

## 
## Call:
## glm(formula = pres.abs ~ log(distance) + log(NoOfPools) + I(meanmax + 
##     meanmin) + I(meanmax - meanmin), family = binomial, data = frogs)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.975  -0.722  -0.278   0.797   2.574  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)            18.527      5.267    3.52  0.00044
## log(distance)          -0.755      0.226   -3.34  0.00084
## log(NoOfPools)          0.571      0.215    2.65  0.00800
## I(meanmax + meanmin)    1.498      0.309    4.85  1.2e-06
## I(meanmax - meanmin)   -3.881      0.900   -4.31  1.6e-05
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 279.99  on 211  degrees of freedom
## Residual deviance: 197.66  on 207  degrees of freedom
## AIC: 207.7
## 
## Number of Fisher Scoring iterations: 5

## ss 8.2.2: Fitted values 
fitted(frogs.glm)    # Fitted values' scale of response

##        2        3        4        5        6        7        8        9 
## 0.941669 0.925923 0.902942 0.811962 0.931407 0.727804 0.864155 0.695441 
##       10       11       12       13       14       15       16       17 
## 0.711500 0.834471 0.706990 0.727109 0.574645 0.850361 0.821729 0.575914 
##       18       19       20       21       22       23       24       25 
## 0.420438 0.478623 0.734189 0.711845 0.624518 0.282346 0.594180 0.806511 
##       26       27       28       29       30       31       32       33 
## 0.607670 0.400272 0.698821 0.796676 0.785570 0.768837 0.701169 0.586512 
##       34       35       36       37       38       39       40       41 
## 0.423363 0.637111 0.662719 0.540080 0.796559 0.712311 0.575402 0.496934 
##       42       43       44       45       46       47       48       49 
## 0.748626 0.704090 0.864860 0.777353 0.494929 0.696701 0.744258 0.802261 
##       50       51       52       53       54       55       56       57 
## 0.847208 0.701503 0.593270 0.685272 0.691526 0.583042 0.690469 0.472419 
##       58       59       60       61       62       63       64       65 
## 0.829069 0.484602 0.513528 0.522079 0.840043 0.820194 0.240414 0.623950 
##       66       67       68       69       70       71       72       73 
## 0.195567 0.166309 0.265794 0.323302 0.290319 0.064641 0.210141 0.321942 
##       74       75       76       77       78       79       80       81 
## 0.365431 0.036456 0.038356 0.040025 0.086510 0.023103 0.369142 0.375357 
##       82       83       84       85       86       87       88       89 
## 0.015904 0.022523 0.020334 0.057952 0.286357 0.099896 0.075280 0.133611 
##       90       91       92       93       94       95       96       97 
## 0.219612 0.048931 0.109191 0.067965 0.072204 0.282890 0.857686 0.071038 
##       98       99      100      101      102      103      104      105 
## 0.280654 0.220416 0.161458 0.506438 0.166253 0.660414 0.033694 0.026560 
##      106      107      108      109      110      111      112      113 
## 0.037830 0.016301 0.013103 0.006642 0.252215 0.075621 0.130806 0.163252 
##      114      115      116      117      118      119      120      121 
## 0.237635 0.102676 0.076604 0.082555 0.331927 0.135997 0.020128 0.020248 
##      122      123      124      125      126      127      128      129 
## 0.016313 0.056368 0.024691 0.600743 0.343793 0.308990 0.013649 0.003690 
##      130      131      132      133      134      135      136      137 
## 0.510017 0.635385 0.364664 0.501010 0.707043 0.481454 0.645131 0.006449 
##      138      139      140      141      142      143      144      145 
## 0.007381 0.096685 0.161051 0.630299 0.211762 0.526265 0.092460 0.284760 
##      146      147      148      149      150      151      152      153 
## 0.780909 0.515404 0.229158 0.117452 0.369086 0.399287 0.373019 0.037976 
##      154      155      156      157      158      159      160      161 
## 0.005254 0.004414 0.004480 0.007420 0.325058 0.403678 0.231265 0.339481 
##      162      163      164      165      166      167      168      169 
## 0.106121 0.128927 0.509518 0.296839 0.154725 0.369067 0.559769 0.540728 
##      170      171      172      173      174      175      176      177 
## 0.186427 0.293678 0.195802 0.216879 0.091356 0.372077 0.166865 0.816707 
##      178      179      180      181      182      183      184      185 
## 0.640409 0.857844 0.119140 0.022388 0.016253 0.752317 0.320934 0.562803 
##      186      187      188      189      190      191      192      193 
## 0.676636 0.566281 0.483325 0.069631 0.101570 0.100607 0.156709 0.214156 
##      194      195      196      197      198      199      200      201 
## 0.090205 0.115585 0.202582 0.600756 0.431604 0.478538 0.164915 0.152063 
##      202      203      204      205      206      207      208      209 
## 0.128556 0.427266 0.227111 0.302314 0.155684 0.068940 0.102485 0.145662 
##      210      211      212      213 
## 0.004054 0.004603 0.016190 0.349465

fitresp <- predict(frogs.glm, type="response")  # Same as fitted(frogs.glm) 
fitlp <- predict(frogs.glm, type="link")      # Scale of linear predictor 
## For approximate SEs, specify 
fitlpWithSE <- predict(frogs.glm, type="link", se.fit=TRUE) 

## ss 8.2.3: A plot of contributions of explanatory variables 
## Footnote Code
## For binary response data, plots of residuals are not insightful 
## To see what they look like, try: 
par(mfrow=c(1,4), pty="s")
termplot(frogs.glm, data=frogs, partial.resid=TRUE)

plot of chunk unnamed-chunk-1

par(mfrow=c(1,1), pty="m")

par(mfrow=c(1,4), pty="s") 
frogs$maxminSum <- with(frogs, meanmax+meanmin) 
frogs$maxminDiff <- with(frogs, meanmax-meanmin) 
frogs.glm <- glm(pres.abs ~ log(distance) + log(NoOfPools) +  
                 maxminSum + maxminDiff, family = binomial,  
                 data = frogs) 
termplot(frogs.glm)

plot of chunk unnamed-chunk-1

par(mfrow=c(1,1), pty="s") 

## ss 8.2.4: Cross-validation estimates of predictive accuracy 
## Footnote Code
## The cross-validated estimate of accuracy is stored in the list  
## element acc.cv, in the output from the function CVbinary(), thus: 
for (j in 1:4){ 
     randsam <- sample(1:10, 212, replace=TRUE) 
     initial.acc <- CVbinary(frogs.glm0, rand=randsam,  
                             print.details=FALSE)$acc.cv 
     reduced.acc <- CVbinary(frogs.glm, rand=randsam, 
                             print.details=FALSE)$acc.cv 
     cat("Initial:", round(initial.acc,3), "  Reduced:", round(reduced.acc,3), "\n") 
     }

## Initial: 0.769   Reduced: 0.769 
## Initial: 0.764   Reduced: 0.774 
## Initial: 0.774   Reduced: 0.769 
## Initial: 0.774   Reduced: 0.778

## Sec 8.3: Logistic Models for Categorical Data -- an Example 
## Create data frame from multi-way table UCBAdmissions (datasets) 
dimnames(UCBAdmissions)  # Check levels of table margins

## $Admit
## [1] "Admitted" "Rejected"
## 
## $Gender
## [1] "Male"   "Female"
## 
## $Dept
## [1] "A" "B" "C" "D" "E" "F"

UCB <- as.data.frame.table(UCBAdmissions["Admitted", , ]) 
names(UCB)[3] <- "admit" 
UCB$reject <- as.data.frame.table(UCBAdmissions["Rejected", , ])$Freq 
UCB$Gender <- relevel(UCB$Gender, ref="Male") 
## Add further columns total and p (proportion admitted) 
UCB$total <- UCB$admit + UCB$reject 
UCB$p <- UCB$admit/UCB$total 

UCB.glm <- glm(p ~ Dept*Gender, family=binomial,  
               data=UCB, weights=total) 
anova(UCB.glm, test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: p
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL                           11        877         
## Dept         5      855         6         22   <2e-16
## Gender       1        2         5         20   0.2159
## Dept:Gender  5       20         0          0   0.0011

summary(UCB.glm)$coef

##                    Estimate Std. Error  z value  Pr(>|z|)
## (Intercept)         0.49212    0.07175   6.8589 6.941e-12
## DeptB               0.04163    0.11319   0.3678 7.130e-01
## DeptC              -1.02764    0.13550  -7.5842 3.345e-14
## DeptD              -1.19608    0.12641  -9.4622 3.016e-21
## DeptE              -1.44908    0.17681  -8.1956 2.493e-16
## DeptF              -3.26187    0.23120 -14.1087 3.359e-45
## GenderFemale        1.05208    0.26271   4.0047 6.209e-05
## DeptB:GenderFemale -0.83205    0.51039  -1.6302 1.031e-01
## DeptC:GenderFemale -1.17700    0.29956  -3.9291 8.526e-05
## DeptD:GenderFemale -0.97009    0.30262  -3.2056 1.348e-03
## DeptE:GenderFemale -1.25226    0.33032  -3.7910 1.500e-04
## DeptF:GenderFemale -0.86318    0.40267  -2.1437 3.206e-02

## Sec 8.4: Poisson and Quasi-Poisson Regression 
## ss 8.4.1: Data on aberrant crypt foci 
## Footnote Code
## Plot count vs endtime: data frame ACF1 (DAAG) 
plot(count ~ endtime, data=ACF1, pch=16, log="x")

plot of chunk unnamed-chunk-1

summary(ACF.glm0 <- glm(formula = count ~ endtime,  
                         family = poisson, data = ACF1))

## 
## Call:
## glm(formula = count ~ endtime, family = poisson, data = ACF1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4620  -0.4785  -0.0794   0.3816   2.2633  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -0.3215     0.4005   -0.80     0.42
## endtime       0.1192     0.0264    4.51  6.4e-06
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 51.105  on 21  degrees of freedom
## Residual deviance: 28.369  on 20  degrees of freedom
## AIC: 92.21
## 
## Number of Fisher Scoring iterations: 5

summary(ACF.glm <- glm(formula = count ~ endtime + I(endtime^2), 
                       family = poisson, data = ACF1))

## 
## Call:
## glm(formula = count ~ endtime + I(endtime^2), family = poisson, 
##     data = ACF1)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.062  -0.783  -0.281   0.451   2.169  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)   1.72236    1.09249    1.58    0.115
## endtime      -0.26236    0.19969   -1.31    0.189
## I(endtime^2)  0.01514    0.00795    1.90    0.057
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 51.105  on 21  degrees of freedom
## Residual deviance: 24.515  on 19  degrees of freedom
## AIC: 90.35
## 
## Number of Fisher Scoring iterations: 5

(etime <- unique(ACF1$endtime))

## [1]  6 12 18

exp(-0.3215 + 0.1192*etime)  # Linear term only

## [1] 1.482 3.031 6.197

exp(1.72235 - 0.26235*etime + 0.01514*etime^2)  # Quadratic model

## [1] 2.000 2.126 6.722

sum(resid(ACF.glm, type="pearson")^2)/19

## [1] 1.254

ACFq.glm <- glm(formula = count ~ endtime, 
                family = quasipoisson, data = ACF1) 
summary(ACFq.glm)$coef

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)  -0.3215    0.45532 -0.7062 0.4882392
## endtime       0.1192    0.03004  3.9679 0.0007584

sapply(split(residuals(ACFq.glm), ACF1$endtime), var)

##      6     12     18 
## 1.0958 1.9793 0.3651

fligner.test(resid(ACFq.glm) ~ factor(ACF1$endtime))

## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  resid(ACFq.glm) by factor(ACF1$endtime)
## Fligner-Killeen:med chi-squared = 2.175, df = 2, p-value = 0.337

## ss 8.4.2: Moth habitat example 
## Footnote Code
## Number of moths by habitat: data frame moths (DAAG) 
rbind(Number=table(moths[, 4]), sapply(split(moths[, -4], moths$habitat),  
      apply, 2, sum))

##        Bank Disturbed Lowerside NEsoak NWsoak SEsoak SWsoak Upperside
## Number    1         7         9      6      3      7      3         5
## meters   21        49       191    254     65    193    116       952
## A         0         8        41     14     71     37     20        28
## P         4        33        17     14     19      6     48         8

library(lattice) 
dotplot(habitat ~ A, data=moths, xlab="Number of moths (species A)",  
        panel=function(x, y, ...){  
          panel.dotplot(x,y, pch=1, col="black", ...) 
          av <- sapply(split(x,y),mean);
          ypos <- unique(y)
          lpoints(ypos~av, pch=3, col="black", cex=1.25)          
          },  
        key=list(text=list(c("Individual transects", "Mean")), 
          points=list(pch=c(1,3), cex=c(1,1.25), col=c("black","gray45")), 
          columns=2))

plot of chunk unnamed-chunk-1

##            An unsatisfactory choice of reference level 
summary(A.glm <- glm(A ~ habitat + log(meters),  
                     family = quasipoisson, data = moths))

## 
## Call:
## glm(formula = A ~ habitat + log(meters), family = quasipoisson, 
##     data = moths)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.556  -1.463  -0.001   0.925   3.082  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)       -15.696   2096.588   -0.01     0.99
## habitatDisturbed   15.622   2096.588    0.01     0.99
## habitatLowerside   16.906   2096.588    0.01     0.99
## habitatNEsoak      16.084   2096.588    0.01     0.99
## habitatNWsoak      18.468   2096.588    0.01     0.99
## habitatSEsoak      16.968   2096.588    0.01     0.99
## habitatSWsoak      17.137   2096.588    0.01     0.99
## habitatUpperside   16.743   2096.588    0.01     0.99
## log(meters)         0.129      0.153    0.85     0.40
## 
## (Dispersion parameter for quasipoisson family taken to be 2.701)
## 
##     Null deviance: 257.108  on 40  degrees of freedom
## Residual deviance:  93.991  on 32  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 13

subset(moths, habitat=="Bank")

##    meters A P habitat
## 40     21 0 4    Bank

## Footnote Code
## Analysis with tighter convergence criterion 
A.glm <- update(A.glm, epsilon=1e-10) 
summary(A.glm)$coef

##                  Estimate Std. Error    t value Pr(>|t|)
## (Intercept)      -20.6958  2.554e+04 -0.0008103   0.9994
## habitatDisturbed  20.6224  2.554e+04  0.0008074   0.9994
## habitatLowerside  21.9057  2.554e+04  0.0008576   0.9993
## habitatNEsoak     21.0840  2.554e+04  0.0008255   0.9993
## habitatNWsoak     23.4681  2.554e+04  0.0009188   0.9993
## habitatSEsoak     21.9677  2.554e+04  0.0008601   0.9993
## habitatSWsoak     22.1371  2.554e+04  0.0008667   0.9993
## habitatUpperside  21.7433  2.554e+04  0.0008513   0.9993
## log(meters)        0.1292  1.528e-01  0.8454404   0.4041

head(summary(A.glm)$coefficients, 3)

##                  Estimate Std. Error    t value Pr(>|t|)
## (Intercept)        -20.70      25542 -0.0008103   0.9994
## habitatDisturbed    20.62      25542  0.0008074   0.9994
## habitatLowerside    21.91      25542  0.0008576   0.9993

## SEs of predicted values 
A.se <- predict(A.glm, se=T)$se.fit 
A.se[moths$habitat=="Bank"]

##    40 
## 25542

range(A.se[moths$habitat!="Bank"])

## [1] 0.1969 0.6312

##           A more satisfactory choice of reference level 
moths$habitat <- relevel(moths$habitat, ref="Lowerside") 
summary(A.glm <- glm(A ~ habitat + log(meters),  
                     family=quasipoisson, data=moths))

## 
## Call:
## glm(formula = A ~ habitat + log(meters), family = quasipoisson, 
##     data = moths)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.556  -1.463  -0.001   0.925   3.082  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)         1.2098     0.4553    2.66    0.012
## habitatBank       -16.9057  2096.5884   -0.01    0.994
## habitatDisturbed   -1.2832     0.6474   -1.98    0.056
## habitatNEsoak      -0.8217     0.5370   -1.53    0.136
## habitatNWsoak       1.5624     0.3343    4.67  5.1e-05
## habitatSEsoak       0.0621     0.3852    0.16    0.873
## habitatSWsoak       0.2314     0.4781    0.48    0.632
## habitatUpperside   -0.1623     0.5843   -0.28    0.783
## log(meters)         0.1292     0.1528    0.85    0.404
## 
## (Dispersion parameter for quasipoisson family taken to be 2.701)
## 
##     Null deviance: 257.108  on 40  degrees of freedom
## Residual deviance:  93.991  on 32  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 13

##          The comparison between {Bank} and other habitats 
## Examine change in deviance from merging Bank with NWsoak 
habitatNW <- moths$habitat   
habitatNW[habitatNW=="Bank"] <- "NWsoak" 
habitatNW <- factor(habitatNW)  # NB: Bank and NWsoak  
table(habitatNW)

## habitatNW
## Lowerside Disturbed    NEsoak    NWsoak    SEsoak    SWsoak Upperside 
##         9         7         6         4         7         3         5

ANW.glm <- glm(A ~ habitatNW + log(meters), family = quasipoisson,  
               data=moths) 
anova(A.glm, ANW.glm, test="F")

## Analysis of Deviance Table
## 
## Model 1: A ~ habitat + log(meters)
## Model 2: A ~ habitatNW + log(meters)
##   Resid. Df Resid. Dev Df Deviance    F  Pr(>F)
## 1        32         94                         
## 2        33        135 -1    -40.9 15.1 0.00047

habitatD <- moths$habitat  
habitatD[habitatD=="Bank"] <- "Disturbed"  
habitatD <- factor(habitatD)  
A.glm <- glm(A ~ habitat + log(meters), family = quasipoisson,   
             data=moths)  
AD.glm <- glm(A ~ habitatD + log(meters), family = quasipoisson,   
              data=moths)  
anova(A.glm, AD.glm, test="F")

## Analysis of Deviance Table
## 
## Model 1: A ~ habitat + log(meters)
## Model 2: A ~ habitatD + log(meters)
##   Resid. Df Resid. Dev Df Deviance    F Pr(>F)
## 1        32       94.0                        
## 2        33       96.5 -1    -2.52 0.93   0.34

##                          Diagnostic plots 
A1.glm <- glm(formula = A ~ habitat + log(meters), family = quasipoisson,  
              data = moths, subset=habitat!="Bank") 
par(mfrow=c(1,4), pty="s")
plot(A1.glm, panel=panel.smooth)

plot of chunk unnamed-chunk-1

par(mfrow=c(1,1), pty="m")

## Sec 8.5: Additional Notes on Generalized Linear Models 
## ss 8.5.1: $\!\!^{*}$ Residuals, and estimating the dispersion 
##         Other choices of link function for binomial models 
##              Quasi-binomial and quasi-poisson models 
## ss 8.5.2: Standard errors and $z$- or $t$-statistics for binomial models 
fac <- factor(LETTERS[1:4]) 
p <- c(103, 30, 11, 3)/500 
n <- rep(500,4) 
summary(glm(p ~ fac, family=binomial, weights=n))$coef

##             Estimate Std. Error z value  Pr(>|z|)
## (Intercept)   -1.349     0.1106 -12.201 3.057e-34
## facB          -1.402     0.2184  -6.422 1.349e-10
## facC          -2.445     0.3243  -7.540 4.710e-14
## facD          -3.761     0.5896  -6.379 1.782e-10

## ss 8.5.3: Leverage for binomial models 

## Sec 8.6: Models with an Ordered Categorical or Categorical Response 
## ss 8.6.1: Ordinal Regression Models 
##                        Exploratory analysis 
##          $\!\!^{*}$ Proportional odds logistic regression 
library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:DAAG':
## 
##     hills

inhaler <-  data.frame(freq=c(99,76,41,55,2,13), 
                       choice=rep(c("inh1","inh2"), 3), 
                       ease=ordered(rep(c("easy","re-read", 
                                          "unclear"), rep(2,3)))) 
inhaler1.polr <-  polr(ease ~ 1, weights=freq, data=inhaler, 
                       Hess=TRUE, subset=inhaler$choice=="inh1")  
 # Setting Hess=TRUE at this point averts possible numerical 
 # problems if this calculation is deferred until later. 
inhaler2.polr <-  polr(ease ~ 1, weights=freq, data=inhaler, 
                       Hess=TRUE, subset=inhaler$choice=="inh2") 

summary(inhaler1.polr)  # inhaler$choice == "inh1"

## Call:
## polr(formula = ease ~ 1, data = inhaler, weights = freq, subset = inhaler$choice == 
##     "inh1", Hess = TRUE)
## 
## No coefficients
## 
## Intercepts:
##                 Value Std. Error t value
## easy|re-read    0.834 0.183      4.566  
## re-read|unclear 4.248 0.712      5.966  
## 
## Residual Deviance: 190.34 
## AIC: 194.34

summary(inhaler2.polr)  # inhaler$choice == "inh2"

## Call:
## polr(formula = ease ~ 1, data = inhaler, weights = freq, subset = inhaler$choice == 
##     "inh2", Hess = TRUE)
## 
## No coefficients
## 
## Intercepts:
##                 Value Std. Error t value
## easy|re-read    0.111 0.167      0.665  
## re-read|unclear 2.310 0.291      7.944  
## 
## Residual Deviance: 265.54 
## AIC: 269.54

summary(inhaler.polr <-  polr(ease ~ choice, weights=freq, 
                              Hess=TRUE, data=inhaler))

## Call:
## polr(formula = ease ~ choice, data = inhaler, weights = freq, 
##     Hess = TRUE)
## 
## Coefficients:
##            Value Std. Error t value
## choiceinh2  0.79      0.245    3.23
## 
## Intercepts:
##                 Value  Std. Error t value
## easy|re-read     0.863  0.181      4.764 
## re-read|unclear  3.353  0.307     10.920 
## 
## Residual Deviance: 459.29 
## AIC: 465.29

deviance(inhaler.polr) - (deviance(inhaler1.polr)  
                          + deviance(inhaler2.polr))

## [1] 3.416

summary(inhaler.polr)

## Call:
## polr(formula = ease ~ choice, data = inhaler, weights = freq, 
##     Hess = TRUE)
## 
## Coefficients:
##            Value Std. Error t value
## choiceinh2  0.79      0.245    3.23
## 
## Intercepts:
##                 Value  Std. Error t value
## easy|re-read     0.863  0.181      4.764 
## re-read|unclear  3.353  0.307     10.920 
## 
## Residual Deviance: 459.29 
## AIC: 465.29

## ss 8.6.2: $\!\!^{*}$ Loglinear Models 
library(MASS) 
example(loglm)

## 
## loglm> # The data frames  Cars93, minn38 and quine are available
## loglm> # in the MASS package.
## loglm> 
## loglm> # Case 1: frequencies specified as an array.
## loglm> sapply(minn38, function(x) length(levels(x)))
##  hs phs fol sex   f 
##   3   4   7   2   0 
## 
## loglm> ## hs phs fol sex f
## loglm> ##  3   4   7   2 0
## loglm> ##minn38a <- array(0, c(3,4,7,2), lapply(minn38[, -5], levels))
## loglm> ##minn38a[data.matrix(minn38[,-5])] <- minn38$f
## loglm> 
## loglm> ## or more simply
## loglm> minn38a <- xtabs(f ~ ., minn38)
## 
## loglm> fm <- loglm(~ 1 + 2 + 3 + 4, minn38a)  # numerals as names.
## 
## loglm> deviance(fm)
## [1] 3712
## 
## loglm> ## [1] 3711.9
## loglm> fm1 <- update(fm, .~.^2)
## 
## loglm> fm2 <- update(fm, .~.^3, print = TRUE)
## 5 iterations: deviation 0.07512 
## 
## loglm> ## 5 iterations: deviation 0.075
## loglm> anova(fm, fm1, fm2)
## LR tests for hierarchical log-linear models
## 
## Model 1:
##  ~1 + 2 + 3 + 4 
## Model 2:
##  . ~ `1` + `2` + `3` + `4` + `1`:`2` + `1`:`3` + `1`:`4` + `2`:`3` + `2`:`4` + `3`:`4` 
## Model 3:
##  . ~ `1` + `2` + `3` + `4` + `1`:`2` + `1`:`3` + `1`:`4` + `2`:`3` + `2`:`4` + `3`:`4` + `1`:`2`:`3` + `1`:`2`:`4` + `1`:`3`:`4` + `2`:`3`:`4` 
## 
##           Deviance  df Delta(Dev) Delta(df) P(> Delta(Dev)
## Model 1    3711.91 155                                    
## Model 2     220.04 108    3491.87        47        0.00000
## Model 3      47.74  36     172.30        72        0.00000
## Saturated     0.00   0      47.74        36        0.09114
## 
## loglm> # Case 1. An array generated with xtabs.
## loglm> 
## loglm> loglm(~ Type + Origin, xtabs(~ Type + Origin, Cars93))
## Call:
## loglm(formula = ~Type + Origin, data = xtabs(~Type + Origin, 
##     Cars93))
## 
## Statistics:
##                    X^2 df P(> X^2)
## Likelihood Ratio 18.36  5 0.002526
## Pearson          14.08  5 0.015110
## 
## loglm> # Case 2.  Frequencies given as a vector in a data frame
## loglm> names(quine)
## [1] "Eth"  "Sex"  "Age"  "Lrn"  "Days"
## 
## loglm> ## [1] "Eth"  "Sex"  "Age"  "Lrn"  "Days"
## loglm> fm <- loglm(Days ~ .^2, quine)
## 
## loglm> gm <- glm(Days ~ .^2, poisson, quine)  # check glm.
## 
## loglm> c(deviance(fm), deviance(gm))          # deviances agree
## [1] 1369 1369
## 
## loglm> ## [1] 1368.7 1368.7
## loglm> c(fm$df, gm$df)                        # resid df do not!
## [1] 127
## 
## loglm> c(fm$df, gm$df.residual)               # resid df do not!
## [1] 127 128
## 
## loglm> ## [1] 127 128
## loglm> # The loglm residual degrees of freedom is wrong because of
## loglm> # a non-detectable redundancy in the model matrix.
## loglm> 
## loglm> 
## loglm>

## Sec 8.7: Survival analysis 
## ss 8.7.1: Analysis of the {Aids2} data 
library(MASS) 
str(Aids2, vec.len=2)

## 'data.frame':    2843 obs. of  7 variables:
##  $ state  : Factor w/ 4 levels "NSW","Other",..: 1 1 1 1 1 ...
##  $ sex    : Factor w/ 2 levels "F","M": 2 2 2 2 2 ...
##  $ diag   : int  10905 11029 9551 9577 10015 ...
##  $ death  : int  11081 11096 9983 9654 10290 ...
##  $ status : Factor w/ 2 levels "A","D": 2 2 2 2 2 ...
##  $ T.categ: Factor w/ 8 levels "hs","hsid","id",..: 1 1 1 5 1 ...
##  $ age    : int  35 53 42 44 39 ...

bloodAids <- subset(Aids2, T.categ=="blood") 
bloodAids$days <- bloodAids$death-bloodAids$diag 
bloodAids$dead <- as.integer(bloodAids$status=="D") 

library(survival)

## Loading required package: splines
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:DAAG':
## 
##     lung

plot(survfit(Surv(days, dead) ~ sex, data=bloodAids),  
     col=c(2,4), conf.int=TRUE)

plot of chunk unnamed-chunk-1

## ss 8.7.2: Right censoring prior to the termination of the study 
hsaids <- subset(Aids2, sex=="M" & T.categ=="hs") 
hsaids$days <- hsaids$death-hsaids$diag 
hsaids$dead <- as.integer(hsaids$status=="D") 
table(hsaids$status,hsaids$death==11504)

##    
##     FALSE TRUE
##   A    16  916
##   D  1531    1

## ss 8.7.3: The survival curve for male homosexuals 
## Survival curve for male homosexuals 
hsaids.surv <- survfit(Surv(days, dead) ~ 1, data=hsaids) 
plot(hsaids.surv)

plot of chunk unnamed-chunk-1

## ss 8.7.4: Hazard rates 
## ss 8.7.5: The Cox proportional hazards model 
bloodAids.coxph <- coxph(Surv(days, dead) ~ sex, data=bloodAids) 

summary(bloodAids.coxph)

## Call:
## coxph(formula = Surv(days, dead) ~ sex, data = bloodAids)
## 
##   n= 94, number of events= 76 
## 
##        coef exp(coef) se(coef)     z Pr(>|z|)
## sexM -0.276     0.759    0.233 -1.18     0.24
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## sexM     0.759       1.32     0.481       1.2
## 
## Concordance= 0.52  (se = 0.032 )
## Rsquare= 0.015   (max possible= 0.998 )
## Likelihood ratio test= 1.38  on 1 df,   p=0.239
## Wald test            = 1.4  on 1 df,   p=0.236
## Score (logrank) test = 1.41  on 1 df,   p=0.235

cox.zph(bloodAids.coxph)

##         rho chisq     p
## sexM -0.127  1.21 0.272

plot(cox.zph(bloodAids.coxph))

plot of chunk unnamed-chunk-1

plot(bloodAids$days, resid(bloodAids.coxph)) 
lines(lowess(bloodAids$days, resid(bloodAids.coxph)))

plot of chunk unnamed-chunk-1

## Sec 8.8: Transformations for Count Data 

## Sec 8.9: Further Reading 
library(mgcv)

## Loading required package: nlme
## This is mgcv 1.8-2. For overview type 'help("mgcv-package")'.

hand <- with(cricketer, unclass(left)-1)  # 0 for left; 1 for right 
hand.gam <- gam(hand ~ s(year), data=cricketer) 
plot(hand.gam)

plot of chunk unnamed-chunk-1

ndf <- data.frame(year=seq(from=1840, to=1960, by=2)) 
fit <- predict(hand.gam, newdata=ndf, type="response", se.fit=TRUE) 
plot(fit$fit ~ ndf$year, ylim=range(fit), type="l")

plot of chunk unnamed-chunk-1

library(survival) 
coxph(Surv(life, dead) ~ left + poly(year, 4), data = cricketer)

## Call:
## coxph(formula = Surv(life, dead) ~ left + poly(year, 4), data = cricketer)
## 
## 
##                   coef exp(coef) se(coef)       z       p
## leftleft         0.044  1.05e+00   0.0442   0.998 3.2e-01
## poly(year, 4)1 -68.477  1.82e-30   6.7158 -10.196 0.0e+00
## poly(year, 4)2 -25.029  1.35e-11   6.0217  -4.156 3.2e-05
## poly(year, 4)3  -3.215  4.02e-02   4.3242  -0.743 4.6e-01
## poly(year, 4)4   3.159  2.35e+01   2.5087   1.259 2.1e-01
## 
## Likelihood ratio test=526  on 5 df, p=0  n= 5960, number of events= 3387

glm.R

johnm1

Sun Sep 7 10:13:37 2014