## Chapter 8: {G}eneralizedLinear Models, and Survival Analysis
## Sec 8.1: Generalized Linear Models
## ss 8.1.1: Transformation of the expected value on the left
## Footnote Code
## Simplified plot showing the logit link function
p <- (1:999)/1000
gitp <- log(p/(1 - p))
par(pty="s")
plot(p, gitp, xlab = "Proportion", ylab = "", type = "l", pch = 1)
par(pty="m")
## ss 8.1.2: Noise terms need not be normal
## ss 8.1.3: Log odds in contingency tables
## ss 8.1.4: Logistic regression with a continuous explanatory variable
library(DAAG)
## Loading required package: lattice
anestot <- aggregate(anesthetic[, c("move","nomove")],
by=list(conc=anesthetic$conc), FUN=sum)
## The column 'conc', because from the 'by' list, is then a factor.
## The next line recovers the numeric values
anestot$conc <- as.numeric(as.character(anestot$conc))
anestot$total <- apply(anestot[, c("move","nomove")], 1 , sum)
anestot$prop <- anestot$nomove/anestot$total
## Footnote Code
## Plot proportion moving vs conc: data frame anesthetic (DAAG)
plot(prop ~ conc, data=anestot, xlab = "Concentration",
ylab = "Proportion", xlim = c(.5,2.5), ylim = c(0, 1), pch = 16)
with(anestot,
{text(conc, prop, paste(total), pos=2)
abline(h=sum(nomove)/sum(total), lty=2)})
## Fit model directly to the 0/1 data in nomove
anes.logit <- glm(nomove ~ conc, family=binomial(link="logit"),
data=anesthetic)
## Fit model to the proportions; supply total numbers as weights
anes1.logit <- glm(prop ~ conc, family=binomial(link="logit"),
weights=total, data=anestot)
## Footnote Code
## Graphical summary of logistic regression results
anestot$emplogit <- with(anestot, log((nomove+0.5)/(move+0.5)))
plot(emplogit ~ conc, data=anestot,
xlab = "Concentration", xlim = c(0, 2.75), xaxs="i",
ylab = "Empirical logit", ylim=c(-2, 2.4), cex=1.5, pch = 16)
with(anestot, text(conc, emplogit, paste(round(prop,2)), pos=c(2,4,2,4,4,4)))
abline(anes.logit)
summary(anes.logit)
##
## Call:
## glm(formula = nomove ~ conc, family = binomial(link = "logit"),
## data = anesthetic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7667 -0.7441 0.0341 0.6867 2.0690
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.47 2.42 -2.67 0.0075
## conc 5.57 2.04 2.72 0.0064
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41.455 on 29 degrees of freedom
## Residual deviance: 27.754 on 28 degrees of freedom
## AIC: 31.75
##
## Number of Fisher Scoring iterations: 5
## Sec 8.2: Logistic Multiple Regression
## Footnote Code
## Presence/absence information: data frame frogs (DAAGS)
plot(northing ~ easting, data=frogs, pch=c(1,16)[frogs$pres.abs+1],
xlab="Meters east of reference point", ylab="Meters north", asp=1)
## Footnote Code
## Pairs plot; frogs data
pairs(frogs[, c(5:10,4)], oma=c(2,2,2,2), cex=0.5)
## Footnote Code
## Here is code for the top row of plots
par(mfrow=c(1,3), pty="s")
for(nam in c("distance","NoOfPools","NoOfSites")){
y <- frogs[, nam]
plot(density(y), main="", xlab=nam)
}
# The other rows can be obtained by replacing y by
# sqrt(y) or log(y) (or, for NoOfSites, by log(y+1))
par(mfrow=c(1,1), pty="m")
## Footnote Code
## Correlation between altitude and meanmax
with(frogs, cor(altitude,meanmax))
## [1] -0.9966
with(frogs, cor(cbind(altitude,
"meanmax+meanmin" = meanmin+meanmax,
"meanmax-meanmin" = meanmax-meanmin)))
## altitude meanmax+meanmin meanmax-meanmin
## altitude 1.0000 -0.9933 -0.9095
## meanmax+meanmin -0.9933 1.0000 0.8730
## meanmax-meanmin -0.9095 0.8730 1.0000
## Footnote Code
## ss 8.2.1: Selection of model terms, and fitting the model
summary(frogs.glm0 <- glm(formula = pres.abs ~ log(distance) +
log(NoOfPools) + NoOfSites + avrain +
I(meanmax+meanmin)+I(meanmax-meanmin),
family = binomial, data = frogs))
##
## Call:
## glm(formula = pres.abs ~ log(distance) + log(NoOfPools) + NoOfSites +
## avrain + I(meanmax + meanmin) + I(meanmax - meanmin), family = binomial,
## data = frogs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.976 -0.719 -0.279 0.797 2.575
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 18.26890 16.13819 1.13 0.2576
## log(distance) -0.75832 0.25581 -2.96 0.0030
## log(NoOfPools) 0.57090 0.21533 2.65 0.0080
## NoOfSites -0.00362 0.10615 -0.03 0.9728
## avrain 0.00070 0.04117 0.02 0.9864
## I(meanmax + meanmin) 1.49581 0.31532 4.74 2.1e-06
## I(meanmax - meanmin) -3.85827 1.27839 -3.02 0.0025
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 279.99 on 211 degrees of freedom
## Residual deviance: 197.65 on 205 degrees of freedom
## AIC: 211.7
##
## Number of Fisher Scoring iterations: 5
frogs.glm <- glm(pres.abs ~ log(distance) + log(NoOfPools) +
I(meanmax+meanmin)+ I(meanmax-meanmin),
family = binomial, data = frogs)
summary(frogs.glm)
##
## Call:
## glm(formula = pres.abs ~ log(distance) + log(NoOfPools) + I(meanmax +
## meanmin) + I(meanmax - meanmin), family = binomial, data = frogs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.975 -0.722 -0.278 0.797 2.574
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 18.527 5.267 3.52 0.00044
## log(distance) -0.755 0.226 -3.34 0.00084
## log(NoOfPools) 0.571 0.215 2.65 0.00800
## I(meanmax + meanmin) 1.498 0.309 4.85 1.2e-06
## I(meanmax - meanmin) -3.881 0.900 -4.31 1.6e-05
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 279.99 on 211 degrees of freedom
## Residual deviance: 197.66 on 207 degrees of freedom
## AIC: 207.7
##
## Number of Fisher Scoring iterations: 5
## ss 8.2.2: Fitted values
fitted(frogs.glm) # Fitted values' scale of response
## 2 3 4 5 6 7 8 9
## 0.941669 0.925923 0.902942 0.811962 0.931407 0.727804 0.864155 0.695441
## 10 11 12 13 14 15 16 17
## 0.711500 0.834471 0.706990 0.727109 0.574645 0.850361 0.821729 0.575914
## 18 19 20 21 22 23 24 25
## 0.420438 0.478623 0.734189 0.711845 0.624518 0.282346 0.594180 0.806511
## 26 27 28 29 30 31 32 33
## 0.607670 0.400272 0.698821 0.796676 0.785570 0.768837 0.701169 0.586512
## 34 35 36 37 38 39 40 41
## 0.423363 0.637111 0.662719 0.540080 0.796559 0.712311 0.575402 0.496934
## 42 43 44 45 46 47 48 49
## 0.748626 0.704090 0.864860 0.777353 0.494929 0.696701 0.744258 0.802261
## 50 51 52 53 54 55 56 57
## 0.847208 0.701503 0.593270 0.685272 0.691526 0.583042 0.690469 0.472419
## 58 59 60 61 62 63 64 65
## 0.829069 0.484602 0.513528 0.522079 0.840043 0.820194 0.240414 0.623950
## 66 67 68 69 70 71 72 73
## 0.195567 0.166309 0.265794 0.323302 0.290319 0.064641 0.210141 0.321942
## 74 75 76 77 78 79 80 81
## 0.365431 0.036456 0.038356 0.040025 0.086510 0.023103 0.369142 0.375357
## 82 83 84 85 86 87 88 89
## 0.015904 0.022523 0.020334 0.057952 0.286357 0.099896 0.075280 0.133611
## 90 91 92 93 94 95 96 97
## 0.219612 0.048931 0.109191 0.067965 0.072204 0.282890 0.857686 0.071038
## 98 99 100 101 102 103 104 105
## 0.280654 0.220416 0.161458 0.506438 0.166253 0.660414 0.033694 0.026560
## 106 107 108 109 110 111 112 113
## 0.037830 0.016301 0.013103 0.006642 0.252215 0.075621 0.130806 0.163252
## 114 115 116 117 118 119 120 121
## 0.237635 0.102676 0.076604 0.082555 0.331927 0.135997 0.020128 0.020248
## 122 123 124 125 126 127 128 129
## 0.016313 0.056368 0.024691 0.600743 0.343793 0.308990 0.013649 0.003690
## 130 131 132 133 134 135 136 137
## 0.510017 0.635385 0.364664 0.501010 0.707043 0.481454 0.645131 0.006449
## 138 139 140 141 142 143 144 145
## 0.007381 0.096685 0.161051 0.630299 0.211762 0.526265 0.092460 0.284760
## 146 147 148 149 150 151 152 153
## 0.780909 0.515404 0.229158 0.117452 0.369086 0.399287 0.373019 0.037976
## 154 155 156 157 158 159 160 161
## 0.005254 0.004414 0.004480 0.007420 0.325058 0.403678 0.231265 0.339481
## 162 163 164 165 166 167 168 169
## 0.106121 0.128927 0.509518 0.296839 0.154725 0.369067 0.559769 0.540728
## 170 171 172 173 174 175 176 177
## 0.186427 0.293678 0.195802 0.216879 0.091356 0.372077 0.166865 0.816707
## 178 179 180 181 182 183 184 185
## 0.640409 0.857844 0.119140 0.022388 0.016253 0.752317 0.320934 0.562803
## 186 187 188 189 190 191 192 193
## 0.676636 0.566281 0.483325 0.069631 0.101570 0.100607 0.156709 0.214156
## 194 195 196 197 198 199 200 201
## 0.090205 0.115585 0.202582 0.600756 0.431604 0.478538 0.164915 0.152063
## 202 203 204 205 206 207 208 209
## 0.128556 0.427266 0.227111 0.302314 0.155684 0.068940 0.102485 0.145662
## 210 211 212 213
## 0.004054 0.004603 0.016190 0.349465
fitresp <- predict(frogs.glm, type="response") # Same as fitted(frogs.glm)
fitlp <- predict(frogs.glm, type="link") # Scale of linear predictor
## For approximate SEs, specify
fitlpWithSE <- predict(frogs.glm, type="link", se.fit=TRUE)
## ss 8.2.3: A plot of contributions of explanatory variables
## Footnote Code
## For binary response data, plots of residuals are not insightful
## To see what they look like, try:
par(mfrow=c(1,4), pty="s")
termplot(frogs.glm, data=frogs, partial.resid=TRUE)
par(mfrow=c(1,1), pty="m")
par(mfrow=c(1,4), pty="s")
frogs$maxminSum <- with(frogs, meanmax+meanmin)
frogs$maxminDiff <- with(frogs, meanmax-meanmin)
frogs.glm <- glm(pres.abs ~ log(distance) + log(NoOfPools) +
maxminSum + maxminDiff, family = binomial,
data = frogs)
termplot(frogs.glm)
par(mfrow=c(1,1), pty="s")
## ss 8.2.4: Cross-validation estimates of predictive accuracy
## Footnote Code
## The cross-validated estimate of accuracy is stored in the list
## element acc.cv, in the output from the function CVbinary(), thus:
for (j in 1:4){
randsam <- sample(1:10, 212, replace=TRUE)
initial.acc <- CVbinary(frogs.glm0, rand=randsam,
print.details=FALSE)$acc.cv
reduced.acc <- CVbinary(frogs.glm, rand=randsam,
print.details=FALSE)$acc.cv
cat("Initial:", round(initial.acc,3), " Reduced:", round(reduced.acc,3), "\n")
}
## Initial: 0.769 Reduced: 0.769
## Initial: 0.764 Reduced: 0.774
## Initial: 0.774 Reduced: 0.769
## Initial: 0.774 Reduced: 0.778
## Sec 8.3: Logistic Models for Categorical Data -- an Example
## Create data frame from multi-way table UCBAdmissions (datasets)
dimnames(UCBAdmissions) # Check levels of table margins
## $Admit
## [1] "Admitted" "Rejected"
##
## $Gender
## [1] "Male" "Female"
##
## $Dept
## [1] "A" "B" "C" "D" "E" "F"
UCB <- as.data.frame.table(UCBAdmissions["Admitted", , ])
names(UCB)[3] <- "admit"
UCB$reject <- as.data.frame.table(UCBAdmissions["Rejected", , ])$Freq
UCB$Gender <- relevel(UCB$Gender, ref="Male")
## Add further columns total and p (proportion admitted)
UCB$total <- UCB$admit + UCB$reject
UCB$p <- UCB$admit/UCB$total
UCB.glm <- glm(p ~ Dept*Gender, family=binomial,
data=UCB, weights=total)
anova(UCB.glm, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: p
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 11 877
## Dept 5 855 6 22 <2e-16
## Gender 1 2 5 20 0.2159
## Dept:Gender 5 20 0 0 0.0011
summary(UCB.glm)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.49212 0.07175 6.8589 6.941e-12
## DeptB 0.04163 0.11319 0.3678 7.130e-01
## DeptC -1.02764 0.13550 -7.5842 3.345e-14
## DeptD -1.19608 0.12641 -9.4622 3.016e-21
## DeptE -1.44908 0.17681 -8.1956 2.493e-16
## DeptF -3.26187 0.23120 -14.1087 3.359e-45
## GenderFemale 1.05208 0.26271 4.0047 6.209e-05
## DeptB:GenderFemale -0.83205 0.51039 -1.6302 1.031e-01
## DeptC:GenderFemale -1.17700 0.29956 -3.9291 8.526e-05
## DeptD:GenderFemale -0.97009 0.30262 -3.2056 1.348e-03
## DeptE:GenderFemale -1.25226 0.33032 -3.7910 1.500e-04
## DeptF:GenderFemale -0.86318 0.40267 -2.1437 3.206e-02
## Sec 8.4: Poisson and Quasi-Poisson Regression
## ss 8.4.1: Data on aberrant crypt foci
## Footnote Code
## Plot count vs endtime: data frame ACF1 (DAAG)
plot(count ~ endtime, data=ACF1, pch=16, log="x")
summary(ACF.glm0 <- glm(formula = count ~ endtime,
family = poisson, data = ACF1))
##
## Call:
## glm(formula = count ~ endtime, family = poisson, data = ACF1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4620 -0.4785 -0.0794 0.3816 2.2633
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3215 0.4005 -0.80 0.42
## endtime 0.1192 0.0264 4.51 6.4e-06
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 51.105 on 21 degrees of freedom
## Residual deviance: 28.369 on 20 degrees of freedom
## AIC: 92.21
##
## Number of Fisher Scoring iterations: 5
summary(ACF.glm <- glm(formula = count ~ endtime + I(endtime^2),
family = poisson, data = ACF1))
##
## Call:
## glm(formula = count ~ endtime + I(endtime^2), family = poisson,
## data = ACF1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.062 -0.783 -0.281 0.451 2.169
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.72236 1.09249 1.58 0.115
## endtime -0.26236 0.19969 -1.31 0.189
## I(endtime^2) 0.01514 0.00795 1.90 0.057
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 51.105 on 21 degrees of freedom
## Residual deviance: 24.515 on 19 degrees of freedom
## AIC: 90.35
##
## Number of Fisher Scoring iterations: 5
(etime <- unique(ACF1$endtime))
## [1] 6 12 18
exp(-0.3215 + 0.1192*etime) # Linear term only
## [1] 1.482 3.031 6.197
exp(1.72235 - 0.26235*etime + 0.01514*etime^2) # Quadratic model
## [1] 2.000 2.126 6.722
sum(resid(ACF.glm, type="pearson")^2)/19
## [1] 1.254
ACFq.glm <- glm(formula = count ~ endtime,
family = quasipoisson, data = ACF1)
summary(ACFq.glm)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3215 0.45532 -0.7062 0.4882392
## endtime 0.1192 0.03004 3.9679 0.0007584
sapply(split(residuals(ACFq.glm), ACF1$endtime), var)
## 6 12 18
## 1.0958 1.9793 0.3651
fligner.test(resid(ACFq.glm) ~ factor(ACF1$endtime))
##
## Fligner-Killeen test of homogeneity of variances
##
## data: resid(ACFq.glm) by factor(ACF1$endtime)
## Fligner-Killeen:med chi-squared = 2.175, df = 2, p-value = 0.337
## ss 8.4.2: Moth habitat example
## Footnote Code
## Number of moths by habitat: data frame moths (DAAG)
rbind(Number=table(moths[, 4]), sapply(split(moths[, -4], moths$habitat),
apply, 2, sum))
## Bank Disturbed Lowerside NEsoak NWsoak SEsoak SWsoak Upperside
## Number 1 7 9 6 3 7 3 5
## meters 21 49 191 254 65 193 116 952
## A 0 8 41 14 71 37 20 28
## P 4 33 17 14 19 6 48 8
library(lattice)
dotplot(habitat ~ A, data=moths, xlab="Number of moths (species A)",
panel=function(x, y, ...){
panel.dotplot(x,y, pch=1, col="black", ...)
av <- sapply(split(x,y),mean);
ypos <- unique(y)
lpoints(ypos~av, pch=3, col="black", cex=1.25)
},
key=list(text=list(c("Individual transects", "Mean")),
points=list(pch=c(1,3), cex=c(1,1.25), col=c("black","gray45")),
columns=2))
## An unsatisfactory choice of reference level
summary(A.glm <- glm(A ~ habitat + log(meters),
family = quasipoisson, data = moths))
##
## Call:
## glm(formula = A ~ habitat + log(meters), family = quasipoisson,
## data = moths)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.556 -1.463 -0.001 0.925 3.082
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.696 2096.588 -0.01 0.99
## habitatDisturbed 15.622 2096.588 0.01 0.99
## habitatLowerside 16.906 2096.588 0.01 0.99
## habitatNEsoak 16.084 2096.588 0.01 0.99
## habitatNWsoak 18.468 2096.588 0.01 0.99
## habitatSEsoak 16.968 2096.588 0.01 0.99
## habitatSWsoak 17.137 2096.588 0.01 0.99
## habitatUpperside 16.743 2096.588 0.01 0.99
## log(meters) 0.129 0.153 0.85 0.40
##
## (Dispersion parameter for quasipoisson family taken to be 2.701)
##
## Null deviance: 257.108 on 40 degrees of freedom
## Residual deviance: 93.991 on 32 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 13
subset(moths, habitat=="Bank")
## meters A P habitat
## 40 21 0 4 Bank
## Footnote Code
## Analysis with tighter convergence criterion
A.glm <- update(A.glm, epsilon=1e-10)
summary(A.glm)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.6958 2.554e+04 -0.0008103 0.9994
## habitatDisturbed 20.6224 2.554e+04 0.0008074 0.9994
## habitatLowerside 21.9057 2.554e+04 0.0008576 0.9993
## habitatNEsoak 21.0840 2.554e+04 0.0008255 0.9993
## habitatNWsoak 23.4681 2.554e+04 0.0009188 0.9993
## habitatSEsoak 21.9677 2.554e+04 0.0008601 0.9993
## habitatSWsoak 22.1371 2.554e+04 0.0008667 0.9993
## habitatUpperside 21.7433 2.554e+04 0.0008513 0.9993
## log(meters) 0.1292 1.528e-01 0.8454404 0.4041
head(summary(A.glm)$coefficients, 3)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.70 25542 -0.0008103 0.9994
## habitatDisturbed 20.62 25542 0.0008074 0.9994
## habitatLowerside 21.91 25542 0.0008576 0.9993
## SEs of predicted values
A.se <- predict(A.glm, se=T)$se.fit
A.se[moths$habitat=="Bank"]
## 40
## 25542
range(A.se[moths$habitat!="Bank"])
## [1] 0.1969 0.6312
## A more satisfactory choice of reference level
moths$habitat <- relevel(moths$habitat, ref="Lowerside")
summary(A.glm <- glm(A ~ habitat + log(meters),
family=quasipoisson, data=moths))
##
## Call:
## glm(formula = A ~ habitat + log(meters), family = quasipoisson,
## data = moths)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.556 -1.463 -0.001 0.925 3.082
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.2098 0.4553 2.66 0.012
## habitatBank -16.9057 2096.5884 -0.01 0.994
## habitatDisturbed -1.2832 0.6474 -1.98 0.056
## habitatNEsoak -0.8217 0.5370 -1.53 0.136
## habitatNWsoak 1.5624 0.3343 4.67 5.1e-05
## habitatSEsoak 0.0621 0.3852 0.16 0.873
## habitatSWsoak 0.2314 0.4781 0.48 0.632
## habitatUpperside -0.1623 0.5843 -0.28 0.783
## log(meters) 0.1292 0.1528 0.85 0.404
##
## (Dispersion parameter for quasipoisson family taken to be 2.701)
##
## Null deviance: 257.108 on 40 degrees of freedom
## Residual deviance: 93.991 on 32 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 13
## The comparison between {Bank} and other habitats
## Examine change in deviance from merging Bank with NWsoak
habitatNW <- moths$habitat
habitatNW[habitatNW=="Bank"] <- "NWsoak"
habitatNW <- factor(habitatNW) # NB: Bank and NWsoak
table(habitatNW)
## habitatNW
## Lowerside Disturbed NEsoak NWsoak SEsoak SWsoak Upperside
## 9 7 6 4 7 3 5
ANW.glm <- glm(A ~ habitatNW + log(meters), family = quasipoisson,
data=moths)
anova(A.glm, ANW.glm, test="F")
## Analysis of Deviance Table
##
## Model 1: A ~ habitat + log(meters)
## Model 2: A ~ habitatNW + log(meters)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 32 94
## 2 33 135 -1 -40.9 15.1 0.00047
habitatD <- moths$habitat
habitatD[habitatD=="Bank"] <- "Disturbed"
habitatD <- factor(habitatD)
A.glm <- glm(A ~ habitat + log(meters), family = quasipoisson,
data=moths)
AD.glm <- glm(A ~ habitatD + log(meters), family = quasipoisson,
data=moths)
anova(A.glm, AD.glm, test="F")
## Analysis of Deviance Table
##
## Model 1: A ~ habitat + log(meters)
## Model 2: A ~ habitatD + log(meters)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 32 94.0
## 2 33 96.5 -1 -2.52 0.93 0.34
## Diagnostic plots
A1.glm <- glm(formula = A ~ habitat + log(meters), family = quasipoisson,
data = moths, subset=habitat!="Bank")
par(mfrow=c(1,4), pty="s")
plot(A1.glm, panel=panel.smooth)
par(mfrow=c(1,1), pty="m")
## Sec 8.5: Additional Notes on Generalized Linear Models
## ss 8.5.1: $\!\!^{*}$ Residuals, and estimating the dispersion
## Other choices of link function for binomial models
## Quasi-binomial and quasi-poisson models
## ss 8.5.2: Standard errors and $z$- or $t$-statistics for binomial models
fac <- factor(LETTERS[1:4])
p <- c(103, 30, 11, 3)/500
n <- rep(500,4)
summary(glm(p ~ fac, family=binomial, weights=n))$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.349 0.1106 -12.201 3.057e-34
## facB -1.402 0.2184 -6.422 1.349e-10
## facC -2.445 0.3243 -7.540 4.710e-14
## facD -3.761 0.5896 -6.379 1.782e-10
## ss 8.5.3: Leverage for binomial models
## Sec 8.6: Models with an Ordered Categorical or Categorical Response
## ss 8.6.1: Ordinal Regression Models
## Exploratory analysis
## $\!\!^{*}$ Proportional odds logistic regression
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:DAAG':
##
## hills
inhaler <- data.frame(freq=c(99,76,41,55,2,13),
choice=rep(c("inh1","inh2"), 3),
ease=ordered(rep(c("easy","re-read",
"unclear"), rep(2,3))))
inhaler1.polr <- polr(ease ~ 1, weights=freq, data=inhaler,
Hess=TRUE, subset=inhaler$choice=="inh1")
# Setting Hess=TRUE at this point averts possible numerical
# problems if this calculation is deferred until later.
inhaler2.polr <- polr(ease ~ 1, weights=freq, data=inhaler,
Hess=TRUE, subset=inhaler$choice=="inh2")
summary(inhaler1.polr) # inhaler$choice == "inh1"
## Call:
## polr(formula = ease ~ 1, data = inhaler, weights = freq, subset = inhaler$choice ==
## "inh1", Hess = TRUE)
##
## No coefficients
##
## Intercepts:
## Value Std. Error t value
## easy|re-read 0.834 0.183 4.566
## re-read|unclear 4.248 0.712 5.966
##
## Residual Deviance: 190.34
## AIC: 194.34
summary(inhaler2.polr) # inhaler$choice == "inh2"
## Call:
## polr(formula = ease ~ 1, data = inhaler, weights = freq, subset = inhaler$choice ==
## "inh2", Hess = TRUE)
##
## No coefficients
##
## Intercepts:
## Value Std. Error t value
## easy|re-read 0.111 0.167 0.665
## re-read|unclear 2.310 0.291 7.944
##
## Residual Deviance: 265.54
## AIC: 269.54
summary(inhaler.polr <- polr(ease ~ choice, weights=freq,
Hess=TRUE, data=inhaler))
## Call:
## polr(formula = ease ~ choice, data = inhaler, weights = freq,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## choiceinh2 0.79 0.245 3.23
##
## Intercepts:
## Value Std. Error t value
## easy|re-read 0.863 0.181 4.764
## re-read|unclear 3.353 0.307 10.920
##
## Residual Deviance: 459.29
## AIC: 465.29
deviance(inhaler.polr) - (deviance(inhaler1.polr)
+ deviance(inhaler2.polr))
## [1] 3.416
summary(inhaler.polr)
## Call:
## polr(formula = ease ~ choice, data = inhaler, weights = freq,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## choiceinh2 0.79 0.245 3.23
##
## Intercepts:
## Value Std. Error t value
## easy|re-read 0.863 0.181 4.764
## re-read|unclear 3.353 0.307 10.920
##
## Residual Deviance: 459.29
## AIC: 465.29
## ss 8.6.2: $\!\!^{*}$ Loglinear Models
library(MASS)
example(loglm)
##
## loglm> # The data frames Cars93, minn38 and quine are available
## loglm> # in the MASS package.
## loglm>
## loglm> # Case 1: frequencies specified as an array.
## loglm> sapply(minn38, function(x) length(levels(x)))
## hs phs fol sex f
## 3 4 7 2 0
##
## loglm> ## hs phs fol sex f
## loglm> ## 3 4 7 2 0
## loglm> ##minn38a <- array(0, c(3,4,7,2), lapply(minn38[, -5], levels))
## loglm> ##minn38a[data.matrix(minn38[,-5])] <- minn38$f
## loglm>
## loglm> ## or more simply
## loglm> minn38a <- xtabs(f ~ ., minn38)
##
## loglm> fm <- loglm(~ 1 + 2 + 3 + 4, minn38a) # numerals as names.
##
## loglm> deviance(fm)
## [1] 3712
##
## loglm> ## [1] 3711.9
## loglm> fm1 <- update(fm, .~.^2)
##
## loglm> fm2 <- update(fm, .~.^3, print = TRUE)
## 5 iterations: deviation 0.07512
##
## loglm> ## 5 iterations: deviation 0.075
## loglm> anova(fm, fm1, fm2)
## LR tests for hierarchical log-linear models
##
## Model 1:
## ~1 + 2 + 3 + 4
## Model 2:
## . ~ `1` + `2` + `3` + `4` + `1`:`2` + `1`:`3` + `1`:`4` + `2`:`3` + `2`:`4` + `3`:`4`
## Model 3:
## . ~ `1` + `2` + `3` + `4` + `1`:`2` + `1`:`3` + `1`:`4` + `2`:`3` + `2`:`4` + `3`:`4` + `1`:`2`:`3` + `1`:`2`:`4` + `1`:`3`:`4` + `2`:`3`:`4`
##
## Deviance df Delta(Dev) Delta(df) P(> Delta(Dev)
## Model 1 3711.91 155
## Model 2 220.04 108 3491.87 47 0.00000
## Model 3 47.74 36 172.30 72 0.00000
## Saturated 0.00 0 47.74 36 0.09114
##
## loglm> # Case 1. An array generated with xtabs.
## loglm>
## loglm> loglm(~ Type + Origin, xtabs(~ Type + Origin, Cars93))
## Call:
## loglm(formula = ~Type + Origin, data = xtabs(~Type + Origin,
## Cars93))
##
## Statistics:
## X^2 df P(> X^2)
## Likelihood Ratio 18.36 5 0.002526
## Pearson 14.08 5 0.015110
##
## loglm> # Case 2. Frequencies given as a vector in a data frame
## loglm> names(quine)
## [1] "Eth" "Sex" "Age" "Lrn" "Days"
##
## loglm> ## [1] "Eth" "Sex" "Age" "Lrn" "Days"
## loglm> fm <- loglm(Days ~ .^2, quine)
##
## loglm> gm <- glm(Days ~ .^2, poisson, quine) # check glm.
##
## loglm> c(deviance(fm), deviance(gm)) # deviances agree
## [1] 1369 1369
##
## loglm> ## [1] 1368.7 1368.7
## loglm> c(fm$df, gm$df) # resid df do not!
## [1] 127
##
## loglm> c(fm$df, gm$df.residual) # resid df do not!
## [1] 127 128
##
## loglm> ## [1] 127 128
## loglm> # The loglm residual degrees of freedom is wrong because of
## loglm> # a non-detectable redundancy in the model matrix.
## loglm>
## loglm>
## loglm>
## Sec 8.7: Survival analysis
## ss 8.7.1: Analysis of the {Aids2} data
library(MASS)
str(Aids2, vec.len=2)
## 'data.frame': 2843 obs. of 7 variables:
## $ state : Factor w/ 4 levels "NSW","Other",..: 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 ...
## $ diag : int 10905 11029 9551 9577 10015 ...
## $ death : int 11081 11096 9983 9654 10290 ...
## $ status : Factor w/ 2 levels "A","D": 2 2 2 2 2 ...
## $ T.categ: Factor w/ 8 levels "hs","hsid","id",..: 1 1 1 5 1 ...
## $ age : int 35 53 42 44 39 ...
bloodAids <- subset(Aids2, T.categ=="blood")
bloodAids$days <- bloodAids$death-bloodAids$diag
bloodAids$dead <- as.integer(bloodAids$status=="D")
library(survival)
## Loading required package: splines
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:DAAG':
##
## lung
plot(survfit(Surv(days, dead) ~ sex, data=bloodAids),
col=c(2,4), conf.int=TRUE)
## ss 8.7.2: Right censoring prior to the termination of the study
hsaids <- subset(Aids2, sex=="M" & T.categ=="hs")
hsaids$days <- hsaids$death-hsaids$diag
hsaids$dead <- as.integer(hsaids$status=="D")
table(hsaids$status,hsaids$death==11504)
##
## FALSE TRUE
## A 16 916
## D 1531 1
## ss 8.7.3: The survival curve for male homosexuals
## Survival curve for male homosexuals
hsaids.surv <- survfit(Surv(days, dead) ~ 1, data=hsaids)
plot(hsaids.surv)
## ss 8.7.4: Hazard rates
## ss 8.7.5: The Cox proportional hazards model
bloodAids.coxph <- coxph(Surv(days, dead) ~ sex, data=bloodAids)
summary(bloodAids.coxph)
## Call:
## coxph(formula = Surv(days, dead) ~ sex, data = bloodAids)
##
## n= 94, number of events= 76
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sexM -0.276 0.759 0.233 -1.18 0.24
##
## exp(coef) exp(-coef) lower .95 upper .95
## sexM 0.759 1.32 0.481 1.2
##
## Concordance= 0.52 (se = 0.032 )
## Rsquare= 0.015 (max possible= 0.998 )
## Likelihood ratio test= 1.38 on 1 df, p=0.239
## Wald test = 1.4 on 1 df, p=0.236
## Score (logrank) test = 1.41 on 1 df, p=0.235
cox.zph(bloodAids.coxph)
## rho chisq p
## sexM -0.127 1.21 0.272
plot(cox.zph(bloodAids.coxph))
plot(bloodAids$days, resid(bloodAids.coxph))
lines(lowess(bloodAids$days, resid(bloodAids.coxph)))
## Sec 8.8: Transformations for Count Data
## Sec 8.9: Further Reading
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-2. For overview type 'help("mgcv-package")'.
hand <- with(cricketer, unclass(left)-1) # 0 for left; 1 for right
hand.gam <- gam(hand ~ s(year), data=cricketer)
plot(hand.gam)
ndf <- data.frame(year=seq(from=1840, to=1960, by=2))
fit <- predict(hand.gam, newdata=ndf, type="response", se.fit=TRUE)
plot(fit$fit ~ ndf$year, ylim=range(fit), type="l")
library(survival)
coxph(Surv(life, dead) ~ left + poly(year, 4), data = cricketer)
## Call:
## coxph(formula = Surv(life, dead) ~ left + poly(year, 4), data = cricketer)
##
##
## coef exp(coef) se(coef) z p
## leftleft 0.044 1.05e+00 0.0442 0.998 3.2e-01
## poly(year, 4)1 -68.477 1.82e-30 6.7158 -10.196 0.0e+00
## poly(year, 4)2 -25.029 1.35e-11 6.0217 -4.156 3.2e-05
## poly(year, 4)3 -3.215 4.02e-02 4.3242 -0.743 4.6e-01
## poly(year, 4)4 3.159 2.35e+01 2.5087 1.259 2.1e-01
##
## Likelihood ratio test=526 on 5 df, p=0 n= 5960, number of events= 3387