## Chapter 8: {G}eneralizedLinear Models, and Survival Analysis ## Sec 8.1: Generalized Linear Models ## ss 8.1.1: Transformation of the expected value on the left ## Footnote Code ## Simplified plot showing the logit link function p <- (1:999)/1000 gitp <- log(p/(1 - p)) par(pty="s") plot(p, gitp, xlab = "Proportion", ylab = "", type = "l", pch = 1) par(pty="m") ## ss 8.1.2: Noise terms need not be normal ## ss 8.1.3: Log odds in contingency tables ## ss 8.1.4: Logistic regression with a continuous explanatory variable library(DAAG) anestot <- aggregate(anesthetic[, c("move","nomove")], by=list(conc=anesthetic$conc), FUN=sum) ## The column 'conc', because from the 'by' list, is then a factor. ## The next line recovers the numeric values anestot$conc <- as.numeric(as.character(anestot$conc)) anestot$total <- apply(anestot[, c("move","nomove")], 1 , sum) anestot$prop <- anestot$nomove/anestot$total ## Footnote Code ## Plot proportion moving vs conc: data frame anesthetic (DAAG) plot(prop ~ conc, data=anestot, xlab = "Concentration", ylab = "Proportion", xlim = c(.5,2.5), ylim = c(0, 1), pch = 16) with(anestot, {text(conc, prop, paste(total), pos=2) abline(h=sum(nomove)/sum(total), lty=2)}) ## Fit model directly to the 0/1 data in nomove anes.logit <- glm(nomove ~ conc, family=binomial(link="logit"), data=anesthetic) ## Fit model to the proportions; supply total numbers as weights anes1.logit <- glm(prop ~ conc, family=binomial(link="logit"), weights=total, data=anestot) ## Footnote Code ## Graphical summary of logistic regression results anestot$emplogit <- with(anestot, log((nomove+0.5)/(move+0.5))) plot(emplogit ~ conc, data=anestot, xlab = "Concentration", xlim = c(0, 2.75), xaxs="i", ylab = "Empirical logit", ylim=c(-2, 2.4), cex=1.5, pch = 16) with(anestot, text(conc, emplogit, paste(round(prop,2)), pos=c(2,4,2,4,4,4))) abline(anes.logit) summary(anes.logit) ## Sec 8.2: Logistic Multiple Regression ## Footnote Code ## Presence/absence information: data frame frogs (DAAGS) plot(northing ~ easting, data=frogs, pch=c(1,16)[frogs$pres.abs+1], xlab="Meters east of reference point", ylab="Meters north", asp=1) ## Footnote Code ## Pairs plot; frogs data pairs(frogs[, c(5:10,4)], oma=c(2,2,2,2), cex=0.5) ## Footnote Code ## Here is code for the top row of plots par(mfrow=c(1,3), pty="s") for(nam in c("distance","NoOfPools","NoOfSites")){ y <- frogs[, nam] plot(density(y), main="", xlab=nam) } # The other rows can be obtained by replacing y by # sqrt(y) or log(y) (or, for NoOfSites, by log(y+1)) par(mfrow=c(1,1), pty="m") ## Footnote Code ## Correlation between altitude and meanmax with(frogs, cor(altitude,meanmax)) with(frogs, cor(cbind(altitude, "meanmax+meanmin" = meanmin+meanmax, "meanmax-meanmin" = meanmax-meanmin))) ## Footnote Code ## ss 8.2.1: Selection of model terms, and fitting the model summary(frogs.glm0 <- glm(formula = pres.abs ~ log(distance) + log(NoOfPools) + NoOfSites + avrain + I(meanmax+meanmin)+I(meanmax-meanmin), family = binomial, data = frogs)) frogs.glm <- glm(pres.abs ~ log(distance) + log(NoOfPools) + I(meanmax+meanmin)+ I(meanmax-meanmin), family = binomial, data = frogs) summary(frogs.glm) ## ss 8.2.2: Fitted values fitted(frogs.glm) # Fitted values' scale of response fitresp <- predict(frogs.glm, type="response") # Same as fitted(frogs.glm) fitlp <- predict(frogs.glm, type="link") # Scale of linear predictor ## For approximate SEs, specify fitlpWithSE <- predict(frogs.glm, type="link", se.fit=TRUE) ## ss 8.2.3: A plot of contributions of explanatory variables ## Footnote Code ## For binary response data, plots of residuals are not insightful ## To see what they look like, try: par(mfrow=c(1,4), pty="s") termplot(frogs.glm, data=frogs, partial.resid=TRUE) par(mfrow=c(1,1), pty="m") par(mfrow=c(1,4), pty="s") frogs$maxminSum <- with(frogs, meanmax+meanmin) frogs$maxminDiff <- with(frogs, meanmax-meanmin) frogs.glm <- glm(pres.abs ~ log(distance) + log(NoOfPools) + maxminSum + maxminDiff, family = binomial, data = frogs) termplot(frogs.glm) par(mfrow=c(1,1), pty="s") ## ss 8.2.4: Cross-validation estimates of predictive accuracy ## Footnote Code ## The cross-validated estimate of accuracy is stored in the list ## element acc.cv, in the output from the function CVbinary(), thus: for (j in 1:4){ randsam <- sample(1:10, 212, replace=TRUE) initial.acc <- CVbinary(frogs.glm0, rand=randsam, print.details=FALSE)$acc.cv reduced.acc <- CVbinary(frogs.glm, rand=randsam, print.details=FALSE)$acc.cv cat("Initial:", round(initial.acc,3), " Reduced:", round(reduced.acc,3), "\n") } ## Sec 8.3: Logistic Models for Categorical Data -- an Example ## Create data frame from multi-way table UCBAdmissions (datasets) dimnames(UCBAdmissions) # Check levels of table margins UCB <- as.data.frame.table(UCBAdmissions["Admitted", , ]) names(UCB)[3] <- "admit" UCB$reject <- as.data.frame.table(UCBAdmissions["Rejected", , ])$Freq UCB$Gender <- relevel(UCB$Gender, ref="Male") ## Add further columns total and p (proportion admitted) UCB$total <- UCB$admit + UCB$reject UCB$p <- UCB$admit/UCB$total UCB.glm <- glm(p ~ Dept*Gender, family=binomial, data=UCB, weights=total) anova(UCB.glm, test="Chisq") summary(UCB.glm)$coef ## Sec 8.4: Poisson and Quasi-Poisson Regression ## ss 8.4.1: Data on aberrant crypt foci ## Footnote Code ## Plot count vs endtime: data frame ACF1 (DAAG) plot(count ~ endtime, data=ACF1, pch=16, log="x") summary(ACF.glm0 <- glm(formula = count ~ endtime, family = poisson, data = ACF1)) summary(ACF.glm <- glm(formula = count ~ endtime + I(endtime^2), family = poisson, data = ACF1)) (etime <- unique(ACF1$endtime)) exp(-0.3215 + 0.1192*etime) # Linear term only exp(1.72235 - 0.26235*etime + 0.01514*etime^2) # Quadratic model sum(resid(ACF.glm, type="pearson")^2)/19 ACFq.glm <- glm(formula = count ~ endtime, family = quasipoisson, data = ACF1) summary(ACFq.glm)$coef sapply(split(residuals(ACFq.glm), ACF1$endtime), var) fligner.test(resid(ACFq.glm) ~ factor(ACF1$endtime)) ## ss 8.4.2: Moth habitat example ## Footnote Code ## Number of moths by habitat: data frame moths (DAAG) rbind(Number=table(moths[, 4]), sapply(split(moths[, -4], moths$habitat), apply, 2, sum)) library(lattice) dotplot(habitat ~ A, data=moths, xlab="Number of moths (species A)", panel=function(x, y, ...){ panel.dotplot(x,y, pch=1, col="black", ...) av <- sapply(split(x,y),mean); ypos <- unique(y) lpoints(ypos~av, pch=3, col="black", cex=1.25) }, key=list(text=list(c("Individual transects", "Mean")), points=list(pch=c(1,3), cex=c(1,1.25), col=c("black","gray45")), columns=2)) ## An unsatisfactory choice of reference level summary(A.glm <- glm(A ~ habitat + log(meters), family = quasipoisson, data = moths)) subset(moths, habitat=="Bank") ## Footnote Code ## Analysis with tighter convergence criterion A.glm <- update(A.glm, epsilon=1e-10) summary(A.glm)$coef head(summary(A.glm)$coefficients, 3) ## SEs of predicted values A.se <- predict(A.glm, se=T)$se.fit A.se[moths$habitat=="Bank"] range(A.se[moths$habitat!="Bank"]) ## A more satisfactory choice of reference level moths$habitat <- relevel(moths$habitat, ref="Lowerside") summary(A.glm <- glm(A ~ habitat + log(meters), family=quasipoisson, data=moths)) ## The comparison between {Bank} and other habitats ## Examine change in deviance from merging Bank with NWsoak habitatNW <- moths$habitat habitatNW[habitatNW=="Bank"] <- "NWsoak" habitatNW <- factor(habitatNW) # NB: Bank and NWsoak table(habitatNW) ANW.glm <- glm(A ~ habitatNW + log(meters), family = quasipoisson, data=moths) anova(A.glm, ANW.glm, test="F") habitatD <- moths$habitat habitatD[habitatD=="Bank"] <- "Disturbed" habitatD <- factor(habitatD) A.glm <- glm(A ~ habitat + log(meters), family = quasipoisson, data=moths) AD.glm <- glm(A ~ habitatD + log(meters), family = quasipoisson, data=moths) anova(A.glm, AD.glm, test="F") ## Diagnostic plots A1.glm <- glm(formula = A ~ habitat + log(meters), family = quasipoisson, data = moths, subset=habitat!="Bank") par(mfrow=c(1,4), pty="s") plot(A1.glm, panel=panel.smooth) par(mfrow=c(1,1), pty="m") ## Sec 8.5: Additional Notes on Generalized Linear Models ## ss 8.5.1: $\!\!^{*}$ Residuals, and estimating the dispersion ## Other choices of link function for binomial models ## Quasi-binomial and quasi-poisson models ## ss 8.5.2: Standard errors and $z$- or $t$-statistics for binomial models fac <- factor(LETTERS[1:4]) p <- c(103, 30, 11, 3)/500 n <- rep(500,4) summary(glm(p ~ fac, family=binomial, weights=n))$coef ## ss 8.5.3: Leverage for binomial models ## Sec 8.6: Models with an Ordered Categorical or Categorical Response ## ss 8.6.1: Ordinal Regression Models ## Exploratory analysis ## $\!\!^{*}$ Proportional odds logistic regression library(MASS) inhaler <- data.frame(freq=c(99,76,41,55,2,13), choice=rep(c("inh1","inh2"), 3), ease=ordered(rep(c("easy","re-read", "unclear"), rep(2,3)))) inhaler1.polr <- polr(ease ~ 1, weights=freq, data=inhaler, Hess=TRUE, subset=inhaler$choice=="inh1") # Setting Hess=TRUE at this point averts possible numerical # problems if this calculation is deferred until later. inhaler2.polr <- polr(ease ~ 1, weights=freq, data=inhaler, Hess=TRUE, subset=inhaler$choice=="inh2") summary(inhaler1.polr) # inhaler$choice == "inh1" summary(inhaler2.polr) # inhaler$choice == "inh2" summary(inhaler.polr <- polr(ease ~ choice, weights=freq, Hess=TRUE, data=inhaler)) deviance(inhaler.polr) - (deviance(inhaler1.polr) + deviance(inhaler2.polr)) summary(inhaler.polr) ## ss 8.6.2: $\!\!^{*}$ Loglinear Models library(MASS) example(loglm) ## Sec 8.7: Survival analysis ## ss 8.7.1: Analysis of the {Aids2} data library(MASS) str(Aids2, vec.len=2) bloodAids <- subset(Aids2, T.categ=="blood") bloodAids$days <- bloodAids$death-bloodAids$diag bloodAids$dead <- as.integer(bloodAids$status=="D") library(survival) plot(survfit(Surv(days, dead) ~ sex, data=bloodAids), col=c(2,4), conf.int=TRUE) ## ss 8.7.2: Right censoring prior to the termination of the study hsaids <- subset(Aids2, sex=="M" & T.categ=="hs") hsaids$days <- hsaids$death-hsaids$diag hsaids$dead <- as.integer(hsaids$status=="D") table(hsaids$status,hsaids$death==11504) ## ss 8.7.3: The survival curve for male homosexuals ## Survival curve for male homosexuals hsaids.surv <- survfit(Surv(days, dead) ~ 1, data=hsaids) plot(hsaids.surv) ## ss 8.7.4: Hazard rates ## ss 8.7.5: The Cox proportional hazards model bloodAids.coxph <- coxph(Surv(days, dead) ~ sex, data=bloodAids) summary(bloodAids.coxph) cox.zph(bloodAids.coxph) plot(cox.zph(bloodAids.coxph)) plot(bloodAids$days, resid(bloodAids.coxph)) lines(lowess(bloodAids$days, resid(bloodAids.coxph))) ## Sec 8.8: Transformations for Count Data ## Sec 8.9: Further Reading library(mgcv) hand <- with(cricketer, unclass(left)-1) # 0 for left; 1 for right hand.gam <- gam(hand ~ s(year), data=cricketer) plot(hand.gam) ndf <- data.frame(year=seq(from=1840, to=1960, by=2)) fit <- predict(hand.gam, newdata=ndf, type="response", se.fit=TRUE) plot(fit$fit ~ ndf$year, ylim=range(fit), type="l") library(survival) coxph(Surv(life, dead) ~ left + poly(year, 4), data = cricketer)