## Chapter 8: {G}eneralizedLinear Models, and Survival Analysis 

## Sec 8.1: Generalized Linear Models 
## ss 8.1.1: Transformation of the expected value on the left 
## Footnote Code
## Simplified plot showing the logit link function 
p <- (1:999)/1000 
gitp <- log(p/(1 - p)) 
par(pty="s")
plot(p, gitp, xlab = "Proportion", ylab = "", type = "l", pch = 1)
par(pty="m")

## ss 8.1.2: Noise terms need not be normal 
## ss 8.1.3: Log odds in contingency tables 
## ss 8.1.4: Logistic regression with a continuous explanatory variable 
library(DAAG) 
anestot <- aggregate(anesthetic[, c("move","nomove")],  
                     by=list(conc=anesthetic$conc), FUN=sum) 
## The column 'conc', because from the 'by' list, is then a factor. 
## The next line recovers the numeric values 
anestot$conc <- as.numeric(as.character(anestot$conc)) 
anestot$total <- apply(anestot[, c("move","nomove")], 1 , sum) 
anestot$prop <- anestot$nomove/anestot$total 

## Footnote Code
## Plot proportion moving vs conc: data frame anesthetic (DAAG) 
plot(prop ~ conc, data=anestot, xlab = "Concentration",  
     ylab = "Proportion", xlim = c(.5,2.5), ylim = c(0, 1), pch = 16) 
with(anestot, 
     {text(conc, prop, paste(total), pos=2) 
      abline(h=sum(nomove)/sum(total), lty=2)}) 

## Fit model directly to the 0/1 data in nomove 
anes.logit <- glm(nomove ~ conc, family=binomial(link="logit"), 
                  data=anesthetic) 
## Fit model to the proportions; supply total numbers as weights 
anes1.logit <- glm(prop ~ conc, family=binomial(link="logit"), 
                   weights=total, data=anestot) 

## Footnote Code
## Graphical summary of logistic regression results 
anestot$emplogit <- with(anestot, log((nomove+0.5)/(move+0.5))) 
plot(emplogit ~ conc, data=anestot, 
     xlab = "Concentration", xlim = c(0, 2.75), xaxs="i", 
     ylab = "Empirical logit", ylim=c(-2, 2.4), cex=1.5, pch = 16) 
with(anestot, text(conc, emplogit, paste(round(prop,2)), pos=c(2,4,2,4,4,4))) 
abline(anes.logit) 

summary(anes.logit) 


## Sec 8.2: Logistic Multiple Regression 
## Footnote Code
## Presence/absence information: data frame frogs (DAAGS) 
plot(northing ~ easting, data=frogs, pch=c(1,16)[frogs$pres.abs+1], 
     xlab="Meters east of reference point", ylab="Meters north", asp=1) 

## Footnote Code
## Pairs plot; frogs data 
pairs(frogs[, c(5:10,4)], oma=c(2,2,2,2), cex=0.5) 

## Footnote Code
## Here is code for the top row of plots 
par(mfrow=c(1,3), pty="s") 
for(nam in c("distance","NoOfPools","NoOfSites")){ 
    y <- frogs[, nam] 
    plot(density(y), main="", xlab=nam) 
    } 
  # The other rows can be obtained by replacing y by 
  # sqrt(y) or log(y) (or, for NoOfSites, by log(y+1)) 
par(mfrow=c(1,1), pty="m") 

## Footnote Code
## Correlation between altitude and meanmax 
with(frogs, cor(altitude,meanmax)) 

with(frogs, cor(cbind(altitude,  
                      "meanmax+meanmin" = meanmin+meanmax,  
                      "meanmax-meanmin" = meanmax-meanmin))) 

## Footnote Code
## ss 8.2.1: Selection of model terms, and fitting the model 
summary(frogs.glm0 <- glm(formula = pres.abs ~ log(distance) + 
                          log(NoOfPools) + NoOfSites + avrain +  
                          I(meanmax+meanmin)+I(meanmax-meanmin), 
                          family = binomial, data = frogs)) 

frogs.glm <- glm(pres.abs ~ log(distance) + log(NoOfPools) + 
                 I(meanmax+meanmin)+ I(meanmax-meanmin),  
                 family = binomial, data = frogs) 
summary(frogs.glm) 

## ss 8.2.2: Fitted values 
fitted(frogs.glm)    # Fitted values' scale of response 
fitresp <- predict(frogs.glm, type="response")  # Same as fitted(frogs.glm) 
fitlp <- predict(frogs.glm, type="link")      # Scale of linear predictor 
## For approximate SEs, specify 
fitlpWithSE <- predict(frogs.glm, type="link", se.fit=TRUE) 

## ss 8.2.3: A plot of contributions of explanatory variables 
## Footnote Code
## For binary response data, plots of residuals are not insightful 
## To see what they look like, try: 
par(mfrow=c(1,4), pty="s")
termplot(frogs.glm, data=frogs, partial.resid=TRUE) 
par(mfrow=c(1,1), pty="m")

par(mfrow=c(1,4), pty="s") 
frogs$maxminSum <- with(frogs, meanmax+meanmin) 
frogs$maxminDiff <- with(frogs, meanmax-meanmin) 
frogs.glm <- glm(pres.abs ~ log(distance) + log(NoOfPools) +  
                 maxminSum + maxminDiff, family = binomial,  
                 data = frogs) 
termplot(frogs.glm) 
par(mfrow=c(1,1), pty="s") 

## ss 8.2.4: Cross-validation estimates of predictive accuracy 
## Footnote Code
## The cross-validated estimate of accuracy is stored in the list  
## element acc.cv, in the output from the function CVbinary(), thus: 
for (j in 1:4){ 
     randsam <- sample(1:10, 212, replace=TRUE) 
     initial.acc <- CVbinary(frogs.glm0, rand=randsam,  
                             print.details=FALSE)$acc.cv 
     reduced.acc <- CVbinary(frogs.glm, rand=randsam, 
                             print.details=FALSE)$acc.cv 
     cat("Initial:", round(initial.acc,3), "  Reduced:", round(reduced.acc,3), "\n") 
     } 


## Sec 8.3: Logistic Models for Categorical Data -- an Example 
## Create data frame from multi-way table UCBAdmissions (datasets) 
dimnames(UCBAdmissions)  # Check levels of table margins 
UCB <- as.data.frame.table(UCBAdmissions["Admitted", , ]) 
names(UCB)[3] <- "admit" 
UCB$reject <- as.data.frame.table(UCBAdmissions["Rejected", , ])$Freq 
UCB$Gender <- relevel(UCB$Gender, ref="Male") 
## Add further columns total and p (proportion admitted) 
UCB$total <- UCB$admit + UCB$reject 
UCB$p <- UCB$admit/UCB$total 

UCB.glm <- glm(p ~ Dept*Gender, family=binomial,  
               data=UCB, weights=total) 
anova(UCB.glm, test="Chisq") 

summary(UCB.glm)$coef 


## Sec 8.4: Poisson and Quasi-Poisson Regression 
## ss 8.4.1: Data on aberrant crypt foci 
## Footnote Code
## Plot count vs endtime: data frame ACF1 (DAAG) 
plot(count ~ endtime, data=ACF1, pch=16, log="x") 

summary(ACF.glm0 <- glm(formula = count ~ endtime,  
                         family = poisson, data = ACF1)) 

summary(ACF.glm <- glm(formula = count ~ endtime + I(endtime^2), 
                       family = poisson, data = ACF1)) 

(etime <- unique(ACF1$endtime)) 
exp(-0.3215 + 0.1192*etime)  # Linear term only 
exp(1.72235 - 0.26235*etime + 0.01514*etime^2)  # Quadratic model 

sum(resid(ACF.glm, type="pearson")^2)/19 

ACFq.glm <- glm(formula = count ~ endtime, 
                family = quasipoisson, data = ACF1) 
summary(ACFq.glm)$coef 

sapply(split(residuals(ACFq.glm), ACF1$endtime), var) 

fligner.test(resid(ACFq.glm) ~ factor(ACF1$endtime)) 

## ss 8.4.2: Moth habitat example 
## Footnote Code
## Number of moths by habitat: data frame moths (DAAG) 
rbind(Number=table(moths[, 4]), sapply(split(moths[, -4], moths$habitat),  
      apply, 2, sum)) 

library(lattice) 
dotplot(habitat ~ A, data=moths, xlab="Number of moths (species A)",  
        panel=function(x, y, ...){  
          panel.dotplot(x,y, pch=1, col="black", ...) 
          av <- sapply(split(x,y),mean);
          ypos <- unique(y)
          lpoints(ypos~av, pch=3, col="black", cex=1.25)          
          },  
        key=list(text=list(c("Individual transects", "Mean")), 
          points=list(pch=c(1,3), cex=c(1,1.25), col=c("black","gray45")), 
          columns=2)) 

##            An unsatisfactory choice of reference level 
summary(A.glm <- glm(A ~ habitat + log(meters),  
                     family = quasipoisson, data = moths)) 

subset(moths, habitat=="Bank") 

## Footnote Code
## Analysis with tighter convergence criterion 
A.glm <- update(A.glm, epsilon=1e-10) 
summary(A.glm)$coef 
head(summary(A.glm)$coefficients, 3) 

## SEs of predicted values 
A.se <- predict(A.glm, se=T)$se.fit 
A.se[moths$habitat=="Bank"] 
range(A.se[moths$habitat!="Bank"]) 

##           A more satisfactory choice of reference level 
moths$habitat <- relevel(moths$habitat, ref="Lowerside") 
summary(A.glm <- glm(A ~ habitat + log(meters),  
                     family=quasipoisson, data=moths)) 

##          The comparison between {Bank} and other habitats 
## Examine change in deviance from merging Bank with NWsoak 
habitatNW <- moths$habitat   
habitatNW[habitatNW=="Bank"] <- "NWsoak" 
habitatNW <- factor(habitatNW)  # NB: Bank and NWsoak  
table(habitatNW) 
ANW.glm <- glm(A ~ habitatNW + log(meters), family = quasipoisson,  
               data=moths) 
anova(A.glm, ANW.glm, test="F") 

habitatD <- moths$habitat  
habitatD[habitatD=="Bank"] <- "Disturbed"  
habitatD <- factor(habitatD)  
A.glm <- glm(A ~ habitat + log(meters), family = quasipoisson,   
             data=moths)  
AD.glm <- glm(A ~ habitatD + log(meters), family = quasipoisson,   
              data=moths)  
anova(A.glm, AD.glm, test="F")  

##                          Diagnostic plots 
A1.glm <- glm(formula = A ~ habitat + log(meters), family = quasipoisson,  
              data = moths, subset=habitat!="Bank") 
par(mfrow=c(1,4), pty="s")
plot(A1.glm, panel=panel.smooth) 
par(mfrow=c(1,1), pty="m")

## Sec 8.5: Additional Notes on Generalized Linear Models 
## ss 8.5.1: $\!\!^{*}$ Residuals, and estimating the dispersion 
##         Other choices of link function for binomial models 
##              Quasi-binomial and quasi-poisson models 
## ss 8.5.2: Standard errors and $z$- or $t$-statistics for binomial models 
fac <- factor(LETTERS[1:4]) 
p <- c(103, 30, 11, 3)/500 
n <- rep(500,4) 
summary(glm(p ~ fac, family=binomial, weights=n))$coef 

## ss 8.5.3: Leverage for binomial models 

## Sec 8.6: Models with an Ordered Categorical or Categorical Response 
## ss 8.6.1: Ordinal Regression Models 
##                        Exploratory analysis 
##          $\!\!^{*}$ Proportional odds logistic regression 
library(MASS) 
inhaler <-  data.frame(freq=c(99,76,41,55,2,13), 
                       choice=rep(c("inh1","inh2"), 3), 
                       ease=ordered(rep(c("easy","re-read", 
                                          "unclear"), rep(2,3)))) 
inhaler1.polr <-  polr(ease ~ 1, weights=freq, data=inhaler, 
                       Hess=TRUE, subset=inhaler$choice=="inh1")  
 # Setting Hess=TRUE at this point averts possible numerical 
 # problems if this calculation is deferred until later. 
inhaler2.polr <-  polr(ease ~ 1, weights=freq, data=inhaler, 
                       Hess=TRUE, subset=inhaler$choice=="inh2") 

summary(inhaler1.polr)  # inhaler$choice == "inh1" 
summary(inhaler2.polr)  # inhaler$choice == "inh2" 

summary(inhaler.polr <-  polr(ease ~ choice, weights=freq, 
                              Hess=TRUE, data=inhaler)) 

deviance(inhaler.polr) - (deviance(inhaler1.polr)  
                          + deviance(inhaler2.polr)) 

summary(inhaler.polr) 

## ss 8.6.2: $\!\!^{*}$ Loglinear Models 
library(MASS) 
example(loglm) 


## Sec 8.7: Survival analysis 
## ss 8.7.1: Analysis of the {Aids2} data 
library(MASS) 
str(Aids2, vec.len=2) 

bloodAids <- subset(Aids2, T.categ=="blood") 
bloodAids$days <- bloodAids$death-bloodAids$diag 
bloodAids$dead <- as.integer(bloodAids$status=="D") 

library(survival) 
plot(survfit(Surv(days, dead) ~ sex, data=bloodAids),  
     col=c(2,4), conf.int=TRUE) 

## ss 8.7.2: Right censoring prior to the termination of the study 
hsaids <- subset(Aids2, sex=="M" & T.categ=="hs") 
hsaids$days <- hsaids$death-hsaids$diag 
hsaids$dead <- as.integer(hsaids$status=="D") 
table(hsaids$status,hsaids$death==11504)    

## ss 8.7.3: The survival curve for male homosexuals 
## Survival curve for male homosexuals 
hsaids.surv <- survfit(Surv(days, dead) ~ 1, data=hsaids) 
plot(hsaids.surv) 

## ss 8.7.4: Hazard rates 
## ss 8.7.5: The Cox proportional hazards model 
bloodAids.coxph <- coxph(Surv(days, dead) ~ sex, data=bloodAids) 

summary(bloodAids.coxph) 

cox.zph(bloodAids.coxph) 

plot(cox.zph(bloodAids.coxph)) 

plot(bloodAids$days, resid(bloodAids.coxph)) 
lines(lowess(bloodAids$days, resid(bloodAids.coxph)))  


## Sec 8.8: Transformations for Count Data 

## Sec 8.9: Further Reading 
library(mgcv) 
hand <- with(cricketer, unclass(left)-1)  # 0 for left; 1 for right 
hand.gam <- gam(hand ~ s(year), data=cricketer) 
plot(hand.gam) 

ndf <- data.frame(year=seq(from=1840, to=1960, by=2)) 
fit <- predict(hand.gam, newdata=ndf, type="response", se.fit=TRUE) 
plot(fit$fit ~ ndf$year, ylim=range(fit), type="l") 

library(survival) 
coxph(Surv(life, dead) ~ left + poly(year, 4), data = cricketer)