## Chapter 3: Statistical Models ## Sec 3.1: Statistical Models ## ss 3.1.1: Incorporation of a error or noise component ## Footnote Code ## Plot depression vs weight: data frame roller (DAAG) library(DAAG) par(pty="s") plot(depression ~ weight, data = roller, xlim=c(0,1.04*max(weight)), ylim=c(0,1.04*max(depression)), xaxs="i", yaxs="i", # "i"=inner: Fit axes exactly to the limits xlab = "Weight of roller (t)", ylab = "Depression(mm)", pch = 16) abline(0, 2.25) # A slope of 2.25 looks about right par(pty="m") ## Predicting with models ## Which model is best? ## ss 3.1.2: Fitting models -- the model formula ## Fit line - by default, this fits intercept & slope. ## requires data frame roller (DAAG) roller.lm <- lm(depression ~ weight, data=roller) ## Compare with the code used to plot the data par(pty="s") plot(depression ~ weight, data=roller) ## Add the fitted line to the plot abline(roller.lm) par(pty="m") ## Footnote Code ## For a model that omits the intercept term, specify lm(depression ~ -1 + weight, data=roller) ## Fitted values, residuals and coefficients round(fitted(roller.lm), 1) round(resid(roller.lm), 1) coef(roller.lm) ## Model objects names(roller.lm) # Get names of list elements roller.lm$coef ## Summary information about model objects summary(roller.lm) ## Sec 3.2: Distributions: Models for the Random Component ## ss 3.2.1: Discrete distributions -- models for counts ## Bernoulli distribution ## Binomial distribution ## Footnote Code ## To get the labeling (0, 1, 2) as in the text, specify: probs <- dbinom(0:2, size=2, prob=0.5) names(probs) <- 0:2 probs pbinom(q=2, size=4, prob=0.5) pbinom(q=4, size=50, prob=0.2) qbinom(p = 0.70, size = 4, prob = 0.5) ## Means and standard deviations ## Poisson distribution ## ss 3.2.2: Continuous distributions ## Normal distribution ## Footnote Code ## Plot the normal density, in the range -3.25 to 3.25 z <- pretty(c(-3.25,3.25), 30) # Find ~30 equally spaced points ht <- dnorm(z) # By default: mean=0, standard deviation=1 plot(z, ht, type="l", xlab="Normal deviate", ylab="Ordinate", yaxs="i") polygon(c(z[z <= 1.0], 1.0), c(dnorm(z[z <= 1.0]), 0), col="grey") # Around 84.1% of the total area is to the left of the vertical line. ## Plot the normal density, in the range -3 to 3 z <- pretty(c(-3,3), 30) # Find ~30 equally spaced points ht <- dnorm(z) # By default: mean=0, standard deviation=1 plot(z, ht, type="l", xlab="Normal deviate", ylab="Density", yaxs="i") # yaxs="i" locates the axes at the limits of the data pnorm(1.0) # by default, mean=0 and SD=1 ## Footnote Code ## Additional examples pnorm(0) # .5 (exactly half the area is to the left of the mean) pnorm(-1.96) # .025 pnorm(1.96) # .975 pnorm(1.96, mean=2) # .484 (a normal distribution with mean 2 and SD 1) pnorm(1.96, sd=2) # .836 (sd = standard deviation) qnorm(.9) # 90th percentile; mean=0 and SD=1 ## Footnote Code ## Additional examples: qnorm(0.841) # 1.0 qnorm(0.5) # 0 qnorm(0.975) # 1.96 qnorm(c(.1,.2,.3)) # -1.282 -0.842 -0.524 (10th, 20th and 30th percentiles) qnorm(.1, mean=100, sd=10) # 87.2 (10th percentile, mean=100, SD=10) ## Other continuous distributions ## Different ways to describe distributions ## Sec 3.3: Simulation of Random Numbers and Random Samples set.seed(23286) # Use to reproduce the sample below rbinom(10, size=1, p=.5) ## Sampling from discrete distributions rbinom(10, size=1, p=.5) # 10 Bernoulli trials, prob=0.5 rbinom(25, size=4, prob=0.5) set.seed(9388) rpois(20, 3) ## ss 3.3.1: Sampling from the normal and other continuous distributions options(digits=2) # Suggest number of digits to display rnorm(10) # 10 random values from the normal distribution ## Footnote Code ## The following gives a rough equivalent of the figure: set.seed (21) # Use to reproduce the data in the figure par(mfrow=c(2,3)) x <- pretty(c(6.5,13.5), 40) for(i in 1:5){ y <- rnorm(50, mean=10, sd=1) hist(y, prob=TRUE, xlim=c(6.5,13.5), ylim=c(0,0.5), main="") lines(x, dnorm(x,10,1)) } par(mfrow=c(1,1)) runif(n = 20, min=0, max=1) # 20 numbers, uniform distn on (0, 1) rexp(n=10, rate=3) # 10 numbers, exponential, mean 1/3. ## Exercises at the end of this chapter explore further possibilities. ## ss 3.3.2: Simulation of regression data options(digits=3) n <- 8; x <- seq(1,n); sigma <- 2.5; b0 <- 2; b1 <- 3 error <- rnorm(n, sd=sigma) y <- b0 + b1*x + error t(data.frame(x,y)) roller.lm <- lm(depression ~ weight, data=roller) roller.sim <- simulate(roller.lm, nsim=20) # 20 simulations matplot(roller$weight, roller.sim, pch=1, ylim=range(roller$depression)) points(roller, pch=16) ## ss 3.3.3: Simulation of the sampling distribution of the mean ## Function to generate n sample values; skew population sampvals <- function(n)exp(rnorm(n, mean = 0.5, sd = 0.3)) ## Means across rows of a dimension nsamp x sampsize matrix of ## sample values gives nsamp means of samples of size sampsize. samplingDist <- function(sampsize=3, nsamp=1000, FUN=mean) apply(matrix(sampvals(sampsize*nsamp), ncol=sampsize), 1, FUN) size <- c(3,10,30) ## Simulate means of samples of 3, 9 and 30; place in dataframe df <- data.frame(y3=samplingDist(sampsize=size[1]), y9=samplingDist(sampsize=size[2]), y30=samplingDist(sampsize=size[3])) ## Simulate source population (sampsize=1) y <- samplingDist(sampsize=1) densityplot(~y3+y9+y30, data = df, outer=TRUE, layout = c(3,1), plot.points = FALSE, panel = function(x, ...) { panel.densityplot(x, ..., col = "black") panel.densityplot(y, col = "gray40", lty = 2, ...) }) ## Footnote Code ## Use strip.custom to customize the strip labeling doStrip <- strip.custom(strip.names = TRUE, factor.levels = as.expression(size), var.name = "Sample size", sep = expression(" = ")) ## Then include the argument 'strip=doStrip' in the call to densityplot ## ss 3.3.4: Sampling from finite populations ## For the sequence below, precede with set.seed(3676) sample(1:9384, 15, replace=FALSE) ## For the sequence below, precede with set.seed(366) split(sample(seq(1:10)), rep(c("Control","Treatment"), 5)) # sample(1:10) gives a random re-arrangement (permutation) # of 1, 2, ..., 10 ## With replacement samples sample(1:10, replace=TRUE) ## Cluster sampling ## Simulation in teaching and research ## Sec 3.4: Model Assumptions ## ss 3.4.1: Random sampling assumptions -- independence ## ss 3.4.2: Checks for normality ## Graphical tools for checking for normality ## The normal probability plot ## Use qreference() (DAAG) ## With seed=21, the random numbers are as in the previous figure qreference(m=50, seed=21, nrep=5, nrows=1) # 50 values per panel ## Footnote Code ## Set seed to get the same data as earlier library(lattice) qqmath(~rnorm(50*5)|rep(1:5,rep(50,5)), layout=c(5,1), aspect=1) ## The sample plot, set alongside plots for random normal data ## Compare normal probability plot for normal-ambient difference ## with simulated normal values: data frame pair65 (DAAG) qreference(pair65$heated - pair65$ambient, nrep=8) ## How close to normal is the sampling distribution of themean? ## Formal statistical testing for normality? ## ss 3.4.3: Checking other model assumptions ## ss 3.4.4: Are non-parametric methods the answer? ## ss 3.4.5: Why models matter -- adding across contingency tables str(UCBAdmissions) ## Tabulate by Admit and Gender byGender <- margin.table(UCBAdmissions, margin=1:2) round(100*prop.table(byGender, margin=2)["Admitted", ], 1) ## Admission rates, by department round(100*prop.table(UCBAdmissions, margin=2:3)["Admitted", , ], 1) ## Calculate totals, by department, of males & females applying (applicants <- margin.table(UCBAdmissions, margin=2:3)) ## Calculate proportions of male & female applicants round(100*prop.table(applicants, margin=1), 1) ## Sec 3.5: Recap ## Sec 3.6: Further Reading x <- rpois(7, 78.3) mean(x); var(x) Markov <- function (N=100, initial.value=1, P) { X <- numeric(N) X[1] <- initial.value + 1 # States 0:5; subscripts 1:6 n <- nrow(P) for (i in 2:N){ X[i] <- sample(1:n, size=1, prob=P[X[i-1], ])} X - 1 } plotmarkov <- function(n=10000, start=0, window=100, transition=Pb, npanels=5){ xc2 <- Markov(n, start, transition) mav0 <- rollmean(as.integer(xc2==0), window) mav1 <- rollmean(as.integer(xc2==0), window) npanel <- cut(1:length(mav0), breaks=seq(from=1, to=length(mav0), length=npanels+1), include.lowest=TRUE) df <- data.frame(av0=mav0, av1=mav1, x=1:length(mav0), gp=npanel) print(xyplot(av0+av1 ~ x | gp, data=df, layout=c(1,npanels), type="l", par.strip.text=list(cex=0.65), scales=list(x=list(relation="free")))) }