## Chapter 3: Statistical Models
## Sec 3.1: Statistical Models
## ss 3.1.1: Incorporation of a error or noise component
## Footnote Code
## Plot depression vs weight: data frame roller (DAAG)
library(DAAG)
## Loading required package: lattice
par(pty="s")
plot(depression ~ weight, data = roller, xlim=c(0,1.04*max(weight)),
ylim=c(0,1.04*max(depression)),
xaxs="i", yaxs="i", # "i"=inner: Fit axes exactly to the limits
xlab = "Weight of roller (t)", ylab = "Depression(mm)", pch = 16)
abline(0, 2.25) # A slope of 2.25 looks about right
par(pty="m")
## Predicting with models
## Which model is best?
## ss 3.1.2: Fitting models -- the model formula
## Fit line - by default, this fits intercept & slope.
## requires data frame roller (DAAG)
roller.lm <- lm(depression ~ weight, data=roller)
## Compare with the code used to plot the data
par(pty="s")
plot(depression ~ weight, data=roller)
## Add the fitted line to the plot
abline(roller.lm)
par(pty="m")
## Footnote Code
## For a model that omits the intercept term, specify
lm(depression ~ -1 + weight, data=roller)
##
## Call:
## lm(formula = depression ~ -1 + weight, data = roller)
##
## Coefficients:
## weight
## 2.39
## Fitted values, residuals and coefficients
round(fitted(roller.lm), 1)
## 1 2 3 4 5 6 7 8 9 10
## 3.0 6.2 6.7 10.7 12.0 14.2 15.0 18.2 24.0 31.0
round(resid(roller.lm), 1)
## 1 2 3 4 5 6 7 8 9 10
## -1.0 -5.2 -1.7 -5.7 8.0 5.8 8.0 -8.2 6.0 -6.0
coef(roller.lm)
## (Intercept) weight
## -2.087 2.667
## Model objects
names(roller.lm) # Get names of list elements
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
roller.lm$coef
## (Intercept) weight
## -2.087 2.667
## Summary information about model objects
summary(roller.lm)
##
## Call:
## lm(formula = depression ~ weight, data = roller)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.18 -5.58 -1.35 5.92 8.02
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.09 4.75 -0.44 0.6723
## weight 2.67 0.70 3.81 0.0052
##
## Residual standard error: 6.74 on 8 degrees of freedom
## Multiple R-squared: 0.644, Adjusted R-squared: 0.6
## F-statistic: 14.5 on 1 and 8 DF, p-value: 0.00518
## Sec 3.2: Distributions: Models for the Random Component
## ss 3.2.1: Discrete distributions -- models for counts
## Bernoulli distribution
## Binomial distribution
## Footnote Code
## To get the labeling (0, 1, 2) as in the text, specify:
probs <- dbinom(0:2, size=2, prob=0.5)
names(probs) <- 0:2
probs
## 0 1 2
## 0.25 0.50 0.25
pbinom(q=2, size=4, prob=0.5)
## [1] 0.6875
pbinom(q=4, size=50, prob=0.2)
## [1] 0.0185
qbinom(p = 0.70, size = 4, prob = 0.5)
## [1] 3
## Means and standard deviations
## Poisson distribution
## ss 3.2.2: Continuous distributions
## Normal distribution
## Footnote Code
## Plot the normal density, in the range -3.25 to 3.25
z <- pretty(c(-3.25,3.25), 30) # Find ~30 equally spaced points
ht <- dnorm(z) # By default: mean=0, standard deviation=1
plot(z, ht, type="l", xlab="Normal deviate", ylab="Ordinate", yaxs="i")
polygon(c(z[z <= 1.0], 1.0), c(dnorm(z[z <= 1.0]), 0), col="grey")
# Around 84.1% of the total area is to the left of the vertical line.
## Plot the normal density, in the range -3 to 3
z <- pretty(c(-3,3), 30) # Find ~30 equally spaced points
ht <- dnorm(z) # By default: mean=0, standard deviation=1
plot(z, ht, type="l", xlab="Normal deviate", ylab="Density", yaxs="i")
# yaxs="i" locates the axes at the limits of the data
pnorm(1.0) # by default, mean=0 and SD=1
## [1] 0.8413
## Footnote Code
## Additional examples
pnorm(0) # .5 (exactly half the area is to the left of the mean)
## [1] 0.5
pnorm(-1.96) # .025
## [1] 0.025
pnorm(1.96) # .975
## [1] 0.975
pnorm(1.96, mean=2) # .484 (a normal distribution with mean 2 and SD 1)
## [1] 0.484
pnorm(1.96, sd=2) # .836 (sd = standard deviation)
## [1] 0.8365
qnorm(.9) # 90th percentile; mean=0 and SD=1
## [1] 1.282
## Footnote Code
## Additional examples:
qnorm(0.841) # 1.0
## [1] 0.9986
qnorm(0.5) # 0
## [1] 0
qnorm(0.975) # 1.96
## [1] 1.96
qnorm(c(.1,.2,.3)) # -1.282 -0.842 -0.524 (10th, 20th and 30th percentiles)
## [1] -1.2816 -0.8416 -0.5244
qnorm(.1, mean=100, sd=10) # 87.2 (10th percentile, mean=100, SD=10)
## [1] 87.18
## Other continuous distributions
## Different ways to describe distributions
## Sec 3.3: Simulation of Random Numbers and Random Samples
set.seed(23286) # Use to reproduce the sample below
rbinom(10, size=1, p=.5)
## [1] 0 0 0 0 1 0 1 0 1 1
## Sampling from discrete distributions
rbinom(10, size=1, p=.5) # 10 Bernoulli trials, prob=0.5
## [1] 1 1 1 0 0 1 1 1 1 0
rbinom(25, size=4, prob=0.5)
## [1] 3 2 1 2 1 1 2 1 4 2 2 2 1 2 3 2 4 2 2 3 2 2 2 2 3
set.seed(9388)
rpois(20, 3)
## [1] 3 3 4 1 2 2 3 1 1 4 3 0 1 1 3 1 0 4 5 2
## ss 3.3.1: Sampling from the normal and other continuous distributions
options(digits=2) # Suggest number of digits to display
rnorm(10) # 10 random values from the normal distribution
## [1] -1.01 0.59 -0.74 -0.85 -0.42 1.87 2.18 0.01 0.64 1.92
## Footnote Code
## The following gives a rough equivalent of the figure:
set.seed (21) # Use to reproduce the data in the figure
par(mfrow=c(2,3))
x <- pretty(c(6.5,13.5), 40)
for(i in 1:5){
y <- rnorm(50, mean=10, sd=1)
hist(y, prob=TRUE, xlim=c(6.5,13.5), ylim=c(0,0.5), main="")
lines(x, dnorm(x,10,1))
}
par(mfrow=c(1,1))
runif(n = 20, min=0, max=1) # 20 numbers, uniform distn on (0, 1)
## [1] 0.16 0.33 0.61 0.34 0.95 0.24 0.93 0.49 0.59 0.38 0.21 0.85 0.53 0.46
## [15] 0.95 0.47 0.12 0.72 0.97 0.53
rexp(n=10, rate=3) # 10 numbers, exponential, mean 1/3.
## [1] 0.440 0.153 0.094 0.054 0.324 0.024 0.045 0.311 0.576 0.149
## Exercises at the end of this chapter explore further possibilities.
## ss 3.3.2: Simulation of regression data
options(digits=3)
n <- 8; x <- seq(1,n); sigma <- 2.5; b0 <- 2; b1 <- 3
error <- rnorm(n, sd=sigma)
y <- b0 + b1*x + error
t(data.frame(x,y))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## x 1.00 2.0 3.0 4.0 5.0 6.0 7.0 8.0
## y 8.14 5.3 11.3 14.3 21.2 19.1 22.4 23.1
roller.lm <- lm(depression ~ weight, data=roller)
roller.sim <- simulate(roller.lm, nsim=20) # 20 simulations
matplot(roller$weight, roller.sim, pch=1, ylim=range(roller$depression))
points(roller, pch=16)
## ss 3.3.3: Simulation of the sampling distribution of the mean
## Function to generate n sample values; skew population
sampvals <- function(n)exp(rnorm(n, mean = 0.5, sd = 0.3))
## Means across rows of a dimension nsamp x sampsize matrix of
## sample values gives nsamp means of samples of size sampsize.
samplingDist <- function(sampsize=3, nsamp=1000, FUN=mean)
apply(matrix(sampvals(sampsize*nsamp), ncol=sampsize), 1, FUN)
size <- c(3,10,30)
## Simulate means of samples of 3, 9 and 30; place in dataframe
df <- data.frame(y3=samplingDist(sampsize=size[1]),
y9=samplingDist(sampsize=size[2]),
y30=samplingDist(sampsize=size[3]))
## Simulate source population (sampsize=1)
y <- samplingDist(sampsize=1)
densityplot(~y3+y9+y30, data = df, outer=TRUE, layout = c(3,1),
plot.points = FALSE, panel = function(x, ...) {
panel.densityplot(x, ..., col = "black")
panel.densityplot(y, col = "gray40", lty = 2, ...)
})
## Footnote Code
## Use strip.custom to customize the strip labeling
doStrip <- strip.custom(strip.names = TRUE, factor.levels = as.expression(size),
var.name = "Sample size", sep = expression(" = "))
## Then include the argument 'strip=doStrip' in the call to densityplot
## ss 3.3.4: Sampling from finite populations
## For the sequence below, precede with set.seed(3676)
sample(1:9384, 15, replace=FALSE)
## [1] 850 7077 8261 9379 3601 105 4963 6488 5651 8913 2022 9348 9165 9375
## [15] 920
## For the sequence below, precede with set.seed(366)
split(sample(seq(1:10)), rep(c("Control","Treatment"), 5))
## $Control
## [1] 2 3 9 8 7
##
## $Treatment
## [1] 1 5 10 4 6
# sample(1:10) gives a random re-arrangement (permutation)
# of 1, 2, ..., 10
## With replacement samples
sample(1:10, replace=TRUE)
## [1] 2 4 1 5 2 10 2 1 4 1
## Cluster sampling
## Simulation in teaching and research
## Sec 3.4: Model Assumptions
## ss 3.4.1: Random sampling assumptions -- independence
## ss 3.4.2: Checks for normality
## Graphical tools for checking for normality
## The normal probability plot
## Use qreference() (DAAG)
## With seed=21, the random numbers are as in the previous figure
qreference(m=50, seed=21, nrep=5, nrows=1) # 50 values per panel
## Footnote Code
## Set seed to get the same data as earlier
library(lattice)
qqmath(~rnorm(50*5)|rep(1:5,rep(50,5)), layout=c(5,1), aspect=1)
## The sample plot, set alongside plots for random normal data
## Compare normal probability plot for normal-ambient difference
## with simulated normal values: data frame pair65 (DAAG)
qreference(pair65$heated - pair65$ambient, nrep=8)
## How close to normal is the sampling distribution of themean?
## Formal statistical testing for normality?
## ss 3.4.3: Checking other model assumptions
## ss 3.4.4: Are non-parametric methods the answer?
## ss 3.4.5: Why models matter -- adding across contingency tables
str(UCBAdmissions)
## table [1:2, 1:2, 1:6] 512 313 89 19 353 207 17 8 120 205 ...
## - attr(*, "dimnames")=List of 3
## ..$ Admit : chr [1:2] "Admitted" "Rejected"
## ..$ Gender: chr [1:2] "Male" "Female"
## ..$ Dept : chr [1:6] "A" "B" "C" "D" ...
## Tabulate by Admit and Gender
byGender <- margin.table(UCBAdmissions, margin=1:2)
round(100*prop.table(byGender, margin=2)["Admitted", ], 1)
## Male Female
## 44.5 30.4
## Admission rates, by department
round(100*prop.table(UCBAdmissions,
margin=2:3)["Admitted", , ], 1)
## Dept
## Gender A B C D E F
## Male 62.1 63 36.9 33.1 27.7 5.9
## Female 82.4 68 34.1 34.9 23.9 7.0
## Calculate totals, by department, of males & females applying
(applicants <- margin.table(UCBAdmissions, margin=2:3))
## Dept
## Gender A B C D E F
## Male 825 560 325 417 191 373
## Female 108 25 593 375 393 341
## Calculate proportions of male & female applicants
round(100*prop.table(applicants, margin=1), 1)
## Dept
## Gender A B C D E F
## Male 30.7 20.8 12.1 15.5 7.1 13.9
## Female 5.9 1.4 32.3 20.4 21.4 18.6
## Sec 3.5: Recap
## Sec 3.6: Further Reading
x <- rpois(7, 78.3)
mean(x); var(x)
## [1] 80.9
## [1] 37.8
Markov <- function (N=100, initial.value=1, P)
{
X <- numeric(N)
X[1] <- initial.value + 1 # States 0:5; subscripts 1:6
n <- nrow(P)
for (i in 2:N){
X[i] <- sample(1:n, size=1, prob=P[X[i-1], ])}
X - 1
}
plotmarkov <-
function(n=10000, start=0, window=100, transition=Pb, npanels=5){
xc2 <- Markov(n, start, transition)
mav0 <- rollmean(as.integer(xc2==0), window)
mav1 <- rollmean(as.integer(xc2==0), window)
npanel <- cut(1:length(mav0), breaks=seq(from=1, to=length(mav0),
length=npanels+1), include.lowest=TRUE)
df <- data.frame(av0=mav0, av1=mav1, x=1:length(mav0),
gp=npanel)
print(xyplot(av0+av1 ~ x | gp, data=df, layout=c(1,npanels),
type="l", par.strip.text=list(cex=0.65),
scales=list(x=list(relation="free"))))
}