## Chapter 3: Statistical Models 

## Sec 3.1: Statistical Models 
## ss 3.1.1: Incorporation of a error or noise component 
## Footnote Code
## Plot depression vs weight: data frame roller (DAAG) 
library(DAAG)
## Loading required package: lattice
par(pty="s")
plot(depression ~ weight, data = roller, xlim=c(0,1.04*max(weight)), 
     ylim=c(0,1.04*max(depression)),
     xaxs="i", yaxs="i",   # "i"=inner: Fit axes exactly to the limits 
     xlab = "Weight of roller (t)", ylab = "Depression(mm)", pch = 16)  
abline(0, 2.25)            # A slope of 2.25 looks about right 

plot of chunk unnamed-chunk-1

par(pty="m")

##                       Predicting with models 
##                        Which model is best? 
## ss 3.1.2: Fitting models -- the model formula 
## Fit line - by default, this fits intercept & slope. 
## requires data frame roller (DAAG) 
roller.lm <- lm(depression ~ weight, data=roller) 
## Compare with the code used to plot the data 
par(pty="s")
plot(depression ~ weight, data=roller) 
## Add the fitted line to the plot 
abline(roller.lm)

plot of chunk unnamed-chunk-1

par(pty="m")

## Footnote Code
## For a model that omits the intercept term, specify 
lm(depression ~ -1 + weight, data=roller)   
## 
## Call:
## lm(formula = depression ~ -1 + weight, data = roller)
## 
## Coefficients:
## weight  
##   2.39
##             Fitted values, residuals and coefficients 
round(fitted(roller.lm), 1) 
##    1    2    3    4    5    6    7    8    9   10 
##  3.0  6.2  6.7 10.7 12.0 14.2 15.0 18.2 24.0 31.0
round(resid(roller.lm), 1) 
##    1    2    3    4    5    6    7    8    9   10 
## -1.0 -5.2 -1.7 -5.7  8.0  5.8  8.0 -8.2  6.0 -6.0
coef(roller.lm) 
## (Intercept)      weight 
##      -2.087       2.667
##                           Model objects 
names(roller.lm)     # Get names of list elements 
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
roller.lm$coef 
## (Intercept)      weight 
##      -2.087       2.667
##              Summary information about model objects 
summary(roller.lm) 
## 
## Call:
## lm(formula = depression ~ weight, data = roller)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.18  -5.58  -1.35   5.92   8.02 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    -2.09       4.75   -0.44   0.6723
## weight          2.67       0.70    3.81   0.0052
## 
## Residual standard error: 6.74 on 8 degrees of freedom
## Multiple R-squared:  0.644,  Adjusted R-squared:   0.6 
## F-statistic: 14.5 on 1 and 8 DF,  p-value: 0.00518
## Sec 3.2: Distributions: Models for the Random Component 
## ss 3.2.1: Discrete distributions -- models for counts 
##                       Bernoulli distribution 
##                       Binomial distribution 
## Footnote Code
## To get the labeling (0, 1, 2) as in the text, specify: 
probs <- dbinom(0:2, size=2, prob=0.5) 
names(probs) <- 0:2 
probs 
##    0    1    2 
## 0.25 0.50 0.25
pbinom(q=2, size=4, prob=0.5) 
## [1] 0.6875
pbinom(q=4, size=50, prob=0.2) 
## [1] 0.0185
qbinom(p = 0.70, size = 4, prob = 0.5) 
## [1] 3
##                   Means and standard deviations 
##                        Poisson distribution 
## ss 3.2.2: Continuous distributions 
##                        Normal distribution 
## Footnote Code
## Plot the normal density, in the range -3.25 to 3.25 
z <- pretty(c(-3.25,3.25), 30) # Find ~30 equally spaced points 
ht <- dnorm(z)                 # By default: mean=0, standard deviation=1 
plot(z, ht, type="l", xlab="Normal deviate", ylab="Ordinate", yaxs="i") 
polygon(c(z[z <= 1.0], 1.0), c(dnorm(z[z <= 1.0]), 0), col="grey") 

plot of chunk unnamed-chunk-1

 # Around 84.1% of the total area is to the left of the vertical line. 

## Plot the normal density, in the range -3 to 3 
z <- pretty(c(-3,3), 30)   # Find ~30 equally spaced points 
ht <- dnorm(z)             # By default: mean=0, standard deviation=1 
plot(z, ht, type="l", xlab="Normal deviate", ylab="Density", yaxs="i") 

plot of chunk unnamed-chunk-1

 # yaxs="i" locates the axes at the limits of the data 

pnorm(1.0)         # by default, mean=0 and SD=1 
## [1] 0.8413
## Footnote Code
## Additional examples 
pnorm(0)             # .5    (exactly half the area is to the left of the mean) 
## [1] 0.5
pnorm(-1.96)         # .025 
## [1] 0.025
pnorm(1.96)          # .975 
## [1] 0.975
pnorm(1.96, mean=2)  # .484  (a normal distribution with mean 2 and SD 1) 
## [1] 0.484
pnorm(1.96, sd=2)    # .836  (sd = standard deviation) 
## [1] 0.8365
qnorm(.9)          # 90th percentile; mean=0 and SD=1 
## [1] 1.282
## Footnote Code
## Additional examples: 
qnorm(0.841)         # 1.0 
## [1] 0.9986
qnorm(0.5)           # 0 
## [1] 0
qnorm(0.975)         # 1.96 
## [1] 1.96
qnorm(c(.1,.2,.3))   # -1.282 -0.842 -0.524  (10th, 20th and 30th percentiles) 
## [1] -1.2816 -0.8416 -0.5244
qnorm(.1, mean=100, sd=10)  # 87.2 (10th percentile, mean=100, SD=10) 
## [1] 87.18
##                   Other continuous distributions 
##              Different ways to describe distributions 

## Sec 3.3: Simulation of Random Numbers and Random Samples 
set.seed(23286)  # Use to reproduce the sample below 
rbinom(10, size=1, p=.5) 
##  [1] 0 0 0 0 1 0 1 0 1 1
##                Sampling from discrete distributions 
rbinom(10, size=1, p=.5)   # 10 Bernoulli trials, prob=0.5 
##  [1] 1 1 1 0 0 1 1 1 1 0
rbinom(25, size=4, prob=0.5)   
##  [1] 3 2 1 2 1 1 2 1 4 2 2 2 1 2 3 2 4 2 2 3 2 2 2 2 3
set.seed(9388) 
rpois(20, 3) 
##  [1] 3 3 4 1 2 2 3 1 1 4 3 0 1 1 3 1 0 4 5 2
## ss 3.3.1: Sampling from the normal and other continuous distributions 
options(digits=2)  # Suggest number of digits to display 
rnorm(10)          # 10 random values from the normal distribution 
##  [1] -1.01  0.59 -0.74 -0.85 -0.42  1.87  2.18  0.01  0.64  1.92
## Footnote Code
## The following gives a rough equivalent of the figure: 
set.seed (21)        # Use to reproduce the data in the figure 
par(mfrow=c(2,3)) 
x <- pretty(c(6.5,13.5), 40) 
for(i in 1:5){ 
    y <- rnorm(50, mean=10, sd=1) 
    hist(y, prob=TRUE, xlim=c(6.5,13.5), ylim=c(0,0.5), main="") 
    lines(x, dnorm(x,10,1)) 
    } 
par(mfrow=c(1,1)) 

plot of chunk unnamed-chunk-1

runif(n = 20, min=0, max=1) # 20 numbers, uniform distn on (0, 1) 
##  [1] 0.16 0.33 0.61 0.34 0.95 0.24 0.93 0.49 0.59 0.38 0.21 0.85 0.53 0.46
## [15] 0.95 0.47 0.12 0.72 0.97 0.53
rexp(n=10, rate=3)          # 10 numbers, exponential, mean 1/3. 
##  [1] 0.440 0.153 0.094 0.054 0.324 0.024 0.045 0.311 0.576 0.149
## Exercises at the end of this chapter explore further possibilities. 

## ss 3.3.2: Simulation of regression data 
options(digits=3) 
n <- 8;  x <- seq(1,n); sigma <- 2.5; b0 <- 2;  b1 <- 3 
error <- rnorm(n, sd=sigma) 
y <- b0 + b1*x + error  
 
t(data.frame(x,y)) 
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## x 1.00  2.0  3.0  4.0  5.0  6.0  7.0  8.0
## y 8.14  5.3 11.3 14.3 21.2 19.1 22.4 23.1
roller.lm <- lm(depression ~ weight, data=roller) 
roller.sim <- simulate(roller.lm, nsim=20)  # 20 simulations 

matplot(roller$weight, roller.sim, pch=1, ylim=range(roller$depression)) 
points(roller, pch=16) 

plot of chunk unnamed-chunk-1

## ss 3.3.3: Simulation of the sampling distribution of the mean 
## Function to generate n sample values; skew population 
sampvals <- function(n)exp(rnorm(n, mean = 0.5, sd = 0.3)) 
## Means across rows of a dimension nsamp x sampsize matrix of  
## sample values gives nsamp means of samples of size sampsize. 
samplingDist <- function(sampsize=3, nsamp=1000, FUN=mean) 
  apply(matrix(sampvals(sampsize*nsamp), ncol=sampsize), 1, FUN) 
size <- c(3,10,30) 
## Simulate means of samples of 3, 9 and 30; place in dataframe  
df <- data.frame(y3=samplingDist(sampsize=size[1]),  
                 y9=samplingDist(sampsize=size[2]),  
                 y30=samplingDist(sampsize=size[3])) 

## Simulate source population (sampsize=1) 
y <- samplingDist(sampsize=1) 
densityplot(~y3+y9+y30, data = df, outer=TRUE, layout = c(3,1),     
            plot.points = FALSE, panel = function(x, ...) {          
                panel.densityplot(x, ..., col = "black")             
                panel.densityplot(y, col = "gray40", lty = 2, ...)   
              })                                                      

plot of chunk unnamed-chunk-1

## Footnote Code
## Use strip.custom to customize the strip labeling 
doStrip <- strip.custom(strip.names = TRUE, factor.levels = as.expression(size), 
                        var.name = "Sample size", sep = expression(" = "))  
## Then include the argument 'strip=doStrip' in the call to densityplot  

## ss 3.3.4: Sampling from finite populations 
## For the sequence below, precede with set.seed(3676) 
sample(1:9384, 15, replace=FALSE) 
##  [1]  850 7077 8261 9379 3601  105 4963 6488 5651 8913 2022 9348 9165 9375
## [15]  920
## For the sequence below, precede with set.seed(366) 
split(sample(seq(1:10)), rep(c("Control","Treatment"), 5)) 
## $Control
## [1] 2 3 9 8 7
## 
## $Treatment
## [1]  1  5 10  4  6
 # sample(1:10) gives a random re-arrangement (permutation)  
 # of 1, 2, ..., 10 

##                      With replacement samples 
sample(1:10, replace=TRUE) 
##  [1]  2  4  1  5  2 10  2  1  4  1
##                          Cluster sampling 
##                Simulation in teaching and research 

## Sec 3.4: Model Assumptions 
## ss 3.4.1: Random sampling assumptions -- independence 
## ss 3.4.2: Checks for normality 
##             Graphical tools for checking for normality 
##                    The normal probability plot 
## Use qreference() (DAAG) 
## With seed=21, the random numbers are as in the previous figure 
qreference(m=50, seed=21, nrep=5, nrows=1)  # 50 values per panel 

plot of chunk unnamed-chunk-1

## Footnote Code
## Set seed to get the same data as earlier 
library(lattice) 
qqmath(~rnorm(50*5)|rep(1:5,rep(50,5)), layout=c(5,1), aspect=1) 

plot of chunk unnamed-chunk-1

##    The sample plot, set alongside plots for random normal data 
## Compare normal probability plot for normal-ambient difference 
## with simulated normal values: data frame pair65 (DAAG) 
qreference(pair65$heated - pair65$ambient, nrep=8) 

plot of chunk unnamed-chunk-1

##    How close to normal is the sampling distribution of themean? 
##             Formal statistical testing for normality? 
## ss 3.4.3: Checking other model assumptions 
## ss 3.4.4: Are non-parametric methods the answer? 
## ss 3.4.5: Why models matter -- adding across contingency tables 
str(UCBAdmissions) 
##  table [1:2, 1:2, 1:6] 512 313 89 19 353 207 17 8 120 205 ...
##  - attr(*, "dimnames")=List of 3
##   ..$ Admit : chr [1:2] "Admitted" "Rejected"
##   ..$ Gender: chr [1:2] "Male" "Female"
##   ..$ Dept  : chr [1:6] "A" "B" "C" "D" ...
## Tabulate by Admit and Gender 
byGender <- margin.table(UCBAdmissions, margin=1:2) 
round(100*prop.table(byGender, margin=2)["Admitted", ], 1) 
##   Male Female 
##   44.5   30.4
## Admission rates, by department 
round(100*prop.table(UCBAdmissions,  
                     margin=2:3)["Admitted", , ], 1) 
##         Dept
## Gender      A  B    C    D    E   F
##   Male   62.1 63 36.9 33.1 27.7 5.9
##   Female 82.4 68 34.1 34.9 23.9 7.0
## Calculate totals, by department, of males & females applying 
(applicants <- margin.table(UCBAdmissions, margin=2:3)) 
##         Dept
## Gender     A   B   C   D   E   F
##   Male   825 560 325 417 191 373
##   Female 108  25 593 375 393 341
## Calculate proportions of male & female applicants 
round(100*prop.table(applicants, margin=1), 1) 
##         Dept
## Gender      A    B    C    D    E    F
##   Male   30.7 20.8 12.1 15.5  7.1 13.9
##   Female  5.9  1.4 32.3 20.4 21.4 18.6
## Sec 3.5: Recap 

## Sec 3.6: Further Reading 
x <- rpois(7, 78.3) 
mean(x); var(x) 
## [1] 80.9
## [1] 37.8
Markov <- function (N=100, initial.value=1, P) 
{ 
    X <- numeric(N) 
    X[1] <- initial.value + 1  # States 0:5; subscripts 1:6 
    n <- nrow(P) 
    for (i in 2:N){ 
    X[i] <- sample(1:n, size=1, prob=P[X[i-1], ])} 
    X - 1 
} 

plotmarkov <- 
  function(n=10000, start=0, window=100, transition=Pb, npanels=5){ 
    xc2 <- Markov(n, start, transition) 
    mav0 <- rollmean(as.integer(xc2==0), window) 
    mav1 <- rollmean(as.integer(xc2==0), window) 
    npanel <- cut(1:length(mav0), breaks=seq(from=1, to=length(mav0), 
                  length=npanels+1), include.lowest=TRUE) 
    df <- data.frame(av0=mav0, av1=mav1, x=1:length(mav0), 
                     gp=npanel) 
    print(xyplot(av0+av1 ~ x | gp, data=df, layout=c(1,npanels),  
                 type="l", par.strip.text=list(cex=0.65),  
                 scales=list(x=list(relation="free")))) 
  }