## Chapter 14: The R System -- Additional Topics

## Sec 14.1: Graphical User Interfaces to R
## Installing the R Commander
## install.packages("Rcmdr", dependencies=TRUE)

## ss 14.1.1: The R Commander's interface -- a guide to getting started
## library(Rcmdr)

## ss 14.1.2: The {rattle} GUI
## library(rattle)
## rattle()

## ss 14.1.3: The Creation of Simple GUIs -- the {fgui} Package
library(fgui)
library(DAAG); library(lattice)
## Create function that does the smoothing
fitsmooth <-
  function(form=mdbRain^(1/3) ~ Year, df=bomregions, myspan=0.75)
    print(xyplot(form, data=df, type=c("p","smooth"), span=myspan))
## Create function that sets up a GUI interface to fitsmooth()
callsmooth <- function()gui(func=fitsmooth,
                            argSlider=list(myspan=c(0.1, 1.5, 0.025)),  # start, stop, stepsize
                            output=NULL, callback=guiExec)
## callsmooth()
## Click on and/or move the slider to display the graph.

## Sec 14.2: Working Directories, Workspaces and the Search List

## ss 14.2.1: *The search path
search()

## ss 14.2.2: Workspace management
xy <- matrix(rnorm(60000), nrow=600)
xy.rowrange <- apply(xy, 1, range)
save(xy, xy.rowrange, file="xy.RData")
rm(xy, xy.rowrange)
rm(list = ls(all=TRUE))

## Changing the Working Directory and/or Workspace

## ss 14.2.3: Utility functions
# dir()             # List files in the working directory
# file.choose()     # Choose a file interactively
# sessionInfo()     # Print version numbers for R and for
#                   #   attached packages
# system.file()     # Show path, by default to 'package="base"'
#                   #   Try, e.g.: system.file(package="DAAG")
# R.home()          # Give the path to the R home directory
# .Library          # Path to the default library.
# Sys.getenv()      # Show settings of environment variables
# object.size()     # Show size of R object
dir()
Sys.getenv("R_HOME")
normalizePath(Sys.getenv("R_HOME"))
system.file("misc/ViewTemps.RData", package="DAAG")

## Sec 14.3: R System Configuration

## ss 14.3.1: The R Windows installation directory tree

## ss 14.3.2: The Library directories
.libPaths("D:/R-2.8.1/library")
.First <- function().libPaths("D:/R-2.8.1/library")

## ss 14.3.3: The startup mechanism

## Sec 14.4: Data Input and Output
library(DAAG)
datafile("oneBadRow")
datafile("scan-demo")

## ss 14.4.1: Input of data
## Issues for data input with {read.table()} and variants
## read.table("quote.txt")
## read.table("quote.txt", quote="'")

## Tracking errors in input data
nfields <- count.fields("oneBadRow.txt")
nfields                             # Number of fields, for each row
## Now identify rows where the number of fields seems anomalous
(1:length(nfields))[nfields == 4]

## One character string for each input row -- {readLines()}
readLines("oneBadRow.txt", n=3)     # First 3 lines
readLines("oneBadRow.txt", n=-1)    # All lines

## Input of fixed format data
## Input of large rectangular files
scan("oneBadRow.txt", skip = 1, quiet= TRUE)
data.frame(scan("scan-demo.txt", skip = 1, quiet= TRUE,
                what=list(ab="", col2=1, col3=1)))

## Example -- input of the Boston housing data
## readLines("bostonc.txt", n=11)[10:11]
## strsplit(readLines("bostonc.txt", n=11)[10:11], split="\t")
datafile('bostonc')
boston <- scan("bostonc.txt", n=-1, sep="\t", skip=10,
               what=c(list(1,""), as.list(rep(1,19))))
colnams <- scan("bostonc.txt", skip=9, n=21, what="")
boston <- data.frame(boston)
names(boston) <- colnams
boston <- read.table("bostonc.txt", sep="\t", skip=9,
                     comment.char="", header=TRUE)

## ss 14.4.2: Data output
## Output of data frames
write.table(fossilfuel, file="fuel.txt")
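## A minimal round-trip sketch (not from the text): write the fossilfuel
## data frame to a temporary file and read it back, as a check that the
## write.table() and read.table() defaults are consistent.  The object
## name fuelCopy and the use of tempfile() are illustrative only.
tmp <- tempfile(fileext=".txt")
write.table(fossilfuel, file=tmp)          # Default: quoted strings, row names included
fuelCopy <- read.table(tmp, header=TRUE)   # Row names are detected automatically
all.equal(fossilfuel, fuelCopy)            # Should report TRUE, or describe any differences
unlink(tmp)                                # Tidy up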
sink("fuel2.txt") ## fossilfuel # NB: No output on screen ## sink() ## Output to a file using {cat()} ## ss 14.4.3: Database connections library(DAAG); library(DAAG) library(RSQLite) driveLite <- dbDriver("SQLite") con <- dbConnect(driveLite, dbname="hillracesDB") dbWriteTable(con, "hills2000", hills2000, overwrite=TRUE) dbWriteTable(con, "nihills", nihills, overwrite=TRUE) dbListTables(con) ## Obtain rows 11 to 20 from the newly created nihills table dbGetQuery(con, "select * from nihills limit 10 offset 10") dbDisconnect(con) ## Sec 14.5: Functions and operators -- Some Further Details ## Issues for the writing and use of functions ## ss 14.5.1: Function arguments ## The {args()} function args(write.table) # Version 2.10.0 of R ## The {...} argument x <- c(1,5,7); z <- c(4,9,10,NA); u <- 1:10 # Now do calculations that use x, z and u rm(x, z, u) ## ss 14.5.2: Character string and vector functions substring("abracadabra",3, 8) # Extract characters 3 to 8 nchar("abracadabra") # Count the number of characters strsplit("abracadabra", "r") # Split wherever "r" appears strsplit("abcd", split="") # Split into separate characters paste("ab","c","d", sep="") # Join together paste(c("a","b","c"), collapse="") # Join vector elements ## ss 14.5.3: Anonymous functions ssfun <- function(x)sum(x^2) sapply(elastic1, ssfun) # elastic1 is from the DAAG package sapply(elastic1, function(x)sum(x^2)) growthfun <- function(x)(x[9] - x[1])/x[1] sapply(austpop[, -c(1,10)], growthfun) sapply(austpop[, -c(1,10)], function(x)(x[9] - x[1])/x[1]) ## ss 14.5.4: Functions for working with dates (and times) # Electricity Billing Dates dd <- as.Date(c("2003-08-24","2003-11-23","2004-02-22","2004-05-03")) diff(dd) as.Date("1/1/1960", format="%d/%m/%Y") as.Date("1:12:1960",format="%d:%m:%Y") as.Date("1960-12-1") - as.Date("1960-1-1") as.Date("31/12/1960","%d/%m/%Y") julian(as.Date("1/1/1970","%d/%m/%Y")) julian(as.Date("1/1/2000","%d/%m/%Y")) dec1 <- as.Date("2004-12-1") format(dec1, format="%b %d %Y") format(dec1, format="%a %b %d %Y") ## Labeling of graph: data frame jobs (DAAG) startofmonth <- seq(from=as.Date("1Jan1995", format="%d%b%Y"), by="1 month", length=24) atdates <- seq(from=as.Date("1Jan1995", format="%d%b%Y"), by="6 month", length=4) datelabs <- format(atdates, "%b%y") xyplot(BC+Alberta ~ startofmonth, data=jobs, outer=TRUE, scales=list(x=list(at=atdates, labels=datelabs)), auto.key=list(columns=2, between=1)) dd <- as.Date(c("2003-08-24","2003-11-23","2004-02-22","2004-05-03")) weekdays(dd) months(dd) quarters(dd) ## ss 14.5.5: Creating Groups library(MASS) catBP <- cut(Pima.tr2$bp, breaks=4) table(catBP, exclude=NULL) ## ss 14.5.6: Logical operators c(TRUE, TRUE, FALSE) & c(FALSE, TRUE, FALSE) c(TRUE, TRUE, FALSE) && c(FALSE, TRUE, FALSE) ## A vector form of {if} statement ## Operators are functions "+"(2, 3) ## Sec 14.6: Factors fac <- factor(c("c", "b", "a")) for (i in fac) print(i) ## Ordered factors stress.level <- rep(c("low","medium","high"), 2) ordf.stress <- ordered(stress.level, levels=c("low", "medium", "high")) ordf.stress ordf.stress < "medium" ordf.stress >= "medium" ## Factor contrasts contr.treatment(3) contr.sum(3) cities <- factor(c("Melbourne","Sydney","Adelaide")) contr.sum(cities) ## *Tests for main effects in the presence of interactions? 
## *Tests for main effects in the presence of interactions?

## Sec 14.7: Missing Values
x <- c(1, 6, 2, NA)
is.na(x)        # TRUE for NAs, otherwise FALSE
x == NA         # All elements are set to NA
NA == NA
x <- c(1, NA, 6, 2, 10)
x > 4           # The second element will be NA
x[x > 4]        # NB: This generates a vector of length 3
## x[x > 4] <- c(101, 102)
x[!is.na(x) & x > 4] <- c(101, 102)
x

## Counting and identifying {NA}s -- the use of {table()}
with(nswdemo, table(trt, re74>0, useNA="ifany"))
fac_re74 <- with(nswdemo, factor(re74>0, exclude=NULL))
levels(fac_re74)
levels(fac_re74) <- c("0", "gt0", "<NA>")
with(nswdemo, table(trt, fac_re74))

## The removal of NAs

## {NA}s in modeling functions
options()$na.action   # Version 2.10.0, following startup

## Sorting and ordering, where there are NAs
x <- c(1, 20, 2, NA, 22)
order(x)              # By default, na.last=TRUE
x[order(x)]
sort(x)               # By default, na.last=NA
sort(x, na.last=TRUE)

## Sec 14.8: *Matrices and Arrays
xx <- matrix(1:6, ncol=3)   # Equivalently, matrix(1:6, nrow=2)
xx
dim(xx)
## Use as.vector()
x <- as.vector(xx)
## Alternatively, directly remove the dimension attribute
x <- xx
dim(x) <- NULL

## The extraction of submatrices
x34 <- matrix(1:12, ncol=4)
x34
x34[2:3, c(1,4)]       # Extract rows 2 & 3 and columns 1 & 4
x34[-2, ]              # Extract all rows except the second
x34[-2, -3]            # Omit row 2 and column 3
x34[2, ]               # The dimension attribute is dropped
x34[2, , drop=FALSE]   # Retain the dimension attribute

## Conversion of data frames and tables into matrices

## ss 14.8.1: Matrix arithmetic
## Set up example matrices G, H and B
G <- matrix(1:12, nrow=4); H <- matrix(112:101, nrow=4); B <- 1:3
G + H     # Element-wise addition (G & H both n by m)
G * H     # Element-wise multiplication
G %*% B   # Matrix multiplication (G is n by k; B is k by m)
## Set up example matrices X (square, full rank) and Y
X <- matrix(c(1,1,1, -1,0,1, 2,4,2), nrow=3)
Y <- matrix(1:6, nrow=3)
solve(X, Y)   # Solve XB = Y (X must be square)

## Computational Efficiency
xy <- matrix(rnorm(1500000), ncol=50)
dim(xy)
system.time(xy+1)      # user and system are processor times
xy.df <- data.frame(xy)
system.time(xy.df+1)

## ss 14.8.2: Outer products
outer(1:4, 1:10)
rbgshades <- outer(c("red","blue","green"), c("", paste(1:4)),
                   function(x,y)paste(x, y, sep=""))
rbgshades   # Display the matrix
plot(rep(0:4, rep(3,5)), rep(1:3, 5), col=rbgshades, pch=15, cex=8)

## ss 14.8.3: Arrays
x <- 1:24
dim(x) <- c(2, 12); print(x)
dim(x) <- c(3, 4, 2); print(x)

## Sec 14.9: Manipulations with Lists, Data Frames, Matrices and Time Series

## ss 14.9.1: Lists -- an extension of the notion of ``vector''
zz <- list("Shireen", "Peter", c("Luke","Amelia","Ted"), c(8,4,0))
zz[c(3,1)]
## Return a list whose only element is the vector c("Luke","Amelia","Ted")
zz[3]
## Return the vector c("Luke","Amelia","Ted")
zz[[3]]
duo <- list(family="Braun", names=c("Matthew","Phillip"), ages=c(14,9))
duo[["names"]]   # Alternatively, specify duo$names

## The dual identity of data frames
xyz <- data.frame(x=1:4, y=11:14, z=I(letters[1:4]))
xyz[1, ]                      # Returns a data frame, i.e., a list
xyz[1, , drop=TRUE]
unlist(xyz[1, ])              # Returns, here, a vector of atomic mode
unlist(xyz[1, , drop=TRUE])

## ss 14.9.2: Changing the shape of data frames (or matrices)
## Melting and casting
library(reshape)
Jobs <- melt(jobs, measure.vars=names(jobs)[1:6],
             variable_name="Region", id.vars="Date")
head(Jobs, 3)
jobsBack <- cast(Jobs, Date ~ Region)
head(jobsBack, 3)

## ss 14.9.3: *Merging data frames -- {merge()}
new.Cars93 <- merge(x=Cars93, y=Cars93.summary[, "abbrev", drop=FALSE],
                    by.x="Type", by.y="row.names")
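## A small self-contained sketch of merge(), independent of the MASS and
## DAAG data frames used above; the authors/books data frames and their
## column names are made up for illustration.
authors <- data.frame(surname=c("Tukey", "Venables", "Ripley"),
                      nationality=c("US", "Australia", "UK"))
books <- data.frame(name=c("Venables", "Ripley", "Ripley"),
                    title=c("MASS", "Pattern Recognition", "Spatial Statistics"))
merge(authors, books, by.x="surname", by.y="name")              # Rows with a match in both
merge(authors, books, by.x="surname", by.y="name", all.x=TRUE)  # Keep all authors; NA where no match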
## ss 14.9.4: Joining data frames, matrices and vectors -- {cbind()}
## Conversion of tables and arrays into data frames

## ss 14.9.5: The {apply} family of functions
## The {tapply()} function
## Compare tapply() with aggregate(): data frame kiwishade (DAAG)
with(kiwishade, tapply(yield, INDEX=list(block, shade), FUN=mean))
with(kiwishade, aggregate(yield, by=list(block, shade), FUN=mean))

## The {apply()} function

## The functions {lapply()} and {sapply()}
## Uses data frame rainforest (DAAG)
sapply(rainforest[, -7], range, na.rm=TRUE)

## ss 14.9.6: Splitting vectors and data frames into lists -- {split()}
with(Cars93.summary, split(No.of.cars, Max.passengers))
## Split data frame by Max.passengers (2nd column)
split(Cars93.summary[, -2], Cars93.summary[, 2])

## ss 14.9.7: Multivariate time series
jobts <- ts(jobs[, 1:6], start=1995, frequency=12)
colnames(jobts)
## Subseries through to the third month of 1995
window(jobts, end=1995+2/12)   # Rows are 1995+0/12, 1995+1/12, 1995+2/12
plot(jobts, plot.type="single")     # Use one panel for all
plot(jobts, plot.type="multiple")   # Separate panels
## Alternative code for Figure 2.9A; plot as time series
plot(jobts, plot.type="single", xlim=c(1995, 1997.2), lty=1:6, log="y",
     xaxt="n", xlab="", ylab="Number of Jobs")
## Move label positions so that labels do not overlap
ylast <- bounce(window(jobts, 1996+11/12), d=strheight("O"),
                log=TRUE)   # bounce() is from DAAG
text(rep(1996+11/12, 6), ylast, colnames(ylast), pos=4)
datlab <- format(seq(from=as.Date("1Jan1995", format="%d%b%Y"),
                     by="3 month", length=8), "%b%Y")
axis(1, at=seq(from=1995, by=0.25, length=8), datlab)

## Sec 14.10: Classes and Methods
## S3 methods and classes
print

## ss 14.10.1: Printing and summarizing model objects
elastic.lm <- lm(distance ~ stretch, data=elasticband)
elastic.sum <- summary(elastic.lm)

## ss 14.10.2: Extracting information from model objects
names(elastic.lm)
elastic.lm$coefficients
elastic.lm[["coefficients"]]
elastic.lm[[1]]
coef(elastic.lm)
x <- 1:5
class(x) <- "lm"   # Inappropriate assignment of class
## print(x)

## ss 14.10.3: S4 classes and methods
library(lme4)
hp.lmList <- lmList(o2 ~ wattsPerKg | id, data=humanpower1)
slotNames(hp.lmList)
slot(hp.lmList, "call")
hp.lmList@call

## Sec 14.11: Manipulation of Language Constructs

## ss 14.11.1: Model and graphics formulae
plot.mtcars <- function(xvar="disp", yvar="hp"){
    mt.form <- paste(yvar, "~", xvar)
    plot(formula(mt.form), data=mtcars)
}
## Plot using data frame mtcars (datasets)
names(mtcars)
plot.mtcars()
plot.mtcars(xvar="disp", yvar="mpg")

## Extraction of Variable Names from Formula Objects
all.vars(mpg ~ disp)
plot.mtcars <- function(form = mpg~disp){
    ## Include information that allows a meaningful label
    mtcars.info <- c(mpg= "Miles/(US) gallon",
                     cyl= "Number of cylinders",
                     disp= "Displacement (cu.in.)",
                     hp= "Gross horsepower",
                     drat= "Rear axle ratio",
                     wt= "Weight (lb/1000)",
                     qsec= "1/4 mile time",
                     vs= "V/S",
                     am= "Transmission (0 = automatic, 1 = manual)",
                     gear= "Number of forward gears",
                     carb= "Number of carburettors")
    ## all.vars() lists the response (left-hand side) first
    ylab <- mtcars.info[all.vars(form)[1]]
    xlab <- mtcars.info[all.vars(form)[2]]
    plot(form, xlab=xlab, ylab=ylab, data=mtcars)
}

## ss 14.11.2: The use of a list to pass arguments
mean(rnorm(20))
do.call("mean", args=list(x=rnorm(20)))
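## Another small sketch (not from the text) of passing an argument list to
## do.call(); plotArgs and the values in it are arbitrary illustrations.
plotArgs <- list(x=1:10, y=(1:10)^2, type="b", pch=16,
                 xlab="x", ylab="x squared")
do.call("plot", args=plotArgs)   # Equivalent to plot(x=1:10, y=(1:10)^2, ...)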
simulate.distribution <-
  function(distn="weibull", params=list(n=10)){
    ## Simulates one of: weibull, gaussian, logistic
    if(distn=="weibull" & !("shape" %in% names(params)))
        params <- c(shape=1, params)
        ## weibull requires a default shape argument.
    ## =====================================================
    ## Choose the function that will generate the random sample
    rfun <- switch(distn,
                   weibull = rweibull,
                   normal = rnorm,
                   logistic = rlogis)
    ## Call rfun().  Use of do.call() makes it possible to give
    ## the argument list as a list of named values.
    ## Pass shape argument (or NULL), plus n = # of numbers
    do.call("rfun", args=params)
  }
simulate.distribution()
plot(density(simulate.distribution("normal", params=list(n=100))))
plot(density(simulate.distribution("weibull",
                                   params=list(n=100, scale=0.5))))
mean.call <- call("mean", x=rnorm(5))
eval(mean.call)
eval(mean.call)   # Same result again: rnorm(5) was evaluated when the call was created
mean.call

## ss 14.11.3: Expressions
local(a+b*x+c*x^2, envir=list(x=1:4, a=3, b=5, c=1))

## ss 14.11.4: Environments
test <- function()as.character(sys.call())
test()
newtest <- test   # Create a copy with a different name
newtest()

## Automatic naming of a file that holds function output
hcopy <-
  function(width=2.25, height=2.25, pointsize=8){
    funtxt <- sys.call(1)
    fnam <- paste(funtxt, ".pdf", sep="")
    print(paste("Output is to the file '", fnam, "'", sep=""))
    pdf(file=fnam, width=width, height=height, pointsize=pointsize)
  }
fig1 <- function(){
    hcopy()   # Call with default arguments
    curve(sin, -pi, 2*pi)
    dev.off()
}

## ss 14.11.5: Function environments, and lazy evaluation
## Lazy evaluation
lazyfoo <- function(x=4, y = x^2)y
lazyfoo()
lazyfoo(x=3)
# lazyfoo(y=x^3)   # We specify a value for y in the function call
# x <- 9           # Now, in the parent environment, x=9
# lazyfoo(y=x^3)

## Example -- a function that identifies objects added during a session
dsetnames <- objects()
additions <- function(objnames = dsetnames){
    newnames <- objects(envir=.GlobalEnv)
    existing <- newnames %in% objnames
    newnames[!existing]
}
additions(dsetnames)

## Sec 14.12: *Creation of R Packages
system.file("misc/ViewTemps.RData", package="DAAG")
## package.skeleton(name = "ViewTemps")

## Namespaces

## Sec 14.13: Document Preparation -- {Sweave()} and {xtable()}
## Sweave("sec1-1.Rnw", keep.source=TRUE)
# Actually, Sweave("sec1-1") is sufficient.
# The argument keep.source=TRUE preserves the code layout
# and ensures that comments are retained.
R.home(component="share/texmf/Sweave.sty")

## Sec 14.14: Further Reading
citation()
citation("DAAG")

## Vignettes
vignette()                 # All vignettes in all installed packages
## vignette(package="grid")
## vignette("viewports")   # Equivalent to vignette(topic="viewports")

## Use of split() and sapply(): data frame science (DAAG)
with(science, sapply(split(school, PrivPub),
                     function(x)length(unique(x))))
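## A brief sketch of the same count done with tapply() and aggregate(),
## assuming the science data frame (DAAG) used immediately above; shown
## only to connect this example back to Subsection 14.9.5.
with(science, tapply(school, PrivPub, function(x)length(unique(x))))
with(science, aggregate(school, by=list(PrivPub=PrivPub),
                        FUN=function(x)length(unique(x))))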
cat("produce plots \n") # identify the points which have many such neighbors ns <- 8 plot(x,y) points(x[nnear > ns], y[nnear > ns], col="red", pch=16) ##3 End n <- 10000; system.time(sd(rnorm(n))) library(plyr) aaply(.data=UCBAdmissions, .margins=1:2, .fun=sum) adply(.data=UCBAdmissions, .margins=1:3, .fun=identity) library(DAAG) daply(.data=tinting, .variables=c("sex","agegp"), .fun=length) ## References ## References for further reading ## References ## References for further reading