# Getting Started with R Graphics # November 2014 # StatLab@UVa Library # Clay Ford # Helpful R Studio commands ----------------------------------------------- # Description Windows & Linux Mac # ---------------------------------------------------------------- # Run current line Ctrl+Enter Command+Enter # Previous plot Ctrl+Shift+PageUp Command+Shift+PageUp # Next plot Ctrl+Shift+PageDown Command+Shift+PageDown # Packages ---------------------------------------------------------------- # we'll use the following packages today. # only submit these lines if you don't already have them installed. install.packages("ggplot2") install.packages("RColorBrewer") install.packages("googleVis") install.packages("likert") # R Graphics Framework ---------------------------------------------------- # Today we'll use data that come with R (see datasets package); # airquality data: # Daily air quality measurements in New York, May to September 1973. head(airquality) # three types of plotting functions: # (1) High-level plotting - create the graphic # create a scatterplot (High-level plot) plot(x=airquality$Temp, y=airquality$Ozone) # (2) Low-level plotting - add extra information points(x=mean(airquality$Temp,na.rm=T), y=mean(airquality$Ozone,na.rm=T), pch=19,col="red") # (3) interactive plotting - add/extract information # label point(s) you click (default is row number) identify(x=airquality$Temp,y=airquality$Ozone) # show coordinates of where I click (extract information) locator(1) # add a point or line (add and extract information) locator(1, type="p") locator(3, type="o") # Using Graphics Parameters # Use the par() function to access and modify the list of graphics parameters # for the current graphics device. # The "graphics device" is usually the window your graphic appears in. par() # see all graphics parameters help(par) # see what the parameters mean # common graphics parameter to set: mfrow (multi-frame row); # allows you to plot more than one graph in window; # set this OUTSIDE of plotting functions; par("mfrow") # see current setting; 1 x 1 plotting area par(mfrow = c(1, 2)) # set to 1 row x 2 columns (ie, 2 graphs side-by-side) # plot histograms of Temp and Ozone (high-level plots) hist(airquality$Temp) hist(airquality$Ozone) # now go up and re-do the scatter plot and see what happens. # notice the mfrow setting persists; # need to reset it. par(mfrow = c(1, 1)) # set to 1 x 1 (ie, the default) # Or just close the graphics device; that restores all par() defaults; # "clear all" button in R Studio # some par() settings can be set INSIDE plotting functions hist(airquality$Temp) # change the size of the title; # cex.main is a graphics parameter, not an argument for hist() hist(airquality$Temp, cex.main=1.5) # active during the execution of the function only par("cex.main") # see current setting; still set to 1.2 # R Graphics parameters... # some parameters can only be set by a call to par() (example: mfrow) # others can be set as arguments to high-level plotting functions (example: cex.main) # Common graphs in R ------------------------------------------------------ # --- SCATTER PLOT --- # # use data that comes with R # iris: measurements of sepal length and width and petal length and width # from each of 3 species of iris head(iris) str(iris) # basic scatter plot plot(iris$Petal.Width, iris$Petal.Length) # typing name of data before each variable gets tedious; # can use "formula" notation with plot (y ~ x) plot(Petal.Length ~ Petal.Width, data=iris) # let's customize the plot using arguments # with title and labels and blue dots plot(Petal.Length ~ Petal.Width, data=iris, main="Petal Length vs. Petal Width\nIris Dataset", xlab="Petal Length", ylab="Petal Width", pch=20, col="blue", ylim=c(0,7), xlim=c(0,2.5)) # pch = plotting character; to see what first 25 look like plot(1:25, pch=1:25) # 26:31 - unused (and ignored). # 32:127 - ASCII characters. # scatter plot by Species # col = Species plot(Petal.Length ~ Petal.Width, data=iris, main="Petal Length vs. Petal Width\nIris Dataset", xlab="Petal Length", ylab="Petal Width", pch=20, col=Species, ylim=c(0,7), xlim=c(0,2.5)) # color codes mapped to palette palette() # see available colors (657!) colors() # customze your palette palette(c("gray20","gray40","gray60")) palette() # scatter plot by Species - grayscale plot(Petal.Length ~ Petal.Width, data=iris, main="Petal Length vs. Petal Width\nIris Dataset", xlab="Petal Length", ylab="Petal Width", pch=20, col=Species, ylim=c(0,7), xlim=c(0,2.5)) # add legend with a low-level legend() function legend(x="topleft",legend=levels(iris$Species), pch=20, col=1:3) # In legend can also set x = bottomright", "bottom", "bottomleft", # "left", "top", "topright", "right", "center". palette("default") # restore color palette to default # --- HISTOGRAM/DENSITY PLOT --- # # visualize distribution of continuous data # use faithful data set; Waiting time between eruptions and the duration of the # eruption for the Old Faithful geyser in Yellowstone National Park. hist(faithful$eruptions) # length of eruptions in mins hist(faithful$eruptions, breaks=20) # add a rug (low-level function) # draw data on x axis as small vertical lines rug(faithful$eruptions) # add points to identify mean and median # cex=2 means 2 times bigger than the default points(x = mean(faithful$eruptions), y = 0, pch=19, col="blue", cex=2) points(x = median(faithful$eruptions), y = 0, pch=19, col="red", cex=2) # save hist() output values; add counts to histogram sh <- hist(faithful$eruptions, breaks=20, main="Distribution of Old Faithful Eruptions") sh # the text function adds text to a graph # the labels argument specifies the text text(x=sh$mids, y=sh$counts+1, labels=sh$counts, col = "blue3") # plot proportions (ie, a true histogram) with freq=FALSE hist(faithful$eruptions, breaks=20, freq=FALSE, main="Distribution of Old Faithful Eruptions", xlab="Eruption time in mins") # add kernel density estimate (ie, smooth line) lines(density(faithful$eruptions)) # adjust the bandwidth if you want lines(density(faithful$eruptions, adjust = 0.5), col="red") # more wiggly lines(density(faithful$eruptions, adjust = 0.25), col="blue") # even more wiggly lines(density(faithful$eruptions, adjust = 1.25), col="orange") # less wiggly # add a legend legend(x = "topright",legend = c(0.25, 0.5, 1, 1.25), col = c("blue", "red", "black", "orange"), lty=1, title = "bandwidth", cex=0.8) # --- BOX PLOT --- # # distribution of numeric values by groups boxplot(Sepal.Length ~ Species, data=iris) # with a "notch" and y label boxplot(Sepal.Length ~ Species, data=iris, ylab="Petal Width (cm)", notch = TRUE) # if notch is TRUE, a notch is drawn in each side of the boxes. # If the notches of two plots do not overlap this is 'strong evidence' # that the two medians differ # an extended graphics example... # make boxes narrower, add scale for inches on right, and add point for mean; # first set margins using par(); need more space on right side; # mar = number of lines of margin to be specified (bot, left, top, right); # simulateously set new, and save old, par settings par("mar") op <- par(mar=c(5.1, 4.1, 4.1, 5.1)) op # old settings saved as "op" par("mar") # current settings boxplot(Sepal.Length ~ Species, data=iris, ylab="Sepal Length (cm)", xlab="Species", main="Sepal Length by Species", boxwex = 0.25, # make boxes thinner col="gray") # add axis for inches using low-level axis() function # first create "pretty breakpoints" # multiply by 0.393701 to convert cm to inches wdInches <- pretty(range(iris$Sepal.Length*0.393701)) wdInches # now add axis: 4=right side, at=where to place tick marks, # lab=what to put on the axis axis(side=4, at=wdInches/0.393701, lab=wdInches) # finally add axis label using low-level mtext() function mtext("Sepal Length (in)", side=4, line=3) # add means to box plot using low-level points() function meanWidth <- aggregate(Sepal.Length ~ Species, data=iris, mean) meanWidth points(1:3, meanWidth[,2], pch=18) # to restore to original settings, call par on "op" object we created earlier: par(op) # verify settings restored par("mar") # -- STRIPCHART --- # # one dimensional scatter plots; good alternative to boxplots when data is small # Data: experiment to measure effectiveness of feed supplements on growth rate # of chickens head(chickwts) stripchart(weight ~ feed, data=chickwts) # add jitter (a small amount of "noise") stripchart(weight ~ feed, data=chickwts, method="jitter") # rotate and change dots stripchart(weight ~ feed, data=chickwts, method="jitter", vertical=TRUE, pch=1, col="grey60", main="Chick Weight by Feed") # add mean with standard error bars # calculate mean for each group # tapply: (1) the measure (2) the grouping variable (3) the statistic meanWeight <- tapply(chickwts$weight, chickwts$feed, mean) # calculate SE for each grpup seWeight <- tapply(chickwts$weight, chickwts$feed, function(x)sd(x)/sqrt(length(x))) # add mean to graph with low-level points() function points(1:6,meanWeight,pch=16) # add 2x SE bars to graph with low-level segments() function; # draws line from (x0,y0) to (x1,y1) segments(x0=1:6, y0=meanWeight-2*seWeight, x1=1:6, y1=meanWeight+2*seWeight) # --- BAR CHART/BAR GRAPH --- # # often used for graphing counts library(MASS) # has a dataset I want to use head(Cars93) # Data from 93 Cars on Sale in the USA in 1993 # bar graph of counts of Car Type Cars93$Type table(Cars93$Type) # barplot() # first argument: vector or matrix of values barplot(table(Cars93$Type)) # save table types <- table(Cars93$Type) # change y axis limits barplot(types, ylim=c(0,25)) # draw a box around the plot and add title box(which="plot") title("Counts of Car Type") # let's include Origin in our graph table(Cars93$Origin, Cars93$Type) types2 <- table(Cars93$Origin, Cars93$Type) barplot(types2, beside=TRUE) # adjacent # barplot has built-in legend argument # add legend, make y axis go to 20, set colors barplot(types2, beside = TRUE, legend=levels(Cars93$Origin), ylim=c(0,20), xlab="number of cars", main="Counts of Car Type by Origin", col=c(4,8)) # --- DOT CHART --- # # aka, Cleveland's dot chart # some people prefer these to bar charts # first argument: either a vector or matrix of numeric values # again we'll use the Cars93 dataset summary(Cars93$Type) # dotchart doesn't work on tables dotchart(summary(Cars93$Type)) # add labels and sort by count dotchart(sort(summary(Cars93$Type)), pch=16, main="Counts of Car Type", xlim=c(0,25)) # --- PIE CHARTS --- # help(pie) # from the help file: # "Pie charts are a very bad way of displaying information. # The eye is good at judging linear measures and bad at judging relative areas. # A bar chart or dot chart is a preferable way of displaying this type of data." # --- TIME SERIES PLOT --- # # back to airquality data head(airquality) # need to wrangle month and day columns into a single vector of dates; # life is easier in R if date is formatted as YYYY-MM-DD date <- paste("1973", airquality$Month, airquality$Day, sep="-") head(date) # convert date to Date object and add to airquality data frame airquality$date <- as.Date(date) head(airquality) str(airquality) # basic time series plot plot(Temp ~ date, data=airquality, type="l") plot(Temp ~ date, data=airquality, type="b") plot(Temp ~ date, data=airquality, type="b", pch=20) plot(Temp ~ date, data=airquality, type="o", pch=20) # --- LINE GRAPH --- # # Pharmacokinetics of Indomethacin (6 subjects) # time: a numeric vector of times at which blood samples were drawn (hr). # conc: a numeric vector of plasma concentrations of indomethacin (mcg/ml). head(Indometh, n=12) # plot conc over time by subject # type="l" means plot a line plot(conc ~ time, data=Indometh, type="l") # not what we want! # let's subset for subject 1 plot(conc ~ time, data=Indometh, subset= Subject==1, type="l") # and now add other subjects... # ALERT! Notice the the lines exceed the y axis! lines(conc ~ time, data=Indometh, subset= Subject==2, type="l", col=2) lines(conc ~ time, data=Indometh, subset= Subject==3, type="l", col=3) lines(conc ~ time, data=Indometh, subset= Subject==4, type="l", col=4) lines(conc ~ time, data=Indometh, subset= Subject==5, type="l", col=5) lines(conc ~ time, data=Indometh, subset= Subject==6, type="l", col=6) # let's fix the y axis and work smarter, not harder plot(conc ~ time, data=Indometh, subset= Subject==1, type="l", ylim=range(conc), main="Concentrations over Time by Subject") for(i in 2:6){ lines(conc ~ time, data=Indometh, subset= Subject==i, type="l", col=i) } # add a legend legend("topright",legend=unique(Indometh$Subject),lty=1, col=1:6, title="Subjects") # we can work even smarter using ggplot! (coming up shortly) # Saving R graphics ------------------------------------------------------- # R graphs can be saved multiple ways. # In R Studio: Export...Save as Image... # In R GUI: File...Save as... # Available formats include PNG, TIF, JPG, PDF, BMP, EPS # can also use code to save graphic files; # useful for automating creation of lots of graphs # first issue the command; "opens" a graphing device; nothing visible happens # file will be saved in working directory # functions include png(), jpeg(), bmp(), tiff() png(file = "iris1.png") # then issue plotting commands; notice it doesn't appear in R Studio window stripchart(Petal.Width ~ Species, data=iris, method="jitter") # then dev.off() # Example: create and save stripcharts of all iris measurements in your working # directory for(i in 1:4){ png(file = paste0("sc",names(iris)[i],".png")) stripchart(iris[,i] ~ Species, data=iris, method="jitter", main=names(iris)[i]) dev.off() } # clean up workspace rm(list = ls()) # back to presentation... # ggplot2 ----------------------------------------------------------------- # install.packages("ggplot2") # if you don't aready have library(ggplot2) # --- SCATTER PLOT --- # ggplot(iris, aes(x = Petal.Width, y = Petal.Length)) + geom_point() # scatter plot by group # add color=Species to aes() ggplot(iris, aes(x = Petal.Width, y = Petal.Length, color=Species)) + geom_point() # add shape=Species to aes() ggplot(iris, aes(x = Petal.Width, y = Petal.Length, shape=Species)) + geom_point() # shape and color ggplot(iris, aes(x = Petal.Width, y = Petal.Length, shape=Species, color=Species)) + geom_point() # mapping species to color and shape and making points bigger and jittered ggplot(iris, aes(x = Petal.Width, y = Petal.Length, color=Species, shape=Species)) + geom_point(size=3, position = "jitter") # mapping species to color and Sepal.Length to size # balloon scatter plot ggplot(iris, aes(x = Petal.Width, y = Petal.Length, color=Species, size=Sepal.Length)) + geom_point(position = "jitter") # fix the labels and add a title ggplot(iris, aes(x = Petal.Width, y = Petal.Length, color=Species, size=Sepal.Length)) + geom_point(position = "jitter") + labs(y="Petal Length (cm)", x= "Petal Width (cm)", size="Sepal Length", title="Iris Balloon Plot") # adding fitted regression lines with CI and save p <- ggplot(iris, aes(x = Petal.Width, y = Petal.Length, color=Species)) + geom_point() + geom_smooth(method="lm") + ggtitle("Petals of the Iris data set") p # "zoom in" on plot with coord_cartesian() p + coord_cartesian(xlim=c(0,0.75), ylim = c(0, 2)) # --- BOX PLOT --- # ggplot(iris, aes(x=Species, y=Petal.Width)) + geom_boxplot() # skinnier box plot with mean included ggplot(iris, aes(x=Species, y=Petal.Width)) + geom_boxplot(width=0.5) + stat_summary(fun.y = mean, geom="point", color="red") # --- STRIP CHART --- # ggplot(chickwts, aes(x=feed, y=weight)) + geom_point() ggplot(chickwts, aes(x=feed, y=weight)) + geom_point(position = position_jitter(w = 0.1, h = 0)) # with confidence limits # first need to calculate means and standard errors fMean <- tapply(chickwts$weight, chickwts$feed, mean) fSE <- tapply(chickwts$weight, chickwts$feed, function(x) sd(x)/sqrt(length(x))) # now create data frame; recall that ggplot requires data frame chick2 <- data.frame(feed=names(fMean), fMean, fSE) # now plot strip chart and error bars # notice we call two data frames ggplot(chickwts, aes(x=feed, y=weight)) + geom_point(position = position_jitter(w = 0.1, h = 0)) + geom_point(aes(x=feed, y=fMean), data=chick2, color="red", size=3) + geom_errorbar(aes(x=feed, y=fMean, ymin=fMean - 2*fSE, ymax=fMean + 2*fSE), data=chick2, width=0.1) # --- DOT PLOT --- # # cleveland dot plot # use state.x77 data; it's a matrix, so need to convert to data frame state.x77 class(state.x77) states <- data.frame(state=rownames(state.x77),state.x77, row.names = NULL) head(states) # population dot plot ggplot(states, aes(x=Population, y=state)) + geom_point() # in order of population ggplot(states, aes(x=Population, y=reorder(state, Population))) + geom_point(size=3) + ylab("State") + ggtitle("Population of US States in 1975") # area dot plot ggplot(states, aes(x=Area, y=reorder(state, Area))) + geom_point() + ylab("State") + ggtitle("Land Area of US States") # fix the x-axis tick marks library(scales) # for comma() function ggplot(states, aes(x=Area, y=reorder(state, Area))) + geom_point() + scale_x_continuous(breaks=pretty(range(states$Area)), labels=comma(pretty(range(states$Area)))) + ylab("State") + xlab("Area (sq miles)") + ggtitle("Land Area of US States") # --- HISTOGRAM --- # ggplot(faithful, aes(x=eruptions)) + geom_histogram() # note the warnings; developer firmly believes you should not accept default # binwidth. use the binwdith argument to change # with different color bars ggplot(faithful, aes(x=eruptions)) + geom_histogram(fill="white", color="black", binwidth=0.1) # "true" histogram - density instead of counts # ..density.. refers to a variable generated by geom_histogram ggplot(faithful, aes(x=eruptions, y = ..density..)) + geom_histogram() # with a density curve ggplot(faithful, aes(x=eruptions, y = ..density..)) + geom_histogram(fill="white", color="black", binwidth=0.1) + geom_density(adjust=0.5) # mapping bin counts to color of histogram # ..count.. refers to a variable generated by geom_histogram ggplot(faithful, aes(x=eruptions, fill = ..count..)) + geom_histogram(binwidth=0.2) # back to iris data # in groups (vertical facet) ggplot(iris, aes(x=Petal.Length)) + geom_histogram(binwidth=0.2) + facet_grid(Species ~ .) # vertical ~ horizontal # in groups (horizontal facet) ggplot(iris, aes(x=Petal.Length)) + geom_histogram(binwidth=0.2) + facet_grid(. ~ Species) # overlayed histograms with transparency ggplot(iris, aes(x=Petal.Length, fill=Species)) + geom_histogram(position="identity", alpha=0.4, binwidth=.1) # position="identity" needed for overlapping; without they get stacked; # alpha = transparency setting # --- VIOLIN PLOT --- # # compare multiple data distributions ggplot(iris, aes(x=Species, y=Petal.Length)) + geom_violin() # violin plot with boxplot ggplot(iris, aes(x=Species, y=Petal.Length)) + geom_violin() + geom_boxplot(width=0.1) # --- LINE GRAPH --- # names(Indometh) # recall how we did this above with a loop ggplot(Indometh, aes(x=time,y=conc, group=Subject)) + geom_line() # with color ggplot(Indometh, aes(x=time,y=conc,group=Subject, color=Subject)) + geom_line() # put legend in numeric order ggplot(Indometh, aes(x=time,y=conc,group=Subject, color=Subject)) + geom_line() + scale_color_discrete(limits=1:6) # single line graph of means at each time point with SE bars # first calculate means and SEs tMean <- tapply(Indometh$conc, Indometh$time, mean) tSE <- tapply(Indometh$conc, Indometh$time, function(x)sd(x)/sqrt(length(x))) # ggplot requires data in data frame Indo2 <- data.frame(time=unique(Indometh$time), tMean, tSE) # now ready to create plot ggplot(Indo2, aes(x=time,y=tMean)) + geom_line() + geom_point(size=2) + geom_errorbar(aes(ymin=tMean-2*tSE, ymax=tMean+2*tSE), width=0.2) # --- BAR GRAPH --- # ggplot(mtcars, aes(x=factor(cyl))) + geom_bar() # by group Transmission - stacked (0 = automatic, 1 = manual) ggplot(mtcars, aes(x=factor(cyl), fill=factor(am))) + geom_bar() # by group (transmission) - dodged (side-by-side) ggplot(mtcars, aes(x=factor(cyl), fill=factor(am))) + geom_bar(position="dodge", alpha=0.5) + scale_fill_discrete(labels=c("automatic", "manual")) + labs(fill="Transmission") + xlab("Cylinder") # by group (transmission) - dodged, with different color # and fix the y axis so it only displays whole numbers ggplot(mtcars, aes(x=factor(cyl), fill=factor(am))) + geom_bar(position="dodge") + scale_y_discrete(breaks=seq(0,14,2)) + scale_fill_brewer(palette="Paired", labels=c("automatic", "manual")) + labs(fill="Transmission") + xlab("Cylinder") # help(RColorBrewer) for different palettes # others to try: # SEQUENTIAL PALETTES (suited to ordered data that progress from low to high): # Blues BuGn BuPu GnBu Greens Greys Oranges OrRd PuBu PuBuGn PuRd Purples RdPu # Reds YlGn YlGnBu YlOrBr YlOrRd # DIVERGING PALETTES (equal emphasis on mid-range critical values and extremes): # BrBG PiYG PRGn PuOr RdBu RdGy RdYlBu RdYlGn Spectral # QUALITATIVE PALETTES (used to create the primary visual differences between classes): # Accent Dark2 Paired Pastel1 Pastel2 Set1 Set2 Set3 # See the online help for ggplot2; full of examples # Electronic version of R Graphics Cookbook available from UVa Library # tidy up before moving on rm(list=ls()) # Plotting functions in packages ------------------------------------------ # many R packages include special plotting functions. # these are usually functions that do much of the plotting customization for you. # Example: the likert package # Functions to analyze and visualize likert type items # install.packages("likert") # if you don't aready have library(likert) # let's use data that comes with the likert package: # North American (i.e. Canada, Mexico, and United States) results from the # 2009 Programme of International Student Assessment data(pisaitems) # load data into workspace names(pisaitems) head(pisaitems) str(pisaitems) # all factors # grab a subset of data: all questions that contain "ST25Q" read <- subset(pisaitems, select = substr(names(pisaitems), 1,5) == 'ST25Q') head(read); dim(read) names(read) <- c("Magazines", "Comic books", "Fiction", "Non-fiction books", "Newspapers") head(read) lread <- likert(read) # provides various statistics about a set of likert items lread # percents sum to 100 for each row class(lread) # "likert" object plot(lread) plot(lread, type="density") plot(lread, type="heat") # see plot methods for R objects; notice plot.likert methods(plot) # notice plot.likert help(plot.likert) # Example: googleVis package # allows users to create web pages with interactive charts based on R data frames # install.packages("googleVis") library(googleVis) data(Fruits) # data set that comes with googleVis package Fruits str(Fruits) # idvar = column name of data with the subject to be analysed # timevar = column name of data which shows the time dimension M <- gvisMotionChart(Fruits, idvar="Fruit", timevar="Year") class(M) # gvis object # Plotting a gvis object opens a browser window # and requires Flash and Internet connection to display the visualisation. plot(M) # map of hurrican sandy # data source: http://www.wunderground.com/hurricane/atlantic/2012/Post-Tropical-Cyclone-Sandy # copied and pasted table into Excel, saved as CSV sandy <- read.csv(file="http://static.lib.virginia.edu/statlab/materials/workshops/R_Graphics/sandy.csv", header=T) sandy$LatLong <- paste(sandy$Lat,sandy$Lon,sep=":") sandy$Tip <- paste(sandy$Date,sandy$Storm.Type) head(sandy) sandyMap <- gvisMap(sandy, locationvar="LatLong" , tipvar="Tip", options=list(showTip=TRUE, showLine=TRUE, enableScrollWheel=TRUE, mapType='hybrid', useMapTypeControl=TRUE, width=800,height=400)) plot(sandyMap) # BEGIN TIME-PERMITTING BONUS MATERIAL! # dynamite plunger plots (or detonator plots) ----------------------------- # comparing means with SE bars # some people frown on these; does not convey distribution of data # generate some data set.seed(5) a <- rchisq(100,df=2) b <- rnorm(100,mean=2) # create a bar plot of the means bplt <- barplot(c(mean(a),mean(b)), ylim=c(0,3),names.arg=c("a","b")) bplt # calculate standard errors of a and b se.a <- sd(a)/sqrt(100) se.b <- sd(b)/sqrt(100) # add standard error bars to barplot # draw lines from (xo,y0) to (x1,y1) arrows(x0=bplt,y0=c(mean(a),mean(b))-2*c(se.a,se.b), y1=c(mean(a),mean(b))+2*c(se.a,se.b), code=3,angle=90) # compare a and b with boxplots par(mfrow=c(1,2)) boxplot(a,xlab="a",ylim=c(min(a,b),max(a,b))) boxplot(b,xlab="b",ylim=c(min(a,b),max(a,b))) # polygon function -------------------------------------------------------- # draws a shape; the first two arguments are the x and y coordinates of a closed # figure. It is assumed that the polygon is to be closed by joining the last # point to the first point. # create a blank plotting window plot(x=c(1,10), y=c(1,10), type="n", xlab="", ylab="", axes = FALSE) ticks <- 1:10 # define tick marks axis(side = 1, at = ticks) # plot tick marks on x axis (bottom) axis(side = 2, at = ticks) # plot tick marks on y axis (left) box(which="plot") # draw a box around the plot # now draw a box in the plotting area # points at (2,2), (3,2), (3,3), (2,3) polygon(x=c(2,3,3,2),y=c(2,2,3,3),col="grey") points(x=c(2,3,3,2),y=c(2,2,3,3), pch=16) # can be used for shading areas under a curve # plot a standard normal curve x <- seq(-3,3,by=0.01) y <- dnorm(x) plot(x,y, type="l") # shade bottom 5% of curve, i.e., P(Z < -1.64) bot <- seq(-3,qnorm(0.05),length=30) # x values for polygon top <- dnorm(bot) # y values for polygon # recall: polygon() joins last point to the first point; # need to add two extra points that sit on the x axis polygon(x=c(-3,bot,qnorm(0.05)), y=c(0,top,0),col="red") text(0,0.1,"P(Z < -1.64) = 0.05") # 3D plots ---------------------------------------------------------------- # try the scatterplot3d package # base R 3D graphing functions # create some data x <- seq(-3,3, length=50) y <- x z <- outer(x, y, function(x,y)(y^2 - x^2)) # returns a 50 x 50 matrix # saddle contour(x, y, z) # contour plotd contour(x, y, z, nlevels=30) # contour plot filled.contour(x, y, z, nlevels = 30) persp(x, y, z) # 3d plot # 3d plot rotated 30 degrees "to the left" with color and shading persp(x, y, z, theta=30, col = "lightblue", shade = 0.75) image(x, y, z) # heat map # Anscombe's Quartet ------------------------------------------------------ # We need tools to easily visualize data in R, especially before analysis. # The following will help demonstrate why. # load the anscombe dataset that comes with R data(anscombe) names(anscombe) # x1 goes with y1, x2 goes with y2, etc. # x1-x4 are "predictors" # y1-y4 are "responses" options(digits=4) # limit display of digits to 4 # let's statistically investigate sapply(anscombe[,1:4], mean) # mean of x's sapply(anscombe[,1:4], sd) # sd of x's sapply(anscombe[,5:8], mean) # mean of y's sapply(anscombe[,5:8], sd) # sd of y's cor(anscombe[,1],anscombe[,5]) # correlation of x1 and y1 cor(anscombe[,2],anscombe[,6]) # correlation of x2 and y2 cor(anscombe[,3],anscombe[,7]) # correlation of x3 and y3 cor(anscombe[,4],anscombe[,8]) # correlation of x4 and y4 # they're all almost identical. # regress y1 on x1, y2 on x2, etc. m <- list() for(i in 1:4){ m[[i]] <- lm(anscombe[,i+4] ~ anscombe[,i], data=anscombe) } m # all coefficients are identical # let's graphically investigate par(mfrow=c(2,2)) # 2 x 2 plotting area # this loop will plot y1 vs x1, y2 vs x2, etc. for(i in 1:4){ # create blank plot: type="n" (high-level function) plot(x=c(0,20), y=c(0,20), type="n", xlab=paste("x",i,sep=""), ylab=paste("y",i,sep=""), axes = FALSE) # add custom axis (low-level functions) ticks <- 0:20 # define tick marks axis(side = 1, at = ticks) # plot tick marks on x axis axis(side = 2, at = ticks) # plot tick marks on y axis box(which="plot") # draw a box around the plot # plot x1 vs y1, x2 vs y2, etc, with linear regression line points(anscombe[,i],anscombe[,i+4]) abline(m[[i]]) } # RESULT: four very different plots with almost identical statistics. # Always a good idea to graph your data before analyzing it. # see http://en.wikipedia.org/wiki/Anscombe%27s_quartet # see also the R help file: help(anscombe) # Interactive 3D ---------------------------------------------------------- # Spinning 3D scatterplot # Install and load the "rgl" package ("3D visualization # device system (OpenGL)") # NOTE: This will cause RStudio to crash when graphics # window is closed. Instead, run this in the standard, # console version of R. install.packages("rgl") require("rgl") require("RColorBrewer") plot3d(iris$Petal.Length, # x variable iris$Petal.Width, # y variable iris$Sepal.Length, # z variable xlab = "Petal.Length", ylab = "Petal.Width", zlab = "Sepal.Length", col = brewer.pal(3, "Dark2")[unclass(iris$Species)], size = 8) # conditioning plots ------------------------------------------------------ # multiple plots with different subsets of data # mtcars: Motor Trend Car Road Tests # plot mpg versus horsepower conditional on cylinders coplot(mpg ~ hp | factor(cyl), data=mtcars, rows=1) # more information on colors ---------------------------------------------- # see what numbers refer to which color palette() # you can change the palette palette(rainbow(6)) # set palette to 6 rainbow colors palette() # define a new palette using the R ColorBrewer package install.packages("RColorBrewer") library(RColorBrewer) help(RColorBrewer) # brewer.pal() makes the color palettes from ColorBrewer available as R palettes. palette(brewer.pal(7, "Blues")) # load 7 colors from the RColorBrewer "Blues" palette palette() barplot(1:7,col=palette()) # trick to see colors palette("default") # return to default