## R lesson on APIs ## Field to Database workshop ## 3/12/2015 at iDigBio ## ## Line with one "#" in front are R code you can uncomment to follow along. ## Lines with two "##" are comments and should be left in place. Beware that ## some of this code calls external APIs which may be slow. Resist the urge ## to uncomment everything! ## ## Pre-work: ## ## 1) Get a NOAA web token here: http://www.ncdc.noaa.gov/cdo-web/token ## ## 2) Create a plot.ly account here: http://plot.ly/ ## ## 3) Install the following R packages: rnoaa, ridigbio, plotly, gridExtra ## ## On Ubuntu, needed at least these libraries: apt-get install libgdal-dev ## libproj-dev to install rnoaa ## ## Install packages ## You almost certainly have these installed already, please don't re-install ## them. ##install.packages("devtools") ##install.packages("jsonlite") ##install.packages("grid") ##install.packages("ggplot2") ## You will need to install these #library("devtools") #install.packages("gridExtra") #install.packages("rnoaa") #install_github("iDigBio/ridigbio") #install_github("ropensci/plotly") ## Load survey & species data from Portal, AZ species <- read.csv("data/species.csv", stringsAsFactors=FALSE) surveys <- read.csv("data/surveys.csv", stringsAsFactors=FALSE) ## What are we interested in? gen <- "Chaetodipus" sp <- "penicillatus" ## Do we have any in our Portal survey data? sp_row <- subset(species, genus==gen & species==sp) survey_cha_pen <- subset(surveys, species_id==sp_row$species_id) ## CHALLENGE ## ## How many Chaetodipus penicillatus were trapped in Portal, AZ? ## Let's look at the month they were collected to see if that reflects this ## species being dormant during winter. hist(survey_cha_pen$month, breaks=1:12) ## Wow, sure does. But what if our scientist was lazy and didn't go out to the ## field when it was cold? How can we gauge sampling effort from the data? hist(surveys$month, breaks=1:12) ## Now it's time to compare this to a wider data set. Recall the hypothesis is ## that these mice are less active in "southern Arizona". Let's look for mice ## collected in other places. ## Load the ridigbio library (install first if needed) library("ridigbio") ## PAUSE TO CHECK THAT EVERYONE HAS THIS LOADED ## Remember the JSON out, JSON in slide from the API introduction? We're going ## to have to build a data structure that R can transform into the JSON the API ## expects to see. This is something ridigbio has chosen to use, not all ## packages work this way. query <- list("scientificname"=paste(gen, sp)) ## ( ## Let's look at how R encodes objects as JSON. This block is a side-trip and ## not something you need to remember. The ridigbio package (and others) take ## care of this for you. library("jsonlite") toJSON(query) ## Now compare that to the documentation for what the iDigBio API expects for a ## record query: https://github.com/iDigBio/idigbio-search-api/wiki/Query-Format ## Skip down to "Searching for multiple values within a field" ## ) ## Back to searching for mice in iDigBio: cha_pen_idig <- idig_search_records(rq=query) ## Unlike the surveys data set, iDigBio provides a dwc:DateCollected field as a ## string and not a month number. We need to add a column to the cha_pen_idig ## data frame with month number. We'll do that in 3 steps: ## ## 1) Change the datecollected vector to date objects ## 2) Use R's date functions to get just the month from the date object ## 3) Bind the resulting vector back to the data frame cha_pen_idig_months <- format( as.Date(cha_pen_idig$datecollected), "%m") ## CHALLENGE ## ## Do step 3 and add the cha_pen_idig_months column to the cha_pen_idig data ## frame as "monthcollected". The next line won't work until you do. #cha_pen_idig <- cbind( ## Now let's make a histogram on month collected. hist(as.numeric(cha_pen_idig$monthcollected), breaks=1:12) ## Not as clear is it? We don't have a convenient measure of sampling effort ## here to help us with why. ## Remember this histogram? The nice one? hist(survey_cha_pen$month, breaks=1:12) ## Wouldn't you love to see an average monthly temperature graph below it? ## Folks from Norway probably don't have a good idea of what "winter" is ## is like in Arizona. Let's get some data from NOAA. ## BACK TO POWER POINT! ## Start by loading the library and filling in the web token you got from ## NOAA via email. Also, the station name is long to type so let's save it as a ## variable too. (tlnrvjzLToJojhXEsUYzeGNmUEUHurYk) library("rnoaa") tok <- "" stn <- "GHCND:USC00026716" ## The rnoaa package requires us to know the name of the data set that we are ## going to download. Because the datasets vary by station, let's get a list ## of all the datasets for our station. ## CHALLENGE ## ## Use "?" to look at the help for the ncdc_datasets function. Call the ## function to get a list of the datasets the station has availible. #ncdc_datasets( ## After you've met the challenge above, just type in the name of the dataset ## that has the monthly summaries into the assignment below. ds_name <- "" ## The NOAA API is not the fastest way to get lots of data. We're just going to ## get one year of monthly summaries. The limit parameter will make sense when ## we look at what is returned by ncdc. weather <- ncdc(datasetid=ds_name, stationid=stn, startdate='2014-01-01', enddate='2014-12-01', limit=300, token=tok) ## Take a look at weather with head(). Can you describe what you see? ## BACK TO POWER POINT! # Now that we know the units and the column we want, let's pull that out to # to it's own vector to be easier to work with. mean_temp <- subset(weather$data, weather$data$datatype=="MNTM") mean_temp$degf <- mean_temp$value/10*9/5 + 32 ## ( ## Another aside: You can write your own functions in R. That C to F conversion ## is just asking to be made into a function: c_to_f <- function (c){ c*9/5 + 32 } ## Now try: mean_temp$degf_fun <- c_to_f(mean_temp$value/10) ## Simpler than having to rewrite a lot of formulas. Functions can contain any ## R code that you want to reuse. ## ) ## Remember our goal here is to make a graph of average monthly temperature to ## compare to the collection frequency of our mouse. Lets make it look like a ## bar chart to match the histogram we drew before. plot(mean_temp$degf, type="n") lines(mean_temp$degf, type="s") lines(mean_temp$degf, type="h") ## CHALLENGE ## ## Use "?" to look at the help for the par() function. Scroll down to the mfcol, ## mfrow parameters section and read it. Call par() with the mfrow parameter to ## get ready to make a layout with the collection histogram above the ## temperature plot. #par( ## Now let's draw both our graphs again on the same layout: hist(survey_cha_pen$month, breaks=1:12) plot(mean_temp$degf, type="n") lines(mean_temp$degf, type="s") lines(mean_temp$degf, type="h") ## BACK TO POWERPOINT! ## The plot.ly URL: https://plot.ly/ggplot2/getting-started/ ## ## Paste in the rest of your credentials here: library("plotly") #set_credentials_file( ## We're going to go really fast here. I had hoped to use rfigshare but is has ## a bug in it that causes crashes at the moment so we'll have to use plotly. ## They use ggplot2 for plotting so we'll have to re-do our plots using some ## slightly more complicated commands. library(ggplot2) library(grid) library(gridExtra) mice <- ggplot(survey_cha_pen, aes(survey_cha_pen$month)) + geom_histogram(fill=NA, color="black", binwidth=1) + theme_bw() ## Notice that we assigned the plot to a variable. We're going to pass that ## plot to a function in the plot.ly package that will upload it to your ## account. Probably it will open a browser window too but in case it does not ## the last line will print out the URL to your plot. py <- plotly() resp <- py$ggplotly(mice) resp$response$url