Getting data from the web: API access

MACS 30500 University of Chicago

November 13, 2017

Methods for obtaining data online

  • Click and download
  • Install and play
  • API query
  • Scraping

Click and download

  • read.csv or readr::read_csv
  • downloader package or curl

Data supplied on the web

  • Application programming interface (API)
  • Client
  • Server

Install and play packages

  • Packages with R functions written for existing APIs
  • Useful because
    • provenance
    • reproducible
    • updating
    • ease
    • scaling

The Manifesto Project: manifestoR

  • Collects and organizes political party manifestos from around the world
  • Over 1000 parties from 1945 until today in over 50 countries on five continents
  • manifestoR

Load library and set API key

library(manifestoR)

# retrieve API key stored in .Rprofile
mp_setapikey(key = getOption("manifesto_key"))

Retrieve the database

(mpds <- mp_maindataset())
## Connecting to Manifesto Project DB API... 
## Connecting to Manifesto Project DB API... corpus version: 2017-1
## # A tibble: 4,214 x 173
##    country countryname oecdmember eumember      edate   date party
##      <dbl>       <chr>      <dbl>    <dbl>     <date>  <dbl> <dbl>
##  1      11      Sweden          0        0 1944-09-17 194409 11220
##  2      11      Sweden          0        0 1944-09-17 194409 11320
##  3      11      Sweden          0        0 1944-09-17 194409 11420
##  4      11      Sweden          0        0 1944-09-17 194409 11620
##  5      11      Sweden          0        0 1944-09-17 194409 11810
##  6      11      Sweden          0        0 1948-09-19 194809 11220
##  7      11      Sweden          0        0 1948-09-19 194809 11320
##  8      11      Sweden          0        0 1948-09-19 194809 11420
##  9      11      Sweden          0        0 1948-09-19 194809 11620
## 10      11      Sweden          0        0 1948-09-19 194809 11810
## # ... with 4,204 more rows, and 166 more variables: partyname <chr>,
## #   partyabbrev <chr>, parfam <dbl>, coderid <dbl>, manual <dbl>,
## #   coderyear <dbl>, testresult <dbl>, testeditsim <dbl>, pervote <dbl>,
## #   voteest <dbl>, presvote <dbl>, absseat <dbl>, totseats <dbl>,
## #   progtype <dbl>, datasetorigin <dbl>, corpusversion <chr>, total <dbl>,
## #   peruncod <dbl>, per101 <dbl>, per102 <dbl>, per103 <dbl>,
## #   per104 <dbl>, per105 <dbl>, per106 <dbl>, per107 <dbl>, per108 <dbl>,
## #   per109 <dbl>, per110 <dbl>, per201 <dbl>, per202 <dbl>, per203 <dbl>,
## #   per204 <dbl>, per301 <dbl>, per302 <dbl>, per303 <dbl>, per304 <dbl>,
## #   per305 <dbl>, per401 <dbl>, per402 <dbl>, per403 <dbl>, per404 <dbl>,
## #   per405 <dbl>, per406 <dbl>, per407 <dbl>, per408 <dbl>, per409 <dbl>,
## #   per410 <dbl>, per411 <dbl>, per412 <dbl>, per413 <dbl>, per414 <dbl>,
## #   per415 <dbl>, per416 <dbl>, per501 <dbl>, per502 <dbl>, per503 <dbl>,
## #   per504 <dbl>, per505 <dbl>, per506 <dbl>, per507 <dbl>, per601 <dbl>,
## #   per602 <dbl>, per603 <dbl>, per604 <dbl>, per605 <dbl>, per606 <dbl>,
## #   per607 <dbl>, per608 <dbl>, per701 <dbl>, per702 <dbl>, per703 <dbl>,
## #   per704 <dbl>, per705 <dbl>, per706 <dbl>, per1011 <dbl>,
## #   per1012 <dbl>, per1013 <dbl>, per1014 <dbl>, per1015 <dbl>,
## #   per1016 <dbl>, per1021 <dbl>, per1022 <dbl>, per1023 <dbl>,
## #   per1024 <dbl>, per1025 <dbl>, per1026 <dbl>, per1031 <dbl>,
## #   per1032 <dbl>, per1033 <dbl>, per2021 <dbl>, per2022 <dbl>,
## #   per2023 <dbl>, per2031 <dbl>, per2032 <dbl>, per2033 <dbl>,
## #   per2041 <dbl>, per3011 <dbl>, per3051 <dbl>, per3052 <dbl>,
## #   per3053 <dbl>, ...

How many manifestos have been published by each political party in Sweden?

# count manifestos per Swedish party and draw a horizontal bar chart,
# with parties ordered by their number of manifestos
mpds %>%
  filter(countryname == "Sweden") %>%
  count(partyname) %>%
  ggplot(aes(fct_reorder(partyname, n), n)) +
  geom_col() +
  labs(title = "Political manifestos published in Sweden",
       x = NULL,
       # mpds includes Swedish manifestos from the 1944 election
       y = "Total (1944-present)") +
  coord_flip()

How have the Democratic and Republican Party manifestos in the United States changed over time?

# ideological position over time for the two major US parties
# (party codes 61320 = Democratic Party, 61620 = Republican Party)
mpds %>%
  filter(party %in% c(61320, 61620)) %>%
  mutate(ideo = mp_scale(.)) %>%
  select(partyname, edate, ideo) %>%
  ggplot(mapping = aes(x = edate, y = ideo, color = partyname)) +
  geom_line() +
  scale_color_manual(values = c("blue", "red")) +
  labs(
    title = "Ideological scaling of major US political parties",
    x = "Year",
    y = "Ideological position",
    color = NULL
  ) +
  theme(legend.position = "bottom")

Analyze text of manifestos

# download documents
(docs <- mp_corpus(countryname == "United States" & edate > as.Date("2012-01-01")))
## Connecting to Manifesto Project DB API... 
## Connecting to Manifesto Project DB API... corpus version: 2017-1 
## Connecting to Manifesto Project DB API... corpus version: 2017-1 
## Connecting to Manifesto Project DB API... corpus version: 2017-1
## <<ManifestoCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 2
# generate wordcloud of most common terms
docs %>%
  tidy() %>%
  mutate(party = factor(party, levels = c(61320, 61620),
                        labels = c("Democratic Party", "Republican Party"))) %>%
  # split each document's text into one word per row
  unnest_tokens(word, text) %>%
  # name the join key explicitly rather than relying on shared column names
  anti_join(stop_words, by = "word") %>%
  count(party, word, sort = TRUE) %>%
  na.omit() %>%
  # reshape to a word-by-party matrix of counts for comparison.cloud()
  reshape2::acast(word ~ party, value.var = "n", fill = 0) %>%
  comparison.cloud(max.words = 200)

Twitter API

Using rtweet

library(rtweet)

OAuth authentication

  1. Create a Twitter account
  2. Store your API key and token using the .Rprofile method
  3. create_token() from the console
  4. Get back into RStudio

Searching tweets

rt <- search_tweets(
  q = "#rstats",
  n = 3000,
  include_rts = FALSE
)
rt
## # A tibble: 2,655 x 42
##    status_id  created_at          user_id  screen_name text         source
##    <chr>      <dttm>              <chr>    <chr>       <chr>        <chr> 
##  1 973237480… 2018-03-12 16:41:11 1531004… msubbaiah1  "Here's my … Twitt…
##  2 973234778… 2018-03-12 16:30:27 2957630… calves06    Today’s the… Twitt…
##  3 973234218… 2018-03-12 16:28:14 9354611… RLadiesLau… We are cont… Twitt…
##  4 973233874… 2018-03-12 16:26:52 8441528… rweekly_li… Steel-ing a… R Wee…
##  5 973233697… 2018-03-12 16:26:10 4686586… bass_analy… "A tidy mod… Twitt…
##  6 973232678… 2018-03-12 16:22:07 1989657… monkmanmh   "TIL that I… Twitt…
##  7 973231360… 2018-03-12 16:16:52 16736320 kai_arzhei… @kuziemsky … Twitt…
##  8 973230437… 2018-03-12 16:13:12 89191817 BigDataIns… "Big | Data… Paper…
##  9 973229907… 2018-03-12 16:11:06 5232165… martinjhnh… "I've a #GI… Twitt…
## 10 973228130… 2018-03-12 16:04:02 3580439… maximaform… #Statistics… Buffer
## # ... with 2,645 more rows, and 36 more variables:
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, hashtags <list>,
## #   symbols <list>, urls_url <list>, urls_t.co <list>,
## #   urls_expanded_url <list>, media_url <list>, media_t.co <list>,
## #   media_expanded_url <list>, media_type <list>, ext_media_url <list>,
## #   ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <lgl>, mentions_user_id <list>,
## #   mentions_screen_name <list>, lang <chr>, quoted_status_id <chr>,
## #   quoted_text <chr>, retweet_status_id <chr>, retweet_text <chr>,
## #   place_url <chr>, place_name <chr>, place_full_name <chr>,
## #   place_type <chr>, country <chr>, country_code <chr>,
## #   geo_coords <list>, coords_coords <list>, bbox_coords <list>

Searching users

countvoncount <- get_timeline(user = "countvoncount", n = 1000)
countvoncount
## # A tibble: 1,200 x 42
##    status_id   created_at          user_id  screen_name text      source  
##  * <chr>       <dttm>              <chr>    <chr>       <chr>     <chr>   
##  1 9731839178… 2018-03-12 13:08:21 5551297… CountVonCo… One thou… CountVo…
##  2 9728970194… 2018-03-11 18:08:19 5551297… CountVonCo… One thou… CountVo…
##  3 9725799215… 2018-03-10 21:08:17 5551297… CountVonCo… One thou… CountVo…
##  4 9723081230… 2018-03-10 03:08:15 5551297… CountVonCo… One thou… CountVo…
##  5 9721420240… 2018-03-09 16:08:14 5551297… CountVonCo… One thou… CountVo…
##  6 9718551253… 2018-03-08 21:08:12 5551297… CountVonCo… One thou… CountVo…
##  7 9717343260… 2018-03-08 13:08:11 5551297… CountVonCo… One thou… CountVo…
##  8 9714776275… 2018-03-07 20:08:10 5551297… CountVonCo… One thou… CountVo…
##  9 9713870278… 2018-03-07 14:08:09 5551297… CountVonCo… One thou… CountVo…
## 10 9711303288… 2018-03-06 21:08:07 5551297… CountVonCo… One thou… CountVo…
## # ... with 1,190 more rows, and 36 more variables:
## #   reply_to_status_id <lgl>, reply_to_user_id <lgl>,
## #   reply_to_screen_name <lgl>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, hashtags <list>,
## #   symbols <list>, urls_url <list>, urls_t.co <list>,
## #   urls_expanded_url <list>, media_url <list>, media_t.co <list>,
## #   media_expanded_url <list>, media_type <list>, ext_media_url <list>,
## #   ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <lgl>, mentions_user_id <list>,
## #   mentions_screen_name <list>, lang <chr>, quoted_status_id <chr>,
## #   quoted_text <chr>, retweet_status_id <chr>, retweet_text <chr>,
## #   place_url <chr>, place_name <chr>, place_full_name <chr>,
## #   place_type <chr>, country <chr>, country_code <chr>,
## #   geo_coords <list>, coords_coords <list>, bbox_coords <list>

Visualizing tweets

ts_plot(rt, by = "3 hours")

Visualizing tweets

ts_plot(rt, by = "1 hours")

Visualizing tweets

ts_plot(rt, by = "3 hours") +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of #rstats Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )

Exercise: Practice using rtweet

Writing an API function

Determine the shape of an API request

Determine the shape of an API request

http://www.omdbapi.com/?apikey=[apikey]&t=Sharknado&y=2013&plot=short&r=xml
<?xml version="1.0" encoding="UTF-8"?><root response="True"><movie title="Sharknado" year="2013" rated="TV-14" released="11 Jul 2013" runtime="86 min" genre="Comedy, Horror, Sci-Fi" director="Anthony C. Ferrante" writer="Thunder Levin" actors="Ian Ziering, Tara Reid, John Heard, Cassandra Scerbo" plot="When a freak hurricane swamps Los Angeles, nature's deadliest killer rules sea, land, and air as thousands of sharks terrorize the waterlogged populace." language="English" country="USA" awards="1 win &amp; 2 nominations." poster="https://images-na.ssl-images-amazon.com/images/M/MV5BOTE2OTk4MTQzNV5BMl5BanBnXkFtZTcwODUxOTM3OQ@@._V1_SX300.jpg" metascore="N/A" imdbRating="3.3" imdbVotes="38,948" imdbID="tt2724064" type="movie"/></root>

Create a request in R

# retrieve API key from .Rprofile
omdb_key <- getOption("omdb_key")

# create url: base address plus one "name=value" pair per query parameter
request <- str_c(
  "http://www.omdbapi.com/?apikey=", omdb_key,
  "&", "t=", "Sharknado",
  "&", "y=", "2013",
  "&", "plot=", "short",
  "&", "r=", "xml"
)
request
## [1] "http://www.omdbapi.com/?apikey=[apikey]&t=Sharknado&y=2013&plot=short&r=xml"

Abstracting to a function

#' Build an OMDb API request URL
#'
#' Pairs each query-string parameter name with its value, percent-encodes the
#' values, and appends the resulting query string to the OMDb base URL.
#'
#' @param Key OMDb API key (sent as `apikey`).
#' @param Title Movie title to look up (sent as `t`).
#' @param Year Release year (sent as `y`).
#' @param Plot Plot length requested (sent as `plot`), e.g. "short".
#' @param Format Response format (sent as `r`), e.g. "xml" or "json".
#' @return A length-one character vector containing the request URL.
omdb <- function(Key, Title, Year, Plot, Format) {
  baseurl <- "http://www.omdbapi.com/?"
  params <- c("apikey=", "t=", "y=", "plot=", "r=")
  values <- list(Key, Title, Year, Plot, Format)

  # a NULL (e.g. an unset getOption()) or a longer vector would otherwise
  # shift values onto the wrong parameter names
  if (!all(lengths(values) == 1L) || anyNA(values, recursive = TRUE)) {
    stop("Key, Title, Year, Plot and Format must each be a single, ",
         "non-missing value", call. = FALSE)
  }
  values <- vapply(values, as.character, character(1))

  # percent-encode values so spaces, "&", ":" etc. cannot corrupt the query
  values <- vapply(values, utils::URLencode, character(1),
                   reserved = TRUE, USE.NAMES = FALSE)

  args <- paste0(params, values, collapse = "&")
  paste0(baseurl, args)
}

omdb("omdb_key", "Sharknado", "2013", "short", "xml")
## [1] "http://www.omdbapi.com/?apikey=omdb_key&t=Sharknado&y=2013&plot=short&r=xml"

Obtain data with curl

# build the request URL asking for an XML response
request_sharknado <- omdb(omdb_key, "Sharknado", "2013", "short", "xml")
# open a connection to the URL, read the response body line by line,
# then close the connection
con <- curl(request_sharknado)
answer_xml <- readLines(con)
close(con)
# print the raw XML text that was returned
answer_xml
## [1] "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root response=\"True\"><movie title=\"Sharknado\" year=\"2013\" rated=\"TV-14\" released=\"11 Jul 2013\" runtime=\"86 min\" genre=\"Comedy, Horror, Sci-Fi\" director=\"Anthony C. Ferrante\" writer=\"Thunder Levin\" actors=\"Ian Ziering, Tara Reid, John Heard, Cassandra Scerbo\" plot=\"When a freak hurricane swamps Los Angeles, nature's deadliest killer rules sea, land, and air as thousands of sharks terrorize the waterlogged populace.\" language=\"English\" country=\"USA\" awards=\"1 win &amp; 2 nominations.\" poster=\"https://images-na.ssl-images-amazon.com/images/M/MV5BOTE2OTk4MTQzNV5BMl5BanBnXkFtZTcwODUxOTM3OQ@@._V1_SX300.jpg\" metascore=\"N/A\" imdbRating=\"3.3\" imdbVotes=\"38,948\" imdbID=\"tt2724064\" type=\"movie\"/></root>"

Obtain data with curl

request_sharknado <- omdb(omdb_key, "Sharknado", "2013", "short", "json")
con <- curl(request_sharknado)
answer_json <- readLines(con)
close(con)
answer_json %>% 
  prettify()
## {
##     "Title": "Sharknado",
##     "Year": "2013",
##     "Rated": "TV-14",
##     "Released": "11 Jul 2013",
##     "Runtime": "86 min",
##     "Genre": "Comedy, Horror, Sci-Fi",
##     "Director": "Anthony C. Ferrante",
##     "Writer": "Thunder Levin",
##     "Actors": "Ian Ziering, Tara Reid, John Heard, Cassandra Scerbo",
##     "Plot": "When a freak hurricane swamps Los Angeles, nature's deadliest killer rules sea, land, and air as thousands of sharks terrorize the waterlogged populace.",
##     "Language": "English",
##     "Country": "USA",
##     "Awards": "1 win & 2 nominations.",
##     "Poster": "https://images-na.ssl-images-amazon.com/images/M/MV5BOTE2OTk4MTQzNV5BMl5BanBnXkFtZTcwODUxOTM3OQ@@._V1_SX300.jpg",
##     "Ratings": [
##         {
##             "Source": "Internet Movie Database",
##             "Value": "3.3/10"
##         },
##         {
##             "Source": "Rotten Tomatoes",
##             "Value": "82%"
##         }
##     ],
##     "Metascore": "N/A",
##     "imdbRating": "3.3",
##     "imdbVotes": "38,948",
##     "imdbID": "tt2724064",
##     "Type": "movie",
##     "DVD": "03 Sep 2013",
##     "BoxOffice": "N/A",
##     "Production": "NCM Fathom",
##     "Website": "http://www.mtivideo.com/TitleView.aspx?TITLE_ID=728",
##     "Response": "True"
## }
## 

JavaScript Object Notation (JSON)

{
  "crust": "original",
  "toppings": ["cheese", "pepperoni", "garlic"],
  "status": "cooking",
  "customer": {
    "name": "Brian",
    "phone": "573-111-1111"
  }
}

Parsing JSON

answer_json %>% 
  fromJSON()
## $Title
## [1] "Sharknado"
## 
## $Year
## [1] "2013"
## 
## $Rated
## [1] "TV-14"
## 
## $Released
## [1] "11 Jul 2013"
## 
## $Runtime
## [1] "86 min"
## 
## $Genre
## [1] "Comedy, Horror, Sci-Fi"
## 
## $Director
## [1] "Anthony C. Ferrante"
## 
## $Writer
## [1] "Thunder Levin"
## 
## $Actors
## [1] "Ian Ziering, Tara Reid, John Heard, Cassandra Scerbo"
## 
## $Plot
## [1] "When a freak hurricane swamps Los Angeles, nature's deadliest killer rules sea, land, and air as thousands of sharks terrorize the waterlogged populace."
## 
## $Language
## [1] "English"
## 
## $Country
## [1] "USA"
## 
## $Awards
## [1] "1 win & 2 nominations."
## 
## $Poster
## [1] "https://images-na.ssl-images-amazon.com/images/M/MV5BOTE2OTk4MTQzNV5BMl5BanBnXkFtZTcwODUxOTM3OQ@@._V1_SX300.jpg"
## 
## $Ratings
##                    Source  Value
## 1 Internet Movie Database 3.3/10
## 2         Rotten Tomatoes    82%
## 
## $Metascore
## [1] "N/A"
## 
## $imdbRating
## [1] "3.3"
## 
## $imdbVotes
## [1] "38,948"
## 
## $imdbID
## [1] "tt2724064"
## 
## $Type
## [1] "movie"
## 
## $DVD
## [1] "03 Sep 2013"
## 
## $BoxOffice
## [1] "N/A"
## 
## $Production
## [1] "NCM Fathom"
## 
## $Website
## [1] "http://www.mtivideo.com/TitleView.aspx?TITLE_ID=728"
## 
## $Response
## [1] "True"

Parsing JSON into a data frame

answer_json %>% 
  fromJSON() %>% 
  # remove ratings element for now
  list_modify(Ratings = NULL) %>%
  as_tibble()
## # A tibble: 1 x 24
##       Title  Year Rated    Released Runtime                  Genre
##       <chr> <chr> <chr>       <chr>   <chr>                  <chr>
## 1 Sharknado  2013 TV-14 11 Jul 2013  86 min Comedy, Horror, Sci-Fi
## # ... with 18 more variables: Director <chr>, Writer <chr>, Actors <chr>,
## #   Plot <chr>, Language <chr>, Country <chr>, Awards <chr>, Poster <chr>,
## #   Metascore <chr>, imdbRating <chr>, imdbVotes <chr>, imdbID <chr>,
## #   Type <chr>, DVD <chr>, BoxOffice <chr>, Production <chr>,
## #   Website <chr>, Response <chr>

Introducing the easy way: httr

  • GET
  • POST
  • PUT
  • DELETE

GET Sharknado

sharknado_json <- omdb(omdb_key, "Sharknado", "2013", "short", "json")
response_json <- GET(sharknado_json)
content(response_json, as = "parsed", type = "application/json")
## $Title
## [1] "Sharknado"
## 
## $Year
## [1] "2013"
## 
## $Rated
## [1] "TV-14"
## 
## $Released
## [1] "11 Jul 2013"
## 
## $Runtime
## [1] "86 min"
## 
## $Genre
## [1] "Comedy, Horror, Sci-Fi"
## 
## $Director
## [1] "Anthony C. Ferrante"
## 
## $Writer
## [1] "Thunder Levin"
## 
## $Actors
## [1] "Ian Ziering, Tara Reid, John Heard, Cassandra Scerbo"
## 
## $Plot
## [1] "When a freak hurricane swamps Los Angeles, nature's deadliest killer rules sea, land, and air as thousands of sharks terrorize the waterlogged populace."
## 
## $Language
## [1] "English"
## 
## $Country
## [1] "USA"
## 
## $Awards
## [1] "1 win & 2 nominations."
## 
## $Poster
## [1] "https://images-na.ssl-images-amazon.com/images/M/MV5BOTE2OTk4MTQzNV5BMl5BanBnXkFtZTcwODUxOTM3OQ@@._V1_SX300.jpg"
## 
## $Ratings
## $Ratings[[1]]
## $Ratings[[1]]$Source
## [1] "Internet Movie Database"
## 
## $Ratings[[1]]$Value
## [1] "3.3/10"
## 
## 
## $Ratings[[2]]
## $Ratings[[2]]$Source
## [1] "Rotten Tomatoes"
## 
## $Ratings[[2]]$Value
## [1] "82%"
## 
## 
## 
## $Metascore
## [1] "N/A"
## 
## $imdbRating
## [1] "3.3"
## 
## $imdbVotes
## [1] "38,948"
## 
## $imdbID
## [1] "tt2724064"
## 
## $Type
## [1] "movie"
## 
## $DVD
## [1] "03 Sep 2013"
## 
## $BoxOffice
## [1] "N/A"
## 
## $Production
## [1] "NCM Fathom"
## 
## $Website
## [1] "http://www.mtivideo.com/TitleView.aspx?TITLE_ID=728"
## 
## $Response
## [1] "True"

Headers

headers(response_json)
## $date
## [1] "Mon, 13 Nov 2017 16:05:52 GMT"
## 
## $`content-type`
## [1] "application/json; charset=utf-8"
## 
## $`transfer-encoding`
## [1] "chunked"
## 
## $connection
## [1] "keep-alive"
## 
## $`set-cookie`
## [1] "__cfduid=d5e5ca2ea7c25bb15b9a368ce97f93aa81510589152; expires=Tue, 13-Nov-18 16:05:52 GMT; path=/; domain=.omdbapi.com; HttpOnly"
## 
## $`cache-control`
## [1] "public, max-age=86400"
## 
## $expires
## [1] "Tue, 14 Nov 2017 16:05:52 GMT"
## 
## $`last-modified`
## [1] "Mon, 13 Nov 2017 15:49:48 GMT"
## 
## $vary
## [1] "Accept-Encoding"
## 
## $`x-aspnet-version`
## [1] "4.0.30319"
## 
## $`x-powered-by`
## [1] "ASP.NET"
## 
## $`access-control-allow-origin`
## [1] "*"
## 
## $`cf-cache-status`
## [1] "HIT"
## 
## $server
## [1] "cloudflare-nginx"
## 
## $`cf-ray`
## [1] "3bd2f99aa22954f8-ORD"
## 
## $`content-encoding`
## [1] "gzip"
## 
## attr(,"class")
## [1] "insensitive" "list"

HTTP status code

status_code(response_json)
## [1] 200

HTTP status code

Code Status
1xx Informational
2xx Success
3xx Redirection
4xx Client error (you did something wrong)
5xx Server error (server did something wrong)

A more intuitive guide

Skip omdb()

sharknado_2 <- GET("http://www.omdbapi.com/?",
                   query = list(t = "Sharknado 2: The Second One",
                                y = 2014,
                                plot = "short",
                                r = "json",
                                apikey = omdb_key))

content(sharknado_2)
## $Title
## [1] "Sharknado 2: The Second One"
## 
## $Year
## [1] "2014"
## 
## $Rated
## [1] "TV-14"
## 
## $Released
## [1] "30 Jul 2014"
## 
## $Runtime
## [1] "95 min"
## 
## $Genre
## [1] "Comedy, Horror, Sci-Fi"
## 
## $Director
## [1] "Anthony C. Ferrante"
## 
## $Writer
## [1] "Thunder Levin"
## 
## $Actors
## [1] "Ian Ziering, Tara Reid, Vivica A. Fox, Mark McGrath"
## 
## $Plot
## [1] "Fin and April are on their way to New York City, until a category seven hurricane spawns heavy rain, storm surges, and deadly Sharknadoes."
## 
## $Language
## [1] "English"
## 
## $Country
## [1] "USA"
## 
## $Awards
## [1] "N/A"
## 
## $Poster
## [1] "https://images-na.ssl-images-amazon.com/images/M/MV5BMjA0MTIxMDEwNF5BMl5BanBnXkFtZTgwMDk3ODIxMjE@._V1_SX300.jpg"
## 
## $Ratings
## $Ratings[[1]]
## $Ratings[[1]]$Source
## [1] "Internet Movie Database"
## 
## $Ratings[[1]]$Value
## [1] "4.1/10"
## 
## 
## $Ratings[[2]]
## $Ratings[[2]]$Source
## [1] "Rotten Tomatoes"
## 
## $Ratings[[2]]$Value
## [1] "61%"
## 
## 
## 
## $Metascore
## [1] "N/A"
## 
## $imdbRating
## [1] "4.1"
## 
## $imdbVotes
## [1] "14,537"
## 
## $imdbID
## [1] "tt3062074"
## 
## $Type
## [1] "movie"
## 
## $DVD
## [1] "07 Oct 2014"
## 
## $BoxOffice
## [1] "N/A"
## 
## $Production
## [1] "NCM Fathom"
## 
## $Website
## [1] "N/A"
## 
## $Response
## [1] "True"

Messy API responses

#' Build an OMDb API request URL
#'
#' Pairs each query-string parameter name with its value, percent-encodes the
#' values, and appends the resulting query string to the OMDb base URL.
#'
#' @param Key OMDb API key (sent as `apikey`).
#' @param Title Movie title to look up (sent as `t`).
#' @param Year Release year (sent as `y`).
#' @param Plot Plot length requested (sent as `plot`), e.g. "short".
#' @param Format Response format (sent as `r`), e.g. "xml" or "json".
#' @return A length-one character vector containing the request URL.
omdb <- function(Key, Title, Year, Plot, Format) {
  baseurl <- "http://www.omdbapi.com/?"
  params <- c("apikey=", "t=", "y=", "plot=", "r=")
  values <- list(Key, Title, Year, Plot, Format)

  # a NULL (e.g. an unset getOption()) or a longer vector would otherwise
  # shift values onto the wrong parameter names
  if (!all(lengths(values) == 1L) || anyNA(values, recursive = TRUE)) {
    stop("Key, Title, Year, Plot and Format must each be a single, ",
         "non-missing value", call. = FALSE)
  }
  values <- vapply(values, as.character, character(1))

  # percent-encode values so spaces, "&", ":" etc. cannot corrupt the query
  values <- vapply(values, utils::URLencode, character(1),
                   reserved = TRUE, USE.NAMES = FALSE)

  args <- paste0(params, values, collapse = "&")
  paste0(baseurl, args)
}

# use curl to execute the query
request_sharknado <- omdb(getOption("omdb_key"), "Sharknado",
                          "2013", "short", "json")
con <- curl(request_sharknado)
answer_json <- readLines(con)
close(con)

# convert to data frame
answer_json %>% 
  fromJSON() %>% 
  as_tibble()
## Error: Column `Ratings` must be a 1d atomic vector or a list

Whoops

sharknado <- answer_json %>% 
  fromJSON()

str(sharknado)
## List of 25
##  $ Title     : chr "Sharknado"
##  $ Year      : chr "2013"
##  $ Rated     : chr "TV-14"
##  $ Released  : chr "11 Jul 2013"
##  $ Runtime   : chr "86 min"
##  $ Genre     : chr "Comedy, Horror, Sci-Fi"
##  $ Director  : chr "Anthony C. Ferrante"
##  $ Writer    : chr "Thunder Levin"
##  $ Actors    : chr "Ian Ziering, Tara Reid, John Heard, Cassandra Scerbo"
##  $ Plot      : chr "When a freak hurricane swamps Los Angeles, nature's deadliest killer rules sea, land, and air as thousands of s"| __truncated__
##  $ Language  : chr "English"
##  $ Country   : chr "USA"
##  $ Awards    : chr "1 win & 2 nominations."
##  $ Poster    : chr "https://images-na.ssl-images-amazon.com/images/M/MV5BOTE2OTk4MTQzNV5BMl5BanBnXkFtZTcwODUxOTM3OQ@@._V1_SX300.jpg"
##  $ Ratings   :'data.frame':  2 obs. of  2 variables:
##   ..$ Source: chr [1:2] "Internet Movie Database" "Rotten Tomatoes"
##   ..$ Value : chr [1:2] "3.3/10" "82%"
##  $ Metascore : chr "N/A"
##  $ imdbRating: chr "3.3"
##  $ imdbVotes : chr "38,948"
##  $ imdbID    : chr "tt2724064"
##  $ Type      : chr "movie"
##  $ DVD       : chr "03 Sep 2013"
##  $ BoxOffice : chr "N/A"
##  $ Production: chr "NCM Fathom"
##  $ Website   : chr "http://www.mtivideo.com/TitleView.aspx?TITLE_ID=728"
##  $ Response  : chr "True"
jsonedit(sharknado, mode = "view", elementId = "sharknado")

Inspecting and exploring lists

library(purrr)
library(repurrrsive)
str(got_chars, list.len = 3)
## List of 29
##  $ :List of 18
##   ..$ url        : chr "http://www.anapioficeandfire.com/api/characters/1022"
##   ..$ id         : int 1022
##   ..$ name       : chr "Theon Greyjoy"
##   .. [list output truncated]
##  $ :List of 18
##   ..$ url        : chr "http://www.anapioficeandfire.com/api/characters/1052"
##   ..$ id         : int 1052
##   ..$ name       : chr "Tyrion Lannister"
##   .. [list output truncated]
##  $ :List of 18
##   ..$ url        : chr "http://www.anapioficeandfire.com/api/characters/1074"
##   ..$ id         : int 1074
##   ..$ name       : chr "Victarion Greyjoy"
##   .. [list output truncated]
##   [list output truncated]
jsonedit(got_chars, mode = "view", elementId = "got_chars")

Name and position shortcuts

map(got_chars[1:4], "name")
## [[1]]
## [1] "Theon Greyjoy"
## 
## [[2]]
## [1] "Tyrion Lannister"
## 
## [[3]]
## [1] "Victarion Greyjoy"
## 
## [[4]]
## [1] "Will"
  • Equivalent to function(x) x[["TEXT"]]

Name and position shortcuts

map(got_chars[5:8], 3)
## [[1]]
## [1] "Areo Hotah"
## 
## [[2]]
## [1] "Chett"
## 
## [[3]]
## [1] "Cressen"
## 
## [[4]]
## [1] "Arianne Martell"
  • Equivalent to function(x) x[[i]]

Name and position shortcuts with pipe

got_chars %>% 
  map("name")
got_chars %>% 
  map(3)

Type-specific map

map_chr(got_chars[9:12], "name")
## [1] "Daenerys Targaryen" "Davos Seaworth"     "Arya Stark"        
## [4] "Arys Oakheart"
map_chr(got_chars[13:16], 3)
## [1] "Asha Greyjoy"    "Barristan Selmy" "Varamyr"         "Brandon Stark"

Extract multiple values

# Victarion element
got_chars[[3]]
## $url
## [1] "http://www.anapioficeandfire.com/api/characters/1074"
## 
## $id
## [1] 1074
## 
## $name
## [1] "Victarion Greyjoy"
## 
## $gender
## [1] "Male"
## 
## $culture
## [1] "Ironborn"
## 
## $born
## [1] "In 268 AC or before, at Pyke"
## 
## $died
## [1] ""
## 
## $alive
## [1] TRUE
## 
## $titles
## [1] "Lord Captain of the Iron Fleet" "Master of the Iron Victory"    
## 
## $aliases
## [1] "The Iron Captain"
## 
## $father
## [1] ""
## 
## $mother
## [1] ""
## 
## $spouse
## [1] ""
## 
## $allegiances
## [1] "House Greyjoy of Pyke"
## 
## $books
## [1] "A Game of Thrones" "A Clash of Kings"  "A Storm of Swords"
## 
## $povBooks
## [1] "A Feast for Crows"    "A Dance with Dragons"
## 
## $tvSeries
## list()
## 
## $playedBy
## list()
# specific elements for Victarion
got_chars[[3]][c("name", "culture", "gender", "born")]
## $name
## [1] "Victarion Greyjoy"
## 
## $culture
## [1] "Ironborn"
## 
## $gender
## [1] "Male"
## 
## $born
## [1] "In 268 AC or before, at Pyke"

Adapt to map() framework

map(.x, .f, ...)
  • .f = [
  • ... = character vector identifying the names of the elements to extract

Adapt to map() framework

x <- map(got_chars, `[`, c("name", "culture", "gender", "born"))
str(x[16:17])
## List of 2
##  $ :List of 4
##   ..$ name   : chr "Brandon Stark"
##   ..$ culture: chr "Northmen"
##   ..$ gender : chr "Male"
##   ..$ born   : chr "In 290 AC, at Winterfell"
##  $ :List of 4
##   ..$ name   : chr "Brienne of Tarth"
##   ..$ culture: chr ""
##   ..$ gender : chr "Female"
##   ..$ born   : chr "In 280 AC"

magrittr::extract()

library(magrittr)

x <- map(got_chars, extract, c("name", "culture", "gender", "born"))
str(x[18:19])
## List of 2
##  $ :List of 4
##   ..$ name   : chr "Catelyn Stark"
##   ..$ culture: chr "Rivermen"
##   ..$ gender : chr "Female"
##   ..$ born   : chr "In 264 AC, at Riverrun"
##  $ :List of 4
##   ..$ name   : chr "Cersei Lannister"
##   ..$ culture: chr "Westerman"
##   ..$ gender : chr "Female"
##   ..$ born   : chr "In 266 AC, at Casterly Rock"

Data frame output

map_df(got_chars, extract, c("name", "culture", "gender", "id", "born", "alive"))
## # A tibble: 29 x 6
##                  name  culture gender    id
##                 <chr>    <chr>  <chr> <int>
##  1      Theon Greyjoy Ironborn   Male  1022
##  2   Tyrion Lannister            Male  1052
##  3  Victarion Greyjoy Ironborn   Male  1074
##  4               Will            Male  1109
##  5         Areo Hotah Norvoshi   Male  1166
##  6              Chett            Male  1267
##  7            Cressen            Male  1295
##  8    Arianne Martell  Dornish Female   130
##  9 Daenerys Targaryen Valyrian Female  1303
## 10     Davos Seaworth Westeros   Male  1319
## # ... with 19 more rows, and 2 more variables: born <chr>, alive <lgl>

More robust approach

got_chars %>% {
  tibble(
    name = map_chr(., "name"),
    culture = map_chr(., "culture"),
    gender = map_chr(., "gender"),       
    id = map_int(., "id"),
    born = map_chr(., "born"),
    alive = map_lgl(., "alive")
  )
}
## # A tibble: 29 x 6
##                  name  culture gender    id
##                 <chr>    <chr>  <chr> <int>
##  1      Theon Greyjoy Ironborn   Male  1022
##  2   Tyrion Lannister            Male  1052
##  3  Victarion Greyjoy Ironborn   Male  1074
##  4               Will            Male  1109
##  5         Areo Hotah Norvoshi   Male  1166
##  6              Chett            Male  1267
##  7            Cressen            Male  1295
##  8    Arianne Martell  Dornish Female   130
##  9 Daenerys Targaryen Valyrian Female  1303
## 10     Davos Seaworth Westeros   Male  1319
## # ... with 19 more rows, and 2 more variables: born <chr>, alive <lgl>

Exercise: simplify gh_users

List inside a data frame

str(gh_repos, list.len = 2)
## List of 6
##  $ :List of 30
##   ..$ :List of 68
##   .. ..$ id               : int 61160198
##   .. ..$ name             : chr "after"
##   .. .. [list output truncated]
##   ..$ :List of 68
##   .. ..$ id               : int 40500181
##   .. ..$ name             : chr "argufy"
##   .. .. [list output truncated]
##   .. [list output truncated]
##  $ :List of 30
##   ..$ :List of 68
##   .. ..$ id               : int 14756210
##   .. ..$ name             : chr "2013-11_sfu"
##   .. .. [list output truncated]
##   ..$ :List of 68
##   .. ..$ id               : int 14152301
##   .. ..$ name             : chr "2014-01-27-miami"
##   .. .. [list output truncated]
##   .. [list output truncated]
##   [list output truncated]
jsonedit(gh_repos, mode = "view", elementId = "gh_repos")

Vector input to extraction shortcuts

gh_repos %>%
  map_chr(c(1, 3))
## [1] "gaborcsardi/after"   "jennybc/2013-11_sfu" "jtleek/advdatasci"  
## [4] "juliasilge/2016-14"  "leeper/ampolcourse"  "masalmon/aqi_pdf"

Get it into a data frame

One row per repository, with variables identifying which GitHub user owns it, the repository name, etc.

Create a data frame with usernames and gh_repos

(unames <- map_chr(gh_repos, c(1, 4, 1)))
## [1] "gaborcsardi" "jennybc"     "jtleek"      "juliasilge"  "leeper"     
## [6] "masalmon"
(udf <- gh_repos %>%
    set_names(unames) %>% 
    enframe("username", "gh_repos"))
## # A tibble: 6 x 2
##      username    gh_repos
##         <chr>      <list>
## 1 gaborcsardi <list [30]>
## 2     jennybc <list [30]>
## 3      jtleek <list [30]>
## 4  juliasilge <list [26]>
## 5      leeper <list [30]>
## 6    masalmon <list [30]>

How many repos are associated with each user?

udf %>% 
  mutate(n_repos = map_int(gh_repos, length))
## # A tibble: 6 x 3
##      username    gh_repos n_repos
##         <chr>      <list>   <int>
## 1 gaborcsardi <list [30]>      30
## 2     jennybc <list [30]>      30
## 3      jtleek <list [30]>      30
## 4  juliasilge <list [26]>      26
## 5      leeper <list [30]>      30
## 6    masalmon <list [30]>      30

Practice on a single user

# one_user is a list of repos for one user
one_user <- udf$gh_repos[[1]]

# one_user[[1]] is a list of info for one repo
one_repo <- one_user[[1]]
str(one_repo, max.level = 1, list.len = 5)
## List of 68
##  $ id               : int 61160198
##  $ name             : chr "after"
##  $ full_name        : chr "gaborcsardi/after"
##  $ owner            :List of 17
##  $ private          : logi FALSE
##   [list output truncated]
# a highly selective list of tibble-worthy info for one repo
one_repo[c("name", "fork", "open_issues")]
## $name
## [1] "after"
## 
## $fork
## [1] FALSE
## 
## $open_issues
## [1] 0
# make a data frame of that info for all a user's repos
map_df(one_user, `[`, c("name", "fork", "open_issues"))
## # A tibble: 30 x 3
##           name  fork open_issues
##          <chr> <lgl>       <int>
##  1       after FALSE           0
##  2      argufy FALSE           6
##  3         ask FALSE           4
##  4 baseimports FALSE           0
##  5      citest  TRUE           0
##  6  clisymbols FALSE           0
##  7      cmaker  TRUE           0
##  8       cmark  TRUE           0
##  9  conditions  TRUE           0
## 10      crayon FALSE           7
## # ... with 20 more rows
map_df(one_user, extract, c("name", "fork", "open_issues"))
## # A tibble: 30 x 3
##           name  fork open_issues
##          <chr> <lgl>       <int>
##  1       after FALSE           0
##  2      argufy FALSE           6
##  3         ask FALSE           4
##  4 baseimports FALSE           0
##  5      citest  TRUE           0
##  6  clisymbols FALSE           0
##  7      cmaker  TRUE           0
##  8       cmark  TRUE           0
##  9  conditions  TRUE           0
## 10      crayon FALSE           7
## # ... with 20 more rows

Scale up to all users

Scale up to all users

udf %>% 
  mutate(repo_info = gh_repos %>%
           map(. %>%
                 map_df(extract, c("name", "fork", "open_issues"))))
## # A tibble: 6 x 3
##      username    gh_repos         repo_info
##         <chr>      <list>            <list>
## 1 gaborcsardi <list [30]> <tibble [30 x 3]>
## 2     jennybc <list [30]> <tibble [30 x 3]>
## 3      jtleek <list [30]> <tibble [30 x 3]>
## 4  juliasilge <list [26]> <tibble [26 x 3]>
## 5      leeper <list [30]> <tibble [30 x 3]>
## 6    masalmon <list [30]> <tibble [30 x 3]>

Tidy the data frame

# build one row per repository: extract selected fields for every repo of
# every user, drop the raw list-column, then unnest the per-user tibbles
(rdf <- udf %>% 
   mutate(
     repo_info = gh_repos %>%
       map(. %>%
             map_df(extract, c("name", "fork", "open_issues")))
   ) %>% 
   select(-gh_repos) %>% 
   # name the list-column explicitly; unnest() with no columns is deprecated
   tidyr::unnest(repo_info))
## # A tibble: 176 x 4
##       username        name  fork open_issues
##          <chr>       <chr> <lgl>       <int>
##  1 gaborcsardi       after FALSE           0
##  2 gaborcsardi      argufy FALSE           6
##  3 gaborcsardi         ask FALSE           4
##  4 gaborcsardi baseimports FALSE           0
##  5 gaborcsardi      citest  TRUE           0
##  6 gaborcsardi  clisymbols FALSE           0
##  7 gaborcsardi      cmaker  TRUE           0
##  8 gaborcsardi       cmark  TRUE           0
##  9 gaborcsardi  conditions  TRUE           0
## 10 gaborcsardi      crayon FALSE           7
## # ... with 166 more rows