Data wrangling: tidy data

MACS 30500 University of Chicago

readr vs. base R

readr vs. base R

Alternative file formats

  • CSV
  • RDS
  • Feather
  • Excel
  • SPSS/Stata
## # A tibble: 2,000 x 2
##        x      y
##    <dbl> <date>
##  1   404     NA
##  2  4172     NA
##  3  3004     NA
##  4   787     NA
##  5    37     NA
##  6  2332     NA
##  7  2489     NA
##  8  1449     NA
##  9  3665     NA
## 10  3863     NA
## # ... with 1,990 more rows

RDS

## # A tibble: 2,000 x 2
##        x      y
##    <dbl> <date>
##  1   404     NA
##  2  4172     NA
##  3  3004     NA
##  4   787     NA
##  5    37     NA
##  6  2332     NA
##  7  2489     NA
##  8  1449     NA
##  9  3665     NA
## 10  3863     NA
## # ... with 1,990 more rows
# compare file size
file.info("../data/challenge.csv")$size %>%
  utils:::format.object_size("auto")
## [1] "37.1 Kb"
file.info("../data/challenge.rds")$size %>%
  utils:::format.object_size("auto")
## [1] "31.9 Kb"

feather

readxl

library(readxl)

xlsx_example <- readxl_example("datasets.xlsx")
read_excel(xlsx_example)
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl>   <chr>
##  1          5.1         3.5          1.4         0.2  setosa
##  2          4.9         3.0          1.4         0.2  setosa
##  3          4.7         3.2          1.3         0.2  setosa
##  4          4.6         3.1          1.5         0.2  setosa
##  5          5.0         3.6          1.4         0.2  setosa
##  6          5.4         3.9          1.7         0.4  setosa
##  7          4.6         3.4          1.4         0.3  setosa
##  8          5.0         3.4          1.5         0.2  setosa
##  9          4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
## # ... with 140 more rows
excel_sheets(xlsx_example)
## [1] "iris"     "mtcars"   "chickwts" "quakes"
read_excel(xlsx_example, sheet = "chickwts")
## # A tibble: 71 x 2
##    weight      feed
##     <dbl>     <chr>
##  1    179 horsebean
##  2    160 horsebean
##  3    136 horsebean
##  4    227 horsebean
##  5    217 horsebean
##  6    168 horsebean
##  7    108 horsebean
##  8    124 horsebean
##  9    143 horsebean
## 10    140 horsebean
## # ... with 61 more rows
read_excel(xlsx_example, sheet = 2)
## # A tibble: 32 x 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4
##  2  21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4
##  3  22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1
##  4  21.4     6 258.0   110  3.08 3.215 19.44     1     0     3     1
##  5  18.7     8 360.0   175  3.15 3.440 17.02     0     0     3     2
##  6  18.1     6 225.0   105  2.76 3.460 20.22     1     0     3     1
##  7  14.3     8 360.0   245  3.21 3.570 15.84     0     0     3     4
##  8  24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
##  9  22.8     4 140.8    95  3.92 3.150 22.90     1     0     4     2
## 10  19.2     6 167.6   123  3.92 3.440 18.30     1     0     4     4
## # ... with 22 more rows

haven

library(haven)

# SAS
read_sas(system.file("examples", "iris.sas7bdat", package = "haven"))
## # A tibble: 150 x 5
##    Sepal_Length Sepal_Width Petal_Length Petal_Width Species
##           <dbl>       <dbl>        <dbl>       <dbl>   <chr>
##  1          5.1         3.5          1.4         0.2  setosa
##  2          4.9         3.0          1.4         0.2  setosa
##  3          4.7         3.2          1.3         0.2  setosa
##  4          4.6         3.1          1.5         0.2  setosa
##  5          5.0         3.6          1.4         0.2  setosa
##  6          5.4         3.9          1.7         0.4  setosa
##  7          4.6         3.4          1.4         0.3  setosa
##  8          5.0         3.4          1.5         0.2  setosa
##  9          4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
## # ... with 140 more rows
write_sas(mtcars, "../data/mtcars.sas7bdat")

# SPSS
read_sav(system.file("examples", "iris.sav", package = "haven"))
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
##           <dbl>       <dbl>        <dbl>       <dbl> <dbl+lbl>
##  1          5.1         3.5          1.4         0.2         1
##  2          4.9         3.0          1.4         0.2         1
##  3          4.7         3.2          1.3         0.2         1
##  4          4.6         3.1          1.5         0.2         1
##  5          5.0         3.6          1.4         0.2         1
##  6          5.4         3.9          1.7         0.4         1
##  7          4.6         3.4          1.4         0.3         1
##  8          5.0         3.4          1.5         0.2         1
##  9          4.4         2.9          1.4         0.2         1
## 10          4.9         3.1          1.5         0.1         1
## # ... with 140 more rows
write_sav(mtcars, "../data/mtcars.sav")

# Stata
read_dta(system.file("examples", "iris.dta", package = "haven"))
## # A tibble: 150 x 5
##    sepallength sepalwidth petallength petalwidth species
##          <dbl>      <dbl>       <dbl>      <dbl>   <chr>
##  1         5.1        3.5         1.4        0.2  setosa
##  2         4.9        3.0         1.4        0.2  setosa
##  3         4.7        3.2         1.3        0.2  setosa
##  4         4.6        3.1         1.5        0.2  setosa
##  5         5.0        3.6         1.4        0.2  setosa
##  6         5.4        3.9         1.7        0.4  setosa
##  7         4.6        3.4         1.4        0.3  setosa
##  8         5.0        3.4         1.5        0.2  setosa
##  9         4.4        2.9         1.4        0.2  setosa
## 10         4.9        3.1         1.5        0.1  setosa
## # ... with 140 more rows
write_dta(mtcars, "../data/mtcars.dta")

Tidy data

Figure 12.1 from [@hadley2016]

Common tidying tasks

  • Gathering
  • Spreading
  • Separating
  • Uniting

Gathering

table4a
## # A tibble: 3 x 3
##       country `1999` `2000`
## *       <chr>  <int>  <int>
## 1 Afghanistan    745   2666
## 2      Brazil  37737  80488
## 3       China 212258 213766

Gathering

table4a
## # A tibble: 3 x 3
##       country `1999` `2000`
## *       <chr>  <int>  <int>
## 1 Afghanistan    745   2666
## 2      Brazil  37737  80488
## 3       China 212258 213766
table4a %>% 
  gather(`1999`, `2000`, key = "year", value = "cases")
## # A tibble: 6 x 3
##       country  year  cases
##         <chr> <chr>  <int>
## 1 Afghanistan  1999    745
## 2      Brazil  1999  37737
## 3       China  1999 212258
## 4 Afghanistan  2000   2666
## 5      Brazil  2000  80488
## 6       China  2000 213766

Spreading

table2
## # A tibble: 12 x 4
##        country  year       type      count
##          <chr> <int>      <chr>      <int>
##  1 Afghanistan  1999      cases        745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000      cases       2666
##  4 Afghanistan  2000 population   20595360
##  5      Brazil  1999      cases      37737
##  6      Brazil  1999 population  172006362
##  7      Brazil  2000      cases      80488
##  8      Brazil  2000 population  174504898
##  9       China  1999      cases     212258
## 10       China  1999 population 1272915272
## 11       China  2000      cases     213766
## 12       China  2000 population 1280428583

Spreading

table2
## # A tibble: 12 x 4
##        country  year       type      count
##          <chr> <int>      <chr>      <int>
##  1 Afghanistan  1999      cases        745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000      cases       2666
##  4 Afghanistan  2000 population   20595360
##  5      Brazil  1999      cases      37737
##  6      Brazil  1999 population  172006362
##  7      Brazil  2000      cases      80488
##  8      Brazil  2000 population  174504898
##  9       China  1999      cases     212258
## 10       China  1999 population 1272915272
## 11       China  2000      cases     213766
## 12       China  2000 population 1280428583
table2 %>%
  spread(key = type, value = count)
## # A tibble: 6 x 4
##       country  year  cases population
## *       <chr> <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3      Brazil  1999  37737  172006362
## 4      Brazil  2000  80488  174504898
## 5       China  1999 212258 1272915272
## 6       China  2000 213766 1280428583

Separating

table3
## # A tibble: 6 x 3
##       country  year              rate
## *       <chr> <int>             <chr>
## 1 Afghanistan  1999      745/19987071
## 2 Afghanistan  2000     2666/20595360
## 3      Brazil  1999   37737/172006362
## 4      Brazil  2000   80488/174504898
## 5       China  1999 212258/1272915272
## 6       China  2000 213766/1280428583

Separating

table3
## # A tibble: 6 x 3
##       country  year              rate
## *       <chr> <int>             <chr>
## 1 Afghanistan  1999      745/19987071
## 2 Afghanistan  2000     2666/20595360
## 3      Brazil  1999   37737/172006362
## 4      Brazil  2000   80488/174504898
## 5       China  1999 212258/1272915272
## 6       China  2000 213766/1280428583
table3 %>% 
  separate(rate, into = c("cases", "population"))
## # A tibble: 6 x 4
##       country  year  cases population
## *       <chr> <int>  <chr>      <chr>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3      Brazil  1999  37737  172006362
## 4      Brazil  2000  80488  174504898
## 5       China  1999 212258 1272915272
## 6       China  2000 213766 1280428583

Uniting

table5
## # A tibble: 6 x 4
##       country century  year              rate
## *       <chr>   <chr> <chr>             <chr>
## 1 Afghanistan      19    99      745/19987071
## 2 Afghanistan      20    00     2666/20595360
## 3      Brazil      19    99   37737/172006362
## 4      Brazil      20    00   80488/174504898
## 5       China      19    99 212258/1272915272
## 6       China      20    00 213766/1280428583

Uniting

table5
## # A tibble: 6 x 4
##       country century  year              rate
## *       <chr>   <chr> <chr>             <chr>
## 1 Afghanistan      19    99      745/19987071
## 2 Afghanistan      20    00     2666/20595360
## 3      Brazil      19    99   37737/172006362
## 4      Brazil      20    00   80488/174504898
## 5       China      19    99 212258/1272915272
## 6       China      20    00 213766/1280428583
table5 %>% 
  unite(new, century, year)
## # A tibble: 6 x 3
##       country   new              rate
## *       <chr> <chr>             <chr>
## 1 Afghanistan 19_99      745/19987071
## 2 Afghanistan 20_00     2666/20595360
## 3      Brazil 19_99   37737/172006362
## 4      Brazil 20_00   80488/174504898
## 5       China 19_99 212258/1272915272
## 6       China 20_00 213766/1280428583
# remove underscore
table5 %>% 
  unite(new, century, year, sep = "")
## # A tibble: 6 x 3
##       country   new              rate
## *       <chr> <chr>             <chr>
## 1 Afghanistan  1999      745/19987071
## 2 Afghanistan  2000     2666/20595360
## 3      Brazil  1999   37737/172006362
## 4      Brazil  2000   80488/174504898
## 5       China  1999 212258/1272915272
## 6       China  2000 213766/1280428583