Vectors and iteration

MACS 30500 University of Chicago

Logical vectors

# readr parsers take character input, so the values are supplied as strings;
# the string "NA" is parsed as a missing value
parse_logical(c("TRUE", "TRUE", "FALSE", "TRUE", "NA"))
## [1]  TRUE  TRUE FALSE  TRUE    NA

Numeric vectors

# readr parsers take character input, so the numbers are supplied as strings
parse_integer(c("1", "5", "3", "4", "12423"))
## [1]     1     5     3     4 12423
# the same applies to doubles: strings in, numeric vector out
parse_double(c("4.2", "4", "6", "53.2"))
## [1]  4.2  4.0  6.0 53.2

Character vectors

# three book titles, parsed as a character vector
book_titles <- c("Goodnight Moon", "Runaway Bunny", "Big Red Barn")
parse_character(book_titles)
## [1] "Goodnight Moon" "Runaway Bunny"  "Big Red Barn"

Scalars

# shuffle the integers 1 through 10 and print the result
(x <- sample(10))
##  [1]  2  6  5  8  9  4  1  7 10  3
# add 100 to every element using an explicit vector of ten 100s
hundreds <- rep(100, times = 10)
x + hundreds
##  [1] 102 106 105 108 109 104 101 107 110 103
# a single 100 gives the same result
x + 100
##  [1] 102 106 105 108 109 104 101 107 110 103

Vector recycling

# create two sequences: a short one (1 to 2) and a long one (1 to 10)
(x1 <- seq_len(2))
## [1] 1 2
(x2 <- seq_len(10))
##  [1]  1  2  3  4  5  6  7  8  9 10
# add them together: the shorter x1 is recycled along the length of x2
x1 + x2
##  [1]  2  4  4  6  6  8  8 10 10 12

Subsetting vectors

# five number words used in the subsetting examples below
x <- c(
  "one", "two", "three", "four", "five"
)
  • With positive integers

    # elements come back in the order the positions are listed
    keep <- c(3, 2, 5)
    x[keep]
    ## [1] "three" "two"   "five"
  • With negative integers

    # negative positions remove those elements and keep the rest
    omit <- c(-1, -3, -5)
    x[omit]
    ## [1] "two"  "four"
  • Don’t mix positive and negative

    # a negative and a positive position in the same index raises an error
    x[c(-1, 1)]
    ## Error in x[c(-1, 1)]: only 0's may be mixed with negative subscripts

Subset with a logical vector

# a numeric vector with two missing values
(x <- c(10, 3, NA, 5, 8, 1, NA))
## [1] 10  3 NA  5  8  1 NA
# All non-missing values of x: build the logical mask first, then apply it
not_missing <- !is.na(x)
not_missing
## [1]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
x[not_missing]
## [1] 10  3  5  8  1
# All even (or missing!) values of x: the comparison is NA where x is NA,
# and an NA index yields an NA element
is_even <- x %% 2 == 0
x[is_even]
## [1] 10 NA  8 NA

Exercise on subsetting vectors

Lists

# build a three-element list and print it in one step
(x <- list(1, 2, 3))
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
## 
## [[3]]
## [1] 3

Lists: str()

# compact structure summary of the unnamed list
str(x)
## List of 3
##  $ : num 1
##  $ : num 2
##  $ : num 3
# the same values, now with element names a, b and c
x_named <- setNames(list(1, 2, 3), c("a", "b", "c"))
str(x_named)
## List of 3
##  $ a: num 1
##  $ b: num 2
##  $ c: num 3

Store a mix of objects

# one list holding four different element types
y <- list(
  "a",  # character
  1L,   # integer
  1.5,  # double
  TRUE  # logical
)
str(object = y)
## List of 4
##  $ : chr "a"
##  $ : int 1
##  $ : num 1.5
##  $ : logi TRUE

Nested lists

# a list whose two elements are themselves two-element lists
first_pair <- list(1, 2)
second_pair <- list(3, 4)
z <- list(first_pair, second_pair)
str(object = z)
## List of 2
##  $ :List of 2
##   ..$ : num 1
##   ..$ : num 2
##  $ :List of 2
##   ..$ : num 3
##   ..$ : num 4

Secret lists

# structure summary of the diamonds data: one entry per column
str(object = diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

Exercise on subsetting lists

Iteration

# four columns, each holding ten random normal draws
df <- tibble(a = rnorm(10), b = rnorm(10), c = rnorm(10), d = rnorm(10))
# the median of each column, written out one call per column
median(df[["a"]])
## [1] -0.5555419
median(df[["b"]])
## [1] -0.4656169
median(df[["c"]])
## [1] -0.605349
median(df[["d"]])
## [1] -0.9248524

Iteration with for loop

# 1. output: preallocate a double vector with one slot per column of df
output <- vector(mode = "double", length = ncol(df))
# 2. sequence: loop over the column positions of df
for (i in seq_along(df)) {
  # 3. body: store the median of column i in slot i
  output[[i]] <- median(df[[i]])
}
# print the filled vector
output
## [1] -0.5555419 -0.4656169 -0.6053490 -0.9248524

Output

# the preallocation line from the loop: one double slot per column of df
output <- vector(mode = "double", length = ncol(df))
# the fill value of the empty vector depends on the requested mode
vector(mode = "double", length = ncol(df))
## [1] 0 0 0 0
vector(mode = "logical", length = ncol(df))
## [1] FALSE FALSE FALSE FALSE
vector(mode = "character", length = ncol(df))
## [1] "" "" "" ""
# for mode "list", every slot starts out as NULL
vector(mode = "list", length = ncol(df))
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL

Sequence

# excerpt of the loop header: i takes each value of seq_along(df) in turn
i in seq_along(df)
# the positions the loop visits
seq_along(df)
## [1] 1 2 3 4

Body

# excerpt of the loop body: store the median of column i in slot i of output
output[[i]] <- median(df[[i]])

Preallocation

# 1000 standard-normal draws (mean 0 and sd 1 are rnorm's defaults)
x <- rnorm(n = 1000)
str(object = x)
##  num [1:1000] -0.969 -1.107 -1.252 -0.524 -0.497 ...
# load microbenchmark library to time code
library(microbenchmark)

# time two ways of building the vector x + 1 element by element
timings <- microbenchmark(
  # grow the result by one element on every iteration
  `No preallocation` = {
    output <- vector("numeric", 0)

    for (i in seq_along(x)) {
      output <- c(output, x[[i]] + 1)
    }
  },
  # allocate the full length up front, then fill each slot
  `Preallocation` = {
    output <- vector("numeric", length(x))

    for (i in seq_along(x)) {
      output[[i]] <- x[[i]] + 1
    }
  }
)

# plot the timings for both approaches on a log-scaled axis
autoplot(timings) +
  scale_y_log10(breaks = c(2, 4, 8, 16, 32)) +
  labs(y = "Time [milliseconds]")

Exercise on writing for loops

Map functions

  • Why for loops are good
  • Why map() functions may be better
  • Types of map() functions
    • map() makes a list
    • map_lgl() makes a logical vector
    • map_int() makes an integer vector
    • map_dbl() makes a double vector
    • map_chr() makes a character vector

Map functions

# apply a summary function to every column; the result is a named double vector
map_dbl(.x = df, .f = mean)
##           a           b           c           d 
## -0.32976859 -0.09851033 -0.50612789 -0.71983177
map_dbl(.x = df, .f = median)
##          a          b          c          d 
## -0.5555419 -0.4656169 -0.6053490 -0.9248524
map_dbl(.x = df, .f = sd)
##         a         b         c         d 
## 0.6377362 0.9825674 0.8589300 0.9474181

Map functions

# arguments after the function (here na.rm) are passed along to mean()
map_dbl(.x = df, .f = mean, na.rm = TRUE)
##           a           b           c           d 
## -0.32976859 -0.09851033 -0.50612789 -0.71983177

Map functions

# the same call with df piped in as the first argument
df %>% map_dbl(.f = mean, na.rm = TRUE)
##           a           b           c           d 
## -0.32976859 -0.09851033 -0.50612789 -0.71983177

Exercise on writing map() functions

Scoped verbs

# mean of a single column
summarize(mtcars, mpg = mean(mpg))
##        mpg
## 1 20.09062

Scoped verbs

# the same summary written out by hand for every column of mtcars
summarize(
  mtcars,
  mpg = mean(mpg),
  cyl = mean(cyl),
  disp = mean(disp),
  hp = mean(hp),
  drat = mean(drat),
  wt = mean(wt),
  qsec = mean(qsec),
  vs = mean(vs),
  am = mean(am),
  gear = mean(gear),
  carb = mean(carb)
)
##        mpg    cyl     disp       hp     drat      wt     qsec     vs
## 1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375
##        am   gear   carb
## 1 0.40625 3.6875 2.8125

Scoped verbs

  • _if allows you to pick variables based on a predicate function like is.numeric() or is.character()
  • _at allows you to pick variables using the same syntax as select()
  • _all operates on all variables

summarize_all()

# one function: the mean of every column
summarize_all(mtcars, mean)
##        mpg    cyl     disp       hp     drat      wt     qsec     vs
## 1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375
##        am   gear   carb
## 1 0.40625 3.6875 2.8125
# several functions: pass a named list (funs() is deprecated since dplyr 0.8.0);
# the list names become the column suffixes
summarize_all(mtcars, list(min = min, max = max))
##   mpg_min cyl_min disp_min hp_min drat_min wt_min qsec_min vs_min am_min
## 1    10.4       4     71.1     52     2.76  1.513     14.5      0      0
##   gear_min carb_min mpg_max cyl_max disp_max hp_max drat_max wt_max
## 1        3        1    33.9       8      472    335     4.93  5.424
##   qsec_max vs_max am_max gear_max carb_max
## 1     22.9      1      1        5        8
# grouped data: the mean of every other column within each value of gear
mtcars %>%
  group_by(gear) %>%
  summarize_all(mean)
## # A tibble: 3 x 11
##    gear   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  carb
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    3.  16.1  7.47  326. 176.   3.13  3.89  17.7 0.200 0.     2.67
## 2    4.  24.5  4.67  123.  89.5  4.04  2.62  19.0 0.833 0.667  2.33
## 3    5.  21.4  6.00  202. 196.   3.92  2.63  15.6 0.200 1.00   4.40

summarize_at()

# every column except mpg
summarize_at(mtcars, vars(-mpg), mean)
##      cyl     disp       hp     drat      wt     qsec     vs      am   gear
## 1 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375 0.40625 3.6875
##     carb
## 1 2.8125
# one column, several functions: pass a named list
# (funs() is deprecated since dplyr 0.8.0); output columns take the list names
summarize_at(mtcars, vars(mpg), list(min = min, max = max))
##    min  max
## 1 10.4 33.9
# several columns, one function: output columns keep the variable names
summarize_at(mtcars, vars(mpg, wt), min)
##    mpg    wt
## 1 10.4 1.513
# several columns and several functions: names are variable_function
summarize_at(mtcars, vars(-mpg), list(min = min, max = max))
##   cyl_min disp_min hp_min drat_min wt_min qsec_min vs_min am_min gear_min
## 1       4     71.1     52     2.76  1.513     14.5      0      0        3
##   carb_min cyl_max disp_max hp_max drat_max wt_max qsec_max vs_max am_max
## 1        1       8      472    335     4.93  5.424     22.9      1      1
##   gear_max carb_max
## 1        5        8

summarize_if()

# show the starwars tibble
print(starwars)
## # A tibble: 87 x 13
##    name     height  mass hair_color skin_color eye_color birth_year gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
##  1 Luke Sk…    172   77. blond      fair       blue            19.0 male  
##  2 C-3PO       167   75. <NA>       gold       yellow         112.  <NA>  
##  3 R2-D2        96   32. <NA>       white, bl… red             33.0 <NA>  
##  4 Darth V…    202  136. none       white      yellow          41.9 male  
##  5 Leia Or…    150   49. brown      light      brown           19.0 female
##  6 Owen La…    178  120. brown, gr… light      blue            52.0 male  
##  7 Beru Wh…    165   75. brown      light      blue            47.0 female
##  8 R5-D4        97   32. <NA>       white, red red             NA   <NA>  
##  9 Biggs D…    183   84. black      light      brown           24.0 male  
## 10 Obi-Wan…    182   77. auburn, w… fair       blue-gray       57.0 male  
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>
# mean of each numeric column within every species, dropping missing values
by_species <- group_by(starwars, species)
summarize_if(by_species, is.numeric, mean, na.rm = TRUE)
## # A tibble: 38 x 4
##    species   height  mass birth_year
##    <chr>      <dbl> <dbl>      <dbl>
##  1 Aleena      79.0  15.0     NaN   
##  2 Besalisk   198.  102.      NaN   
##  3 Cerean     198.   82.0      92.0 
##  4 Chagrian   196.  NaN       NaN   
##  5 Clawdite   168.   55.0     NaN   
##  6 Droid      140.   69.8      53.3 
##  7 Dug        112.   40.0     NaN   
##  8 Ewok        88.0  20.0       8.00
##  9 Geonosian  183.   80.0     NaN   
## 10 Gungan     209.   74.0      52.0 
## # ... with 28 more rows

Mutate

# replace every column with its base-10 logarithm (zeros become -Inf)
mtcars %>%
  mutate_all(log10)
##         mpg       cyl     disp       hp      drat        wt     qsec   vs
## 1  1.322219 0.7781513 2.204120 2.041393 0.5910646 0.4183013 1.216430 -Inf
## 2  1.322219 0.7781513 2.204120 2.041393 0.5910646 0.4586378 1.230960 -Inf
## 3  1.357935 0.6020600 2.033424 1.968483 0.5854607 0.3654880 1.269746    0
## 4  1.330414 0.7781513 2.411620 2.041393 0.4885507 0.5071810 1.288696    0
## 5  1.271842 0.9030900 2.556303 2.243038 0.4983106 0.5365584 1.230960 -Inf
## 6  1.257679 0.7781513 2.352183 2.021189 0.4409091 0.5390761 1.305781    0
## 7  1.155336 0.9030900 2.556303 2.389166 0.5065050 0.5526682 1.199755 -Inf
## 8  1.387390 0.6020600 2.166430 1.792392 0.5670264 0.5037907 1.301030    0
## 9  1.357935 0.6020600 2.148603 1.977724 0.5932861 0.4983106 1.359835    0
## 10 1.283301 0.7781513 2.224274 2.089905 0.5932861 0.5365584 1.262451    0
## 11 1.250420 0.7781513 2.224274 2.089905 0.5932861 0.5365584 1.276462    0
## 12 1.214844 0.9030900 2.440594 2.255273 0.4871384 0.6095944 1.240549 -Inf
## 13 1.238046 0.9030900 2.440594 2.255273 0.4871384 0.5717088 1.245513 -Inf
## 14 1.181844 0.9030900 2.440594 2.255273 0.4871384 0.5774918 1.255273 -Inf
## 15 1.017033 0.9030900 2.673942 2.311754 0.4668676 0.7201593 1.254790 -Inf
## 16 1.017033 0.9030900 2.662758 2.332438 0.4771213 0.7343197 1.250908 -Inf
## 17 1.167317 0.9030900 2.643453 2.361728 0.5092025 0.7279477 1.241048 -Inf
## 18 1.510545 0.6020600 1.895975 1.819544 0.6106602 0.3424227 1.289366    0
## 19 1.482874 0.6020600 1.879096 1.716003 0.6928469 0.2081725 1.267641    0
## 20 1.530200 0.6020600 1.851870 1.812913 0.6253125 0.2636361 1.298853    0
## 21 1.332438 0.6020600 2.079543 1.986772 0.5682017 0.3918169 1.301247    0
## 22 1.190332 0.9030900 2.502427 2.176091 0.4409091 0.5465427 1.227115 -Inf
## 23 1.181844 0.9030900 2.482874 2.176091 0.4983106 0.5359267 1.238046 -Inf
## 24 1.123852 0.9030900 2.544068 2.389166 0.5717088 0.5843312 1.187803 -Inf
## 25 1.283301 0.9030900 2.602060 2.243038 0.4885507 0.5848963 1.231724 -Inf
## 26 1.436163 0.6020600 1.897627 1.819544 0.6106602 0.2866810 1.276462    0
## 27 1.414973 0.6020600 2.080266 1.959041 0.6464037 0.3304138 1.222716 -Inf
## 28 1.482874 0.6020600 1.978181 2.053078 0.5763414 0.1798389 1.227887    0
## 29 1.198657 0.9030900 2.545307 2.421604 0.6253125 0.5010593 1.161368 -Inf
## 30 1.294466 0.7781513 2.161368 2.243038 0.5587086 0.4424798 1.190332 -Inf
## 31 1.176091 0.9030900 2.478566 2.525045 0.5490033 0.5526682 1.164353 -Inf
## 32 1.330414 0.6020600 2.082785 2.037426 0.6138418 0.4440448 1.269513    0
##      am      gear      carb
## 1     0 0.6020600 0.6020600
## 2     0 0.6020600 0.6020600
## 3     0 0.6020600 0.0000000
## 4  -Inf 0.4771213 0.0000000
## 5  -Inf 0.4771213 0.3010300
## 6  -Inf 0.4771213 0.0000000
## 7  -Inf 0.4771213 0.6020600
## 8  -Inf 0.6020600 0.3010300
## 9  -Inf 0.6020600 0.3010300
## 10 -Inf 0.6020600 0.6020600
## 11 -Inf 0.6020600 0.6020600
## 12 -Inf 0.4771213 0.4771213
## 13 -Inf 0.4771213 0.4771213
## 14 -Inf 0.4771213 0.4771213
## 15 -Inf 0.4771213 0.6020600
## 16 -Inf 0.4771213 0.6020600
## 17 -Inf 0.4771213 0.6020600
## 18    0 0.6020600 0.0000000
## 19    0 0.6020600 0.3010300
## 20    0 0.6020600 0.0000000
## 21 -Inf 0.4771213 0.0000000
## 22 -Inf 0.4771213 0.3010300
## 23 -Inf 0.4771213 0.3010300
## 24 -Inf 0.4771213 0.6020600
## 25 -Inf 0.4771213 0.3010300
## 26    0 0.6020600 0.0000000
## 27    0 0.6989700 0.3010300
## 28    0 0.6989700 0.3010300
## 29    0 0.6989700 0.6020600
## 30    0 0.6989700 0.7781513
## 31    0 0.6989700 0.9030900
## 32    0 0.6020600 0.3010300

Filter

library(nycflights13)

# Rows where any value is missing: the predicate is applied to every column
filter_all(.tbl = weather, .vars_predicate = any_vars(is.na(.)))
## # A tibble: 3,109 x 15
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr>  <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1 EWR    2013.    1.     1    17  39.2  28.4  64.9     270.      16.1 
##  2 EWR    2013.    1.     1    18  39.2  28.4  64.9     330.      15.0 
##  3 EWR    2013.    1.     3    16  30.9  14.0  49.0      NA        4.60
##  4 EWR    2013.    1.     6    10  33.8  30.2  86.5     210.       4.60
##  5 EWR    2013.    1.     6    12  33.8  32.0  93.0     220.       9.21
##  6 EWR    2013.    1.     6    13  35.6  32.0  86.6     240.       8.06
##  7 EWR    2013.    1.     6    14  35.6  32.0  86.6     230.       8.06
##  8 EWR    2013.    1.     6    15  37.4  30.2  75.0     250.      11.5 
##  9 EWR    2013.    1.    11    17  45.0  36.0  70.5      NA        4.60
## 10 EWR    2013.    1.    11    21  46.4  39.2  75.8      90.       4.60
## # ... with 3,099 more rows, and 5 more variables: wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
# Rows where all wind variables are missing: only columns whose names start
# with "wind" are tested
filter_at(
  .tbl = weather,
  .vars = vars(starts_with("wind")),
  .vars_predicate = all_vars(is.na(.))
)
## # A tibble: 3 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##   <chr>  <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
## 1 EWR    2013.    3.    27    21  52.0  19.0  27.0       NA         NA
## 2 JFK    2013.    7.     4    10  73.0  71.1  93.5       NA         NA
## 3 JFK    2013.    7.    20    10  81.0  71.1  71.9       NA         NA
## # ... with 5 more variables: wind_gust <dbl>, precip <dbl>,
## #   pressure <dbl>, visib <dbl>, time_hour <dttm>