Vectors and iteration

MACS 30500 University of Chicago

Logical vectors

parse_logical(c(TRUE, TRUE, FALSE, TRUE, NA))

## [1]  TRUE  TRUE FALSE  TRUE    NA

Numeric vectors

parse_integer(c(1, 5, 3, 4, 12423))

## [1]     1     5     3     4 12423

parse_double(c(4.2, 4, 6, 53.2))

## [1]  4.2  4.0  6.0 53.2

Character vectors

parse_character(c("Goodnight Moon", "Runaway Bunny", "Big Red Barn"))

## [1] "Goodnight Moon" "Runaway Bunny"  "Big Red Barn"

Scalars

(x <- sample(10))

##  [1]  2  6  5  8  9  4  1  7 10  3

x + c(100, 100, 100, 100, 100, 100, 100, 100, 100, 100)

##  [1] 102 106 105 108 109 104 101 107 110 103

x + 100

##  [1] 102 106 105 108 109 104 101 107 110 103

Vector recycling

# create two sequences of numbers: one from 1 to 2, and one from 1 to 10
(x1 <- seq(from = 1, to = 2))

## [1] 1 2

(x2 <- seq(from = 1, to = 10))

##  [1]  1  2  3  4  5  6  7  8  9 10

# add together two sequences of numbers; the shorter one (x1) is recycled to match the length of x2
x1 + x2

##  [1]  2  4  4  6  6  8  8 10 10 12

Subsetting vectors

x <- c("one", "two", "three", "four", "five")

With positive integers

x[c(3, 2, 5)]
## [1] "three" "two"   "five"

With negative integers

x[c(-1, -3, -5)]
## [1] "two"  "four"

Don’t mix positive and negative integers

x[c(-1, 1)]
## Error in x[c(-1, 1)]: only 0's may be mixed with negative subscripts

Subset with a logical vector

(x <- c(10, 3, NA, 5, 8, 1, NA))

## [1] 10  3 NA  5  8  1 NA

# All non-missing values of x
!is.na(x)

## [1]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE

x[!is.na(x)]

## [1] 10  3  5  8  1

# All even (or missing!) values of x
x[x %% 2 == 0]

## [1] 10 NA  8 NA

Exercise on subsetting vectors

Lists

x <- list(1, 2, 3)
x

## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
## 
## [[3]]
## [1] 3

Lists: `str()`

str(x)

## List of 3
##  $ : num 1
##  $ : num 2
##  $ : num 3

x_named <- list(a = 1, b = 2, c = 3)
str(x_named)

## List of 3
##  $ a: num 1
##  $ b: num 2
##  $ c: num 3

Store a mix of objects

y <- list("a", 1L, 1.5, TRUE)
str(y)

## List of 4
##  $ : chr "a"
##  $ : int 1
##  $ : num 1.5
##  $ : logi TRUE

Nested lists

z <- list(list(1, 2), list(3, 4))
str(z)

## List of 2
##  $ :List of 2
##   ..$ : num 1
##   ..$ : num 2
##  $ :List of 2
##   ..$ : num 3
##   ..$ : num 4

Secret lists

str(diamonds)

## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

Exercise on subsetting lists

Iteration

df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)

median(df$a)
## [1] -0.5555419
median(df$b)
## [1] -0.4656169
median(df$c)
## [1] -0.605349
median(df$d)
## [1] -0.9248524

Iteration with `for` loop

output <- vector(mode = "double", length = ncol(df))
for (i in seq_along(df)) {
  output[[i]] <- median(df[[i]])
}
output

## [1] -0.5555419 -0.4656169 -0.6053490 -0.9248524

Output

output <- vector(mode = "double", length = ncol(df))

vector(mode = "double", length = ncol(df))

## [1] 0 0 0 0

vector(mode = "logical", length = ncol(df))

## [1] FALSE FALSE FALSE FALSE

vector(mode = "character", length = ncol(df))

## [1] "" "" "" ""

vector(mode = "list", length = ncol(df))

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL

Sequence

i in seq_along(df)

seq_along(df)

## [1] 1 2 3 4

Body

output[[i]] <- median(df[[i]])

Preallocation

x <- rnorm(1000, mean = 0, sd = 1)
str(x)

##  num [1:1000] -0.969 -1.107 -1.252 -0.524 -0.497 ...

# load microbenchmark library to time code
library(microbenchmark)

microbenchmark(
  # don't preallocate
  `No preallocation` = {
    output <- vector("numeric", 0)
    
    for (i in seq_along(x)) {
      output <- c(output, x[[i]] + 1)
    }
  },
  # preallocate
  `Preallocation` = {
    output <- vector("numeric", length(x))
    
    for (i in seq_along(x)) {
      output[[i]] <- x[[i]] + 1
    }
  }) %>%
  autoplot +
  scale_y_log10(breaks = c(2, 4, 8, 16, 32)) +
  labs(y = "Time [milliseconds]")

Exercise on writing `for` loops

Map functions

Why for loops are good
Why map() functions may be better
Types of map() functions
- map() makes a list
- map_lgl() makes a logical vector
- map_int() makes an integer vector
- map_dbl() makes a double vector
- map_chr() makes a character vector

Map functions

map_dbl(df, mean)

##           a           b           c           d 
## -0.32976859 -0.09851033 -0.50612789 -0.71983177

map_dbl(df, median)

##          a          b          c          d 
## -0.5555419 -0.4656169 -0.6053490 -0.9248524

map_dbl(df, sd)

##         a         b         c         d 
## 0.6377362 0.9825674 0.8589300 0.9474181

Map functions

map_dbl(df, mean, na.rm = TRUE)

##           a           b           c           d 
## -0.32976859 -0.09851033 -0.50612789 -0.71983177

Map functions

df %>%
  map_dbl(mean, na.rm = TRUE)

##           a           b           c           d 
## -0.32976859 -0.09851033 -0.50612789 -0.71983177

Exercise on writing `map()` functions

Scoped verbs

mtcars %>%
  summarize(mpg = mean(mpg))

##        mpg
## 1 20.09062

Scoped verbs

mtcars %>%
  summarize(mpg = mean(mpg),
            cyl = mean(cyl),
            disp = mean(disp),
            hp = mean(hp),
            drat = mean(drat),
            wt = mean(wt),
            qsec = mean(qsec),
            vs = mean(vs),
            am = mean(am),
            gear = mean(gear),
            carb = mean(carb))

##        mpg    cyl     disp       hp     drat      wt     qsec     vs
## 1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375
##        am   gear   carb
## 1 0.40625 3.6875 2.8125

Scoped verbs

- `_if` allows you to pick variables based on a predicate function like `is.numeric()` or `is.character()`
- `_at` allows you to pick variables using the same syntax as `select()`
- `_all` operates on all variables

`summarize_all()`

summarize_all(mtcars, mean)

##        mpg    cyl     disp       hp     drat      wt     qsec     vs
## 1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375
##        am   gear   carb
## 1 0.40625 3.6875 2.8125

summarize_all(mtcars, funs(min, max))

##   mpg_min cyl_min disp_min hp_min drat_min wt_min qsec_min vs_min am_min
## 1    10.4       4     71.1     52     2.76  1.513     14.5      0      0
##   gear_min carb_min mpg_max cyl_max disp_max hp_max drat_max wt_max
## 1        3        1    33.9       8      472    335     4.93  5.424
##   qsec_max vs_max am_max gear_max carb_max
## 1     22.9      1      1        5        8

mtcars %>%
  group_by(gear) %>%
  summarize_all(mean)

## # A tibble: 3 x 11
##    gear   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  carb
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    3.  16.1  7.47  326. 176.   3.13  3.89  17.7 0.200 0.     2.67
## 2    4.  24.5  4.67  123.  89.5  4.04  2.62  19.0 0.833 0.667  2.33
## 3    5.  21.4  6.00  202. 196.   3.92  2.63  15.6 0.200 1.00   4.40

`summarize_at()`

summarize_at(mtcars, vars(-mpg), mean)

##      cyl     disp       hp     drat      wt     qsec     vs      am   gear
## 1 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375 0.40625 3.6875
##     carb
## 1 2.8125

summarize_at(mtcars, vars(mpg), funs(min, max))

##    min  max
## 1 10.4 33.9

summarize_at(mtcars, vars(mpg, wt), min)

##    mpg    wt
## 1 10.4 1.513

summarize_at(mtcars, vars(-mpg), funs(min, max))

##   cyl_min disp_min hp_min drat_min wt_min qsec_min vs_min am_min gear_min
## 1       4     71.1     52     2.76  1.513     14.5      0      0        3
##   carb_min cyl_max disp_max hp_max drat_max wt_max qsec_max vs_max am_max
## 1        1       8      472    335     4.93  5.424     22.9      1      1
##   gear_max carb_max
## 1        5        8

`summarize_if()`

starwars

## # A tibble: 87 x 13
##    name     height  mass hair_color skin_color eye_color birth_year gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
##  1 Luke Sk…    172   77. blond      fair       blue            19.0 male  
##  2 C-3PO       167   75. <NA>       gold       yellow         112.  <NA>  
##  3 R2-D2        96   32. <NA>       white, bl… red             33.0 <NA>  
##  4 Darth V…    202  136. none       white      yellow          41.9 male  
##  5 Leia Or…    150   49. brown      light      brown           19.0 female
##  6 Owen La…    178  120. brown, gr… light      blue            52.0 male  
##  7 Beru Wh…    165   75. brown      light      blue            47.0 female
##  8 R5-D4        97   32. <NA>       white, red red             NA   <NA>  
##  9 Biggs D…    183   84. black      light      brown           24.0 male  
## 10 Obi-Wan…    182   77. auburn, w… fair       blue-gray       57.0 male  
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>

starwars %>%
  group_by(species) %>%
  summarize_if(is.numeric, mean, na.rm = TRUE)

## # A tibble: 38 x 4
##    species   height  mass birth_year
##    <chr>      <dbl> <dbl>      <dbl>
##  1 Aleena      79.0  15.0     NaN   
##  2 Besalisk   198.  102.      NaN   
##  3 Cerean     198.   82.0      92.0 
##  4 Chagrian   196.  NaN       NaN   
##  5 Clawdite   168.   55.0     NaN   
##  6 Droid      140.   69.8      53.3 
##  7 Dug        112.   40.0     NaN   
##  8 Ewok        88.0  20.0       8.00
##  9 Geonosian  183.   80.0     NaN   
## 10 Gungan     209.   74.0      52.0 
## # ... with 28 more rows

Mutate

mutate_all(mtcars, log10)

##         mpg       cyl     disp       hp      drat        wt     qsec   vs
## 1  1.322219 0.7781513 2.204120 2.041393 0.5910646 0.4183013 1.216430 -Inf
## 2  1.322219 0.7781513 2.204120 2.041393 0.5910646 0.4586378 1.230960 -Inf
## 3  1.357935 0.6020600 2.033424 1.968483 0.5854607 0.3654880 1.269746    0
## 4  1.330414 0.7781513 2.411620 2.041393 0.4885507 0.5071810 1.288696    0
## 5  1.271842 0.9030900 2.556303 2.243038 0.4983106 0.5365584 1.230960 -Inf
## 6  1.257679 0.7781513 2.352183 2.021189 0.4409091 0.5390761 1.305781    0
## 7  1.155336 0.9030900 2.556303 2.389166 0.5065050 0.5526682 1.199755 -Inf
## 8  1.387390 0.6020600 2.166430 1.792392 0.5670264 0.5037907 1.301030    0
## 9  1.357935 0.6020600 2.148603 1.977724 0.5932861 0.4983106 1.359835    0
## 10 1.283301 0.7781513 2.224274 2.089905 0.5932861 0.5365584 1.262451    0
## 11 1.250420 0.7781513 2.224274 2.089905 0.5932861 0.5365584 1.276462    0
## 12 1.214844 0.9030900 2.440594 2.255273 0.4871384 0.6095944 1.240549 -Inf
## 13 1.238046 0.9030900 2.440594 2.255273 0.4871384 0.5717088 1.245513 -Inf
## 14 1.181844 0.9030900 2.440594 2.255273 0.4871384 0.5774918 1.255273 -Inf
## 15 1.017033 0.9030900 2.673942 2.311754 0.4668676 0.7201593 1.254790 -Inf
## 16 1.017033 0.9030900 2.662758 2.332438 0.4771213 0.7343197 1.250908 -Inf
## 17 1.167317 0.9030900 2.643453 2.361728 0.5092025 0.7279477 1.241048 -Inf
## 18 1.510545 0.6020600 1.895975 1.819544 0.6106602 0.3424227 1.289366    0
## 19 1.482874 0.6020600 1.879096 1.716003 0.6928469 0.2081725 1.267641    0
## 20 1.530200 0.6020600 1.851870 1.812913 0.6253125 0.2636361 1.298853    0
## 21 1.332438 0.6020600 2.079543 1.986772 0.5682017 0.3918169 1.301247    0
## 22 1.190332 0.9030900 2.502427 2.176091 0.4409091 0.5465427 1.227115 -Inf
## 23 1.181844 0.9030900 2.482874 2.176091 0.4983106 0.5359267 1.238046 -Inf
## 24 1.123852 0.9030900 2.544068 2.389166 0.5717088 0.5843312 1.187803 -Inf
## 25 1.283301 0.9030900 2.602060 2.243038 0.4885507 0.5848963 1.231724 -Inf
## 26 1.436163 0.6020600 1.897627 1.819544 0.6106602 0.2866810 1.276462    0
## 27 1.414973 0.6020600 2.080266 1.959041 0.6464037 0.3304138 1.222716 -Inf
## 28 1.482874 0.6020600 1.978181 2.053078 0.5763414 0.1798389 1.227887    0
## 29 1.198657 0.9030900 2.545307 2.421604 0.6253125 0.5010593 1.161368 -Inf
## 30 1.294466 0.7781513 2.161368 2.243038 0.5587086 0.4424798 1.190332 -Inf
## 31 1.176091 0.9030900 2.478566 2.525045 0.5490033 0.5526682 1.164353 -Inf
## 32 1.330414 0.6020600 2.082785 2.037426 0.6138418 0.4440448 1.269513    0
##      am      gear      carb
## 1     0 0.6020600 0.6020600
## 2     0 0.6020600 0.6020600
## 3     0 0.6020600 0.0000000
## 4  -Inf 0.4771213 0.0000000
## 5  -Inf 0.4771213 0.3010300
## 6  -Inf 0.4771213 0.0000000
## 7  -Inf 0.4771213 0.6020600
## 8  -Inf 0.6020600 0.3010300
## 9  -Inf 0.6020600 0.3010300
## 10 -Inf 0.6020600 0.6020600
## 11 -Inf 0.6020600 0.6020600
## 12 -Inf 0.4771213 0.4771213
## 13 -Inf 0.4771213 0.4771213
## 14 -Inf 0.4771213 0.4771213
## 15 -Inf 0.4771213 0.6020600
## 16 -Inf 0.4771213 0.6020600
## 17 -Inf 0.4771213 0.6020600
## 18    0 0.6020600 0.0000000
## 19    0 0.6020600 0.3010300
## 20    0 0.6020600 0.0000000
## 21 -Inf 0.4771213 0.0000000
## 22 -Inf 0.4771213 0.3010300
## 23 -Inf 0.4771213 0.3010300
## 24 -Inf 0.4771213 0.6020600
## 25 -Inf 0.4771213 0.3010300
## 26    0 0.6020600 0.0000000
## 27    0 0.6989700 0.3010300
## 28    0 0.6989700 0.3010300
## 29    0 0.6989700 0.6020600
## 30    0 0.6989700 0.7781513
## 31    0 0.6989700 0.9030900
## 32    0 0.6020600 0.3010300

Filter

library(nycflights13)

# Rows where any value is missing
filter_all(weather, any_vars(is.na(.)))

## # A tibble: 3,109 x 15
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr>  <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1 EWR    2013.    1.     1    17  39.2  28.4  64.9     270.      16.1 
##  2 EWR    2013.    1.     1    18  39.2  28.4  64.9     330.      15.0 
##  3 EWR    2013.    1.     3    16  30.9  14.0  49.0      NA        4.60
##  4 EWR    2013.    1.     6    10  33.8  30.2  86.5     210.       4.60
##  5 EWR    2013.    1.     6    12  33.8  32.0  93.0     220.       9.21
##  6 EWR    2013.    1.     6    13  35.6  32.0  86.6     240.       8.06
##  7 EWR    2013.    1.     6    14  35.6  32.0  86.6     230.       8.06
##  8 EWR    2013.    1.     6    15  37.4  30.2  75.0     250.      11.5 
##  9 EWR    2013.    1.    11    17  45.0  36.0  70.5      NA        4.60
## 10 EWR    2013.    1.    11    21  46.4  39.2  75.8      90.       4.60
## # ... with 3,099 more rows, and 5 more variables: wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>

# Rows where all wind variables are missing
filter_at(weather, vars(starts_with("wind")), all_vars(is.na(.)))

## # A tibble: 3 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##   <chr>  <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
## 1 EWR    2013.    3.    27    21  52.0  19.0  27.0       NA         NA
## 2 JFK    2013.    7.     4    10  73.0  71.1  93.5       NA         NA
## 3 JFK    2013.    7.    20    10  81.0  71.1  71.9       NA         NA
## # ... with 5 more variables: wind_gust <dbl>, precip <dbl>,
## #   pressure <dbl>, visib <dbl>, time_hour <dttm>

Vectors and iteration

MACS 30500 University of Chicago

Logical vectors

Numeric vectors

Character vectors

Scalars

Vector recycling

Subsetting vectors

Subset with a logical vector

Exercise on subsetting vectors

Lists

Lists: str()

Store a mix of objects

Nested lists

Secret lists

Exercise on subsetting lists

Iteration

Iteration with for loop

Output

Sequence

Body

Preallocation

Exercise on writing for loops

Map functions

Map functions

Map functions

Map functions

Exercise on writing map() functions

Scoped verbs

Scoped verbs

Scoped verbs

summarize_all()

summarize_at()

summarize_if()

Mutate

Filter

Lists: `str()`

Iteration with `for` loop

Exercise on writing `for` loops

Exercise on writing `map()` functions

`summarize_all()`

`summarize_at()`

`summarize_if()`