Pipes and functions in R

MACS 30500 University of Chicago

Using the diamonds dataset, calculate the average price for each cut of “I” colored diamonds.

  1. Filter diamonds to only keep observations where the color is rated as “I”
  2. Group the filtered diamonds data frame by cut
  3. Summarize the grouped and filtered diamonds data frame by calculating the average price

Intermediate steps

# step 1: keep only the diamonds whose color is "I"
diamonds_1 <- filter(diamonds, color == "I")
# step 2: group the filtered rows by cut
diamonds_2 <- group_by(diamonds_1, cut)
# step 3: average price within each cut; the outer parentheses print the result
(diamonds_3 <- summarize(diamonds_2, price = mean(price)))
## # A tibble: 5 x 2
##         cut    price
##       <ord>    <dbl>
## 1      Fair 4685.446
## 2      Good 5078.533
## 3 Very Good 5255.880
## 4   Premium 5946.181
## 5     Ideal 4451.970

Overwrite the original

# copy diamonds to diamonds_t just for demonstration purposes
diamonds_t <- diamonds

# each step overwrites diamonds_t with its own result, so the intermediate
# data frames are not kept; the outer parentheses on the last line print it
diamonds_t <- filter(diamonds_t, color == "I")
diamonds_t <- group_by(diamonds_t, cut)
(diamonds_t <- summarize(diamonds_t, price = mean(price)))
## # A tibble: 5 x 2
##         cut    price
##       <ord>    <dbl>
## 1      Fair 4685.446
## 2      Good 5078.533
## 3 Very Good 5255.880
## 4   Premium 5946.181
## 5     Ideal 4451.970

Function composition

# nested calls are read from the inside out:
# filter() first, then group_by(), then summarize()
summarize(
  group_by(
    filter(diamonds, color == "I"),
    cut
  ),
  price = mean(price)
)
## # A tibble: 5 x 2
##         cut    price
##       <ord>    <dbl>
## 1      Fair 4685.446
## 2      Good 5078.533
## 3 Very Good 5255.880
## 4   Premium 5946.181
## 5     Ideal 4451.970

Piping

# %>% passes the result on its left as the first argument of the call on its
# right, so the steps read top to bottom in the order they run
diamonds %>%
  filter(color == "I") %>%
  group_by(cut) %>%
  summarize(price = mean(price))
## # A tibble: 5 x 2
##         cut    price
##       <ord>    <dbl>
## 1      Fair 4685.446
## 2      Good 5078.533
## 3 Very Good 5255.880
## 4   Premium 5946.181
## 5     Ideal 4451.970

Exercise on piping

Functions

Function components

  • Name
  • Arguments
  • Body

Rescale function

#' Rescale a vector to the interval [0, 1]
#'
#' @param x Numeric vector. Missing values are ignored when the range is
#'   computed and stay missing in the result.
#' @return A vector the same length as `x` in which the smallest value maps
#'   to 0 and the largest to 1.
rescale01 <- function(x) {
  bounds <- range(x, na.rm = TRUE)
  lower <- bounds[1]
  span <- bounds[2] - lower
  (x - lower) / span
}

rescale01(c(0, 5, 10))
## [1] 0.0 0.5 1.0
rescale01(c(-10, 0, 10))
## [1] 0.0 0.5 1.0
rescale01(c(1, 2, 3, NA, 5))
## [1] 0.00 0.25 0.50   NA 1.00
  • Name
  • Arguments
  • Body

What is that?

#' Length of the hypotenuse of a right triangle
#'
#' @param a,b Numeric lengths of the two legs.
#' @return The value of `sqrt(a^2 + b^2)`.
pythagorean <- function(a, b) {
  squared_sum <- a^2 + b^2
  # hypotenuse lives only inside this function call
  hypotenuse <- sqrt(squared_sum)
  hypotenuse
}
  • Name
  • Arguments
  • Body

How to use a function

# print the output of the function
pythagorean(a = 3, b = 4)
## [1] 5
# save the output as a new object (the outer parentheses also print it)
(tri_c <- pythagorean(a = 3, b = 4))
## [1] 5
# what happens to the hypotenuse from inside the function?
pythagorean(a = 3, b = 4)
## [1] 5
# hypotenuse was assigned inside pythagorean(), so it is not defined out here
hypotenuse
## Error in eval(expr, envir, enclos): object 'hypotenuse' not found

Exercise

Conditional execution

# template: condition is expected to be a single TRUE or FALSE value
if (condition) {
  # code executed when condition is TRUE
} else {
  # code executed when condition is FALSE
}

Conditional execution

# template: conditions are checked from top to bottom; only the first branch
# whose condition is TRUE runs, and the final else runs when none of them is
if (this) {
  # do that
} else if (that) {
  # do something else
} else {
  # do something completely different
}

Conditional execution and cut()

# cut() turns the numeric carat column into a factor of intervals:
#   carat_autobin - breaks = 5 asks cut() for five equal-width intervals
#   carat_manbin  - four intervals between the cut points 0, 1, 2, 3 and 6,
#                   named by labels
diamonds %>%
  select(carat) %>%
  mutate(carat_autobin = cut(carat, breaks = 5),
         carat_manbin = cut(carat,
                            breaks = c(0, 1, 2, 3, 6),
                            labels = c("Small", "Medium",
                                       "Large", "Huge")))
## # A tibble: 53,940 x 3
##    carat carat_autobin carat_manbin
##    <dbl>        <fctr>       <fctr>
##  1  0.23  (0.195,1.16]        Small
##  2  0.21  (0.195,1.16]        Small
##  3  0.23  (0.195,1.16]        Small
##  4  0.29  (0.195,1.16]        Small
##  5  0.31  (0.195,1.16]        Small
##  6  0.24  (0.195,1.16]        Small
##  7  0.24  (0.195,1.16]        Small
##  8  0.26  (0.195,1.16]        Small
##  9  0.22  (0.195,1.16]        Small
## 10  0.23  (0.195,1.16]        Small
## # ... with 53,930 more rows

if() versus if_else()

# attach rcfss and load the gun_deaths data set
library(rcfss)
data("gun_deaths")

# keep only the education column; the outer parentheses print the result
(educ <- select(gun_deaths, education))
## # A tibble: 100,798 x 1
##       education
##          <fctr>
##  1          BA+
##  2 Some college
##  3          BA+
##  4          BA+
##  5       HS/GED
##  6 Less than HS
##  7       HS/GED
##  8       HS/GED
##  9 Some college
## 10         <NA>
## # ... with 100,788 more rows

if() versus if_else()

# if() is not vectorised: it makes a single decision for the whole column
# rather than one per row, which is why every row below is labelled "HS+".
# NOTE(review): R >= 4.2.0 raises an error when the if() condition has length
# greater than one, so this chunk no longer runs there - confirm the R version
(educ_if <- educ %>%
  mutate(hsPlus = if(education == "Less than HS"){
    "Less than HS"
  } else{
    "HS+"
  }))
## # A tibble: 100,798 x 2
##       education hsPlus
##          <fctr>  <chr>
##  1          BA+    HS+
##  2 Some college    HS+
##  3          BA+    HS+
##  4          BA+    HS+
##  5       HS/GED    HS+
##  6 Less than HS    HS+
##  7       HS/GED    HS+
##  8       HS/GED    HS+
##  9 Some college    HS+
## 10         <NA>    HS+
## # ... with 100,788 more rows

# every row received the same label, so there is a single group
count(educ_if, hsPlus)
## # A tibble: 1 x 2
##   hsPlus      n
##    <chr>  <int>
## 1    HS+ 100798

if() versus if_else()

# if_else() is vectorised: the condition is evaluated for every row, and rows
# where education is NA get NA rather than one of the two labels
(educ_ifelse <- educ %>%
  mutate(hsPlus = if_else(education == "Less than HS",
                          "Less than HS",
                          "HS+")))
## # A tibble: 100,798 x 2
##       education       hsPlus
##          <fctr>        <chr>
##  1          BA+          HS+
##  2 Some college          HS+
##  3          BA+          HS+
##  4          BA+          HS+
##  5       HS/GED          HS+
##  6 Less than HS Less than HS
##  7       HS/GED          HS+
##  8       HS/GED          HS+
##  9 Some college          HS+
## 10         <NA>         <NA>
## # ... with 100,788 more rows

# rows are now split across both labels, plus NA for missing education
count(educ_ifelse, hsPlus)
## # A tibble: 3 x 2
##         hsPlus     n
##          <chr> <int>
## 1          HS+ 77553
## 2 Less than HS 21823
## 3         <NA>  1422

Exercise