diamonds
diamonds
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
## # ... with 53,930 more rows
data("diamonds")
diamonds_ideal <- filter(diamonds, cut == "Ideal")
summarize(diamonds_ideal, avg_price = mean(price))
## # A tibble: 1 x 1
## avg_price
## <dbl>
## 1 3457.542
data("diamonds")
diamonds_cut <- group_by(diamonds, cut)
summarize(diamonds_cut, avg_price = mean(price))
## # A tibble: 5 x 2
## cut avg_price
## <ord> <dbl>
## 1 Fair 4358.758
## 2 Good 3928.864
## 3 Very Good 3981.760
## 4 Premium 4584.258
## 5 Ideal 3457.542
data("diamonds")
diamonds_i <- filter(diamonds, color == "I")
diamonds_i_group <- group_by(diamonds_i, cut)
summarize(
diamonds_i_group,
carat = mean(carat),
price = mean(price)
)
## # A tibble: 5 x 3
## cut carat price
## <ord> <dbl> <dbl>
## 1 Fair 1.1980571 4685.446
## 2 Good 1.0572222 5078.533
## 3 Very Good 1.0469518 5255.880
## 4 Premium 1.1449370 5946.181
## 5 Ideal 0.9130291 4451.970
| dplyr function() | Action performed |
|---|---|
| filter() | Subsets observations based on their values |
| arrange() | Changes the order of observations based on their values |
| select() | Selects a subset of columns from the data frame |
| rename() | Changes the name of columns in the data frame |
| mutate() | Creates new columns (or variables) |
| group_by() | Changes the unit of analysis from the complete dataset to individual groups |
| summarize() | Collapses the data frame to a smaller number of rows which summarize the larger data |
The holy grail: “For consistency, aim to use British (rather than American) spelling.” #rstats http://t.co/7qQSWIowcl. Colour is right!
— Hadley Wickham (@hadleywickham) November 27, 2013
The holy grail: “For consistency, aim to use British (rather than American) spelling.” #rstats http://t.co/7qQSWIowcl. Colour is right!
— Hadley Wickham (@hadleywickham) November 27, 2013
We have to make America great again!
— Donald J. Trump (@realDonaldTrump) November 7, 2012
summarize()
= summarise()
color()
= colour()
<-
)
# printed, but not saved
filter(diamonds, cut == "Ideal")
## # A tibble: 21,551 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Ideal J VS1 62.8 56 340 3.93 3.90 2.46
## 3 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 4 0.30 Ideal I SI2 62.0 54 348 4.31 4.34 2.68
## 5 0.33 Ideal I SI2 61.8 55 403 4.49 4.51 2.78
## 6 0.33 Ideal I SI2 61.2 56 403 4.49 4.50 2.75
## 7 0.33 Ideal J SI1 61.1 56 403 4.49 4.55 2.76
## 8 0.23 Ideal G VS1 61.9 54 404 3.93 3.95 2.44
## 9 0.32 Ideal I SI1 60.9 55 404 4.45 4.48 2.72
## 10 0.30 Ideal I SI2 61.0 59 405 4.30 4.33 2.63
## # ... with 21,541 more rows
<-
)
# saved, but not printed
diamonds_ideal <- filter(diamonds, cut == "Ideal")
<-
)
# saved and printed
(diamonds_ideal <- filter(diamonds, cut == "Ideal"))
## # A tibble: 21,551 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Ideal J VS1 62.8 56 340 3.93 3.90 2.46
## 3 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 4 0.30 Ideal I SI2 62.0 54 348 4.31 4.34 2.68
## 5 0.33 Ideal I SI2 61.8 55 403 4.49 4.51 2.78
## 6 0.33 Ideal I SI2 61.2 56 403 4.49 4.50 2.75
## 7 0.33 Ideal J SI1 61.1 56 403 4.49 4.55 2.76
## 8 0.23 Ideal G VS1 61.9 54 404 3.93 3.95 2.44
## 9 0.32 Ideal I SI1 60.9 55 404 4.45 4.48 2.72
## 10 0.30 Ideal I SI2 61.0 59 405 4.30 4.33 2.63
## # ... with 21,541 more rows
NA > 5
## [1] NA
10 == NA
## [1] NA
NA + 10
## [1] NA
na.rm
argument
df <- tibble(x = c(1, NA, 3))
filter(df, x > 1)
## # A tibble: 1 x 1
## x
## <dbl>
## 1 3
filter(df, is.na(x) | x > 1)
## # A tibble: 2 x 1
## x
## <dbl>
## 1 NA
## 2 3
df <- tibble(
x = c(1, 2, 3, 5, NA)
)
summarize(df, meanx = mean(x))
## # A tibble: 1 x 1
## meanx
## <dbl>
## 1 NA
summarize(df, meanx = mean(x, na.rm = TRUE))
## # A tibble: 1 x 1
## meanx
## <dbl>
## 1 2.75
%>%
)
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")
# Pipes - clear
delays <- flights %>%
group_by(dest) %>%
summarize(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
delays <- flights %>%
by_dest <- group_by(dest) %>%
delay <- summarize(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
delay <- filter(count > 20, dest != "HNL")
Error: bad assignment:
summarize(count = n(), dist = mean(distance, na.rm = TRUE), delay = mean(arr_delay,
na.rm = TRUE)) %>% delay <- filter(count > 20, dest != "HNL")
delays <- flights %>%
group_by(flights, dest) %>%
summarize(flights,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(flights, count > 20, dest != "HNL")
## Error in grouped_df_impl(data, unname(vars), drop): Column `flights` is unknown