## ---- include = FALSE----------------------------------------------------------------------------------------------------------------
# Default knitr options for every chunk in this document:
# show source and output together in one block, prefix output with "#>",
# and hide warnings and messages unless a chunk turns them back on.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  comment = "#>",
  collapse = TRUE
)
# One-time setup: uncomment the next line to install the tidyverse.
# install.packages("tidyverse")


## ----setup, include = FALSE----------------------------------------------------------------------------------------------------------
library(datascience.curriculum)


## ------------------------------------------------------------------------------------------------------------------------------------
# load the tidyverse packages used in the rest of this walkthrough
library(tidyverse)
# import the example data file and print it
# (replace the URL with a path to your own file to use different data)
readr::read_csv(
  file = "https://raw.githubusercontent.com/blaserlab/datascience.curriculum/main/inst/extdata/demo_iris_data.csv"
)


## ------------------------------------------------------------------------------------------------------------------------------------
# same import, but with the column types spelled out as a compact string,
# one letter per column: f = factor, d = double, c = character
readr::read_csv(
  file = "https://raw.githubusercontent.com/blaserlab/datascience.curriculum/main/inst/extdata/demo_iris_data.csv",
  col_types = "fddddc"
)


## ------------------------------------------------------------------------------------------------------------------------------------
# store the imported table in demo_data so later steps can reuse it
# alternative that reads the copy shipped inside the package:
# readr::read_csv(file = system.file("extdata/demo_iris_data.csv", package = "datascience.curriculum"))
demo_data <- readr::read_csv(
  file = "https://raw.githubusercontent.com/blaserlab/datascience.curriculum/main/inst/extdata/demo_iris_data.csv",
  col_types = "fddddc"
)
# typing the variable name prints its contents
demo_data


## ------------------------------------------------------------------------------------------------------------------------------------
# numbers only: a numeric vector
c(1, 2, 3)

# quoted text only: a character vector
c("a", "b", "c")

# still a character vector, because "1" is quoted
c("1", "b", "charlie")

# mixing types: every element of a vector must share one type,
# so the number 1 is coerced to the character "1"
c(1, "b", "charlie")


## ------------------------------------------------------------------------------------------------------------------------------------
# pull out a single column, choosing it by position
# head() limits the printout to the first few values;
# leave it off to see the whole column
demo_data[[2]] |> head()

# pull out a column using its name
demo_data[["Species"]] |> head()

# the $ shorthand also pulls out a column by name
demo_data$Species |> head()


## ------------------------------------------------------------------------------------------------------------------------------------
# build a list; unlike a vector, its elements may have different types
demo_list <- list(1, "b", "charlie")

demo_list

# give each element a name (optional)
demo_list <- setNames(demo_list, c("a_number", "a_letter", "a_name"))

# named elements can be pulled out with $
demo_list$a_number
demo_list$a_letter
demo_list$a_name


## ------------------------------------------------------------------------------------------------------------------------------------
# square brackets subset a dataframe as dataframe[row, column]
# leave the row slot empty to keep every row: columns two and three only
demo_data[, 2:3]

# leave the column slot empty to keep every column: rows one and two only
demo_data[1:2, ]

# a character vector selects columns by name
demo_data[c("Species", "observation")]

# single brackets on a list give back a smaller list
# here, its first two elements
demo_list[1:2]

# a name inside single brackets also gives back a list
demo_list["a_name"]


## ----warning=TRUE--------------------------------------------------------------------------------------------------------------------
# get the mean of the sepal length column
mean(demo_data$Sepal.Length)

# add two vectors of equal length; the addition is done element by element
c(1, 2) + c(2, 3)

# for vectors of unequal length, the shorter vector is "recycled" (repeated)
# until it matches the length of the longer one; here 1 is added to every element
1 + c(1, 2, 3)

# this gives a warning if the recycling doesn't work out evenly
# (the longer length, 3, is not a multiple of the shorter length, 2)
c(1, 2) + c(1, 2, 3)


## ------------------------------------------------------------------------------------------------------------------------------------
# comparison operators return a logical value, TRUE or FALSE
# greater than: TRUE
1 > 0

# equal to (note the double equals sign): TRUE
2 == 2

# not equal to: TRUE
1 != 3

# less than or equal to: FALSE
2 <= 1

# character strings can be compared as well: FALSE
"apple" == "banana"


## ------------------------------------------------------------------------------------------------------------------------------------
# three steps chained together with the pipe:
#   1. mutate() adds a column holding sepal length plus sepal width
#   2. group_by() groups the rows by species
#   3. summarise() takes the mean of the new column within each group
demo_data |>
  mutate(sepal_total = Sepal.Length + Sepal.Width) |>
  group_by(Species) |>
  summarise(mean_sepal_l_w = mean(sepal_total))


## ------------------------------------------------------------------------------------------------------------------------------------
# drop the observation column and keep everything else
demo_data |> select(-observation)
# keep just the Species and Sepal.Length columns
demo_data |> select(Species, Sepal.Length)


## ------------------------------------------------------------------------------------------------------------------------------------
# keep only the rows for which a logical test is TRUE
# here: rows whose Species is "setosa"
demo_data |> filter(Species == "setosa")

# %in% tests membership: rows whose Species is either of the two listed
demo_data |> filter(Species %in% c("setosa", "versicolor"))


## ------------------------------------------------------------------------------------------------------------------------------------
# reshape from wide to long: the four measurement columns collapse into
# a "name" column (the old column names) and a "value" column (the measurements)
long_data <- pivot_longer(
  demo_data,
  cols = c(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width),
  names_to = "name",
  values_to = "value"
)
long_data

# reshape from long to wide again, one column per entry in "name"
pivot_wider(long_data, names_from = name, values_from = value)


## ------------------------------------------------------------------------------------------------------------------------------------
# build two small summary tables, each with one row per species
# first: mean sepal length and width
sepal_data <- demo_data |>
  group_by(Species) |>
  summarise(
    mean_sepal_l = mean(Sepal.Length),
    mean_sepal_w = mean(Sepal.Width)
  )
sepal_data

# second: mean petal length and width
petal_data <- demo_data |>
  group_by(Species) |>
  summarise(
    mean_petal_l = mean(Petal.Length),
    mean_petal_w = mean(Petal.Width)
  )
petal_data

# now join them back together, matching rows on the shared Species column
# the key is named explicitly so the join does not depend on dplyr
# guessing it from whichever column names the two tables have in common
left_join(sepal_data, petal_data, by = "Species")