#### Setup ####

# Load the tidyverse package
## Usually it's a good idea to have this at the top of your code, so you and your 
## collaborators know which packages are needed to run the code
library(tidyverse)

# Set your working directory
## This can be used as an alternative to using "R projects".
## Note: the "~" symbol means "home directory", which varies depending on your
## username and operating system (Mac, Windows or Linux).
## You can use the `getwd()` command to see what your current working directory is.
## The relative paths used further down (e.g. "data/portal_data_joined.csv")
## are resolved against the directory set here.
setwd("~/Course_Materials/Day1PM-2_R_Data_Analysis")

# Clean workspace - this removes all the objects from the current environment
## Usually you don't have to do this; we are doing it to start this lesson clean.
## Only objects are removed: packages attached with `library()` stay available,
## which is why the tidyverse functions can still be used afterwards.
rm(list = ls())

# Create a directory for the data
## Only create it when it is missing, so that re-running the script does not
## issue an "already exists" warning. Any warning you still see from
## `dir.create()` now points at a genuine problem (e.g. no write permission).
if (!dir.exists("data")) {
  dir.create("data")
}

# Download the data provided by your collaborator
## The download is skipped when the file is already on disk, so the script can
## be re-run without an internet connection and without fetching the same file
## again. Delete "data/portal_data_joined.csv" to force a fresh download.
if (!file.exists("data/portal_data_joined.csv")) {
  download.file("https://ndownloader.figshare.com/files/2292169",
                destfile = "data/portal_data_joined.csv")
}

# Import the downloaded CSV file as a tibble
## `na = ""` tells `read_csv()` to treat only empty fields as missing values
surveys <- read_csv(
  file = "data/portal_data_joined.csv",
  na = ""
)

# The base R equivalent (returns a data.frame instead of a tibble) would be:
## read.csv("data/portal_data_joined.csv", na.strings = "", stringsAsFactors = FALSE)


#### Tidy data ####

# Keep only the records that are complete for the variables we analyse
## `drop_na()` removes every row that has a missing value in any of the
## listed columns: weight, hindfoot_length or sex
surveys_complete <- surveys %>%
  drop_na(weight, hindfoot_length, sex)

# Count the records per species_id and keep the species that have
# at least 50 complete records
species_counts <- surveys_complete %>%
  count(species_id) %>%
  filter(n >= 50)

# Restrict the complete records to those common species
## `semi_join()` keeps the rows of `surveys_complete` whose species_id also
## appears in `species_counts`, without adding any columns
surveys_complete <- surveys_complete %>%
  semi_join(species_counts, by = "species_id")


#### Analysis ####