############## # INTRO TO R # ############## # This tutorial is intended to introduce you to some of the basic functions of R as used by data scientists. # It is intended to show you the sorts of things that R can do as a tool. # It will not teach you all that you need to know in order to do this kind of work. # If you want to learn more, there is a fantastic online book called _R for Data Science_ at # http://r4ds.had.co.nz/index.html # We will start with some basic concepts that are common to all programming languages, including R. # Some basic terminology and concepts #------------------------------------- # Libraries: # Libraries are code written by other people that can be imported into your code and reused. # R also uses the term 'package' to refer to the same idea. # We will be using the 'tidyverse' library and syntax, so we need to import it. This tells R that we # are going to use it and it loads the functions of the library. library(tidyverse) # You should have already installed tidyverse, but if Rstudio complains that tidyverse isn't installed then you need to run: # install.packages('tidyverse') # Variables: # Variables are like containers that store things so that you can reuse them again. x = 1 # stores the number 1 in the variable x. We can then do operations on x which change the value. # This is not bad math - it adds one to the variable x = x + 1 # If we look at x, we can see that it now stores the number 2 print(x) # Functions: # Functions are little pieces of code that take variables as input and produce an output. add_one = function(n) { n = n + 1 return(n) } multiply = function(x, y) { z = x * y return(z) } x = add_one(x) print(x) # Print is also a function! ################## # Importing Data # ################## # R has a number of tools for importing different data types. # We'll focus on the most common - importing CSVs # For this tutorial, we'll be using a prepared dataset of flights from NYC in 2013. This is how to load it: # Again, run this if it isn't installed yet: # install.packages("nycflights13") library(nycflights13) ################ # Loading Data # ################ # The dataset includes four dataframes. Dataframes are sort of like R's version of spreadsheets. head(flights) head(airlines) head(weather) head(planes) # We will be using functions from dplyr. These make R much easier to understand. # There are 6 basic functions: filter, select, arrange, mutate, summarize, and group_by. # For each function, they take in a dataframe as the first argument (together with other arguments) # and provide a dataframe as output. ## filter: keep rows that match certain criteria # get the records for United Airlines filter(flights, carrier=="UA") # Or we could store this result in a new variable temp_df = filter(flights, carrier=="UA") head(temp_df) # You can use pipe for OR condition or & for AND filter(flights, carrier=="AA" | carrier=="UA") filter(flights, carrier=="AA" & month>5) ## select: keep columns that match certain criteria # Commas between a list of columns to keep select(flights, carrier, dep_delay, arr_delay) # Colons to keep everything between two columns select(flights, carrier:dep_delay, arr_delay) # Or remove columns with '-' select(flights, -(carrier:dep_delay)) # You can even rename columns select(flights, carrier, minutes_late=arr_delay) ## arrange: Put dataframes in order # default is ascending (smallest first); this is the flights that left earliest arrange(flights, dep_delay) # We can also see those that left latest arrange(flights, desc(dep_delay)) # Or order by multiple columns arrange(weather, desc(wind_dir), temp) ## mutate: create new columns # Add a column to indicate high wind tmp = select(weather, year:hour, wind_speed) tmp = mutate(tmp, high_wind = wind_speed > 30, low_wind = wind_speed < 5) # And then filter to just those with high wind filter(tmp, high_wind == TRUE) ## summarize: Create summary data from a dataframe (we'll come back to this with group_by) # Get the mean arrival and departure delays summarize(flights, mean_dep=mean(dep_delay, na.rm=T), mean_arr=mean(arr_delay, na.rm=T)) # Get the standard deviations of each summarize(flights, sd_dep=sd(dep_delay, na.rm=T), sd_arr=sd(arr_delay, na.rm=T)) ######################### ### Combining functions # ######################### # There are a few different ways to combine functions together. # First, saving intermediate dataframes tmp = filter(flights, arr_delay > 5) tmp = select(tmp, year:arr_delay) head(tmp) # Next, enclosing them select(filter(flights, arr_delay > 5), year:arr_delay) # Finally, dplyr has a piping option ## With piping, the output from one function becomes the first argument passed # to the next function latearr = flights %>% filter(arr_delay > 5) %>% select(year:arr_delay) # Compare am and pm flights for delays am_stats = flights %>% filter(sched_arr_time < 1200) %>% summarize(mn_arr_delay = mean(arr_delay, na.rm=T), mn_dep_delay = mean(dep_delay, na.rm=T), p_delay = mean(as.integer(dep_delay > 5), na.rm=T)) pm_stats = flights %>% filter(sched_arr_time >= 1200) %>% summarize(mn_arr_delay = mean(arr_delay, na.rm=T), mn_dep_delay = mean(dep_delay, na.rm=T), p_delay = mean(as.integer(dep_delay > 5), na.rm=T)) am_stats pm_stats am_stats - pm_stats ## group_by: puts rows into groups so that they can be summarized flights %>% group_by(carrier) %>% tally() %>% # tally counts how many are in each group and puts the count in a column called 'n' arrange(desc(n)) # and this sorts by the n column # The summarize function becomes particularly useful with group_by planes %>% group_by(manufacturer) %>% summarize(mn_age = mean(2017 - year, na.rm=T), mn_size=mean(seats, na.rm=T)) %>% arrange(desc(mn_age)) # Weather by airport weather %>% group_by(origin) %>% summarize(max_gust = max(wind_gust, na.rm=T), mn_wind = mean(wind_speed, na.rm=T), mn_tmp = mean(temp, na.rm=T), min_vis = min(visib, na.rm=T), max_vix = max(visib, na.rm=T), mn_vis = mean(visib, na.rm=T)) # Visualizations # We can take the outputs and pass them to another library, ggplot2, which makes visualizations library(ggplot2) theme_set(theme_minimal()) # ggplot works based on this template: # ggplot(data = ) + # (mapping = aes()) + # (mapping = aes()) # Scatterplot flights %>% sample_n(3000) %>% # Random sample to make it faster ggplot() + geom_point(aes(x=dep_delay, y=arr_delay)) # With colors by category flights %>% sample_n(3000) %>% ggplot() + geom_point(aes(x=dep_delay, y=arr_delay, color=carrier)) # You can also layer additional plots, like the linear fit flights %>% sample_n(3000) %>% ggplot() + geom_point(aes(x=dep_delay, y=arr_delay, color=carrier)) + geom_smooth(mapping=aes(x=dep_delay, y=arr_delay), method = 'lm') # Note: If you'll be reusing the same aesthetics over and over, # you can put them in the ggplot() call, like so: flights %>% sample_n(3000) %>% ggplot(mapping = aes(x=dep_delay, y=arr_delay)) + geom_point(mapping=aes(color=carrier)) + # The color only applies here, so it has to be here geom_smooth( method = 'lm') # Histograms and density plots flights %>% ggplot(mapping = aes(x=dep_time)) + geom_histogram(fill='blue', alpha=.7) + xlab('Departure time') + # Label for x-axis ggtitle('Histogram of departure times') # And a density plot flights %>% ggplot(mapping = aes(x=dep_time)) + geom_density(fill='blue', alpha=.7, bw = 50) # When plotting skewed data you can log the axis flights %>% ggplot(mapping = aes(x=dep_delay)) + geom_histogram(fill='orange', alpha=.7) + scale_y_log10(breaks=c(10,100,1000,10000)) # Faceting lets you look at the same plot, with data broken up by group flights %>% ggplot(mapping = aes(x=dep_time)) + geom_histogram(fill='blue', alpha=.7) + xlab('Departure time') + # Label for x-axis ggtitle('Histogram of departure times') + facet_wrap(~ carrier) # Joyplots / ridge plots are another approach # (to try this one run install.packages('ggridges') and library(ggridges)) library(ggridges) flights %>% ggplot(mapping = aes(x=dep_time)) + geom_density_ridges(mapping=aes(y=carrier,fill=carrier), alpha=.7) + xlab('Departure time') + # Label for x-axis ggtitle('Density plot of departure times by carrier') + theme(legend.position = '') # Get rid of the legend # Boxplots # Let's look at delays by time of day flights %>% ggplot(mapping=aes(y=dep_delay, x=as.factor(hour))) + geom_boxplot() # We can clean this up by breaking the day into chunks flights %>% mutate(time_of_day=cut(hour, breaks=c(-1,6,12,18,24), labels=c('early morning', 'morning', 'afternoon', 'night'))) %>% ggplot(mapping=aes(y=dep_delay, x=time_of_day))+ geom_boxplot() # We can also compare carriers flights %>% filter(carrier == 'UA' | carrier == 'AA') %>% mutate(time_of_day=cut(hour, breaks=c(-1,6,12,18,24), labels=c('early morning', 'morning', 'afternoon', 'night'))) %>% ggplot(mapping=aes(y=dep_delay, x=time_of_day, color=as.factor(carrier))) + geom_boxplot() ## Note about NA's and missing values ## # For a bunch of plots we got an error message about rows being removed. What's that about? # This shows us summary information about the dep_time column flights %>% select(dep_time) %>% summary() # It looks like the original data has a bunch of NA's - this is missing data. # It's often important to understand why data is missing and how that could # influence conlclusions. # For example, we might check if they were all from the same carrier flights %>% ggplot(aes(x=carrier, fill=is.na(dep_time))) + # The color is whether it's missing or not geom_bar(position = position_dodge()) # position_dodge() makes them show up next to each other # By default, R will remove NAs from graphs and many other functions. # In this case, NA's (I believe) represent canceled flights so that's probably OK, # depending on the question we are trying to ask # Regression # Again, there are entire courses on this topic so this is a very simple introduction # Let's try to predict whether a flight will be delayed by at least 1 hour # First, let's make a variable tmp = flights %>% mutate(long_delay = dep_delay > 60) # Let's make sure there are enough instances for prediction to be useful summary(tmp$long_delay) # For the lm package we have to pass in a formula that is used to fit a regression model. # They are in the format 'y ~ x1 + x2 + x3' # There is also a shorthand 'y ~ .' will try to predict y using all other variables # Note that glm isn't part of dplyr and so we can't pipe to it directly tmp2 = tmp %>% select(-time_hour, -dep_delay, -tailnum, -dest,-arr_delay) # Let's start with a really simple model. Summary gives a nice output # but you can save these fitted objects and do all sorts of things with them summary(glm(long_delay ~ distance, data=tmp, family=binomial)) # Longer flights are much less likely to be delayed # What if we add in the month summary(glm(long_delay ~ distance + as.factor(month), data=tmp, family=binomial)) # We can see that this is a better fit as well, since the AIC is lower (although there are lots of other tests) # Finally, let's add in carriers summary(glm(long_delay ~ distance + as.factor(month) + carrier, data=tmp, family=binomial))