## First, we need to install a few packages.
## To do this, run install.packages() with the package name in quotes.
## Only packages that are not installed yet are downloaded, so re-running
## this script does not reinstall everything each time.
for (pkg in c("Lahman", "dplyr", "ggplot2")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

## With these libraries installed, we can now load them:
library(Lahman)
library(dplyr)
library(ggplot2)

## looking at some data:
## the Lahman library comes with several built-in
## datasets -- let's look at what's available:
data(package = "Lahman")

# lots of interesting stuff here.
# we'll work with the Batting data.
# let's create a copy of this called df --
# this will make it easier to quickly refer to the
# data later on:
df <- Batting # note: make sure to match the case!

# ok. We've got data!
# first question -- how much data do we have?
nrow(df)
# whoa. there's a ton! (102816 observations when this was written;
# the exact count depends on the installed version of Lahman.)

# what variables do we observe?
names(df)
# player code
# what year
# "stint"
# league ID (e.g. AL / NL etc.)
# Games
# At bats
# Runs
# Hits
# Doubles
# Triples
# Home Runs
# Runs Batted in
# etc.

# we can gather a bit more information using the help page:
?Batting

## looking at the data:
head(Batting)
# whoa. This goes back to 1871!

## ok. let's browse the data a bit --
# in general, we'll use the summarize() command
# to conduct calculations. this works for
# many functions of the data:

# what is the average number of home runs?
df %>% summarize(mn.hr = mean(HR, na.rm = TRUE))

# what is the median number of home runs?
df %>% summarize(med.hr = median(HR, na.rm = TRUE))
# ha.

# what is the max number of home runs?
df %>% summarize(max.hr = max(HR, na.rm = TRUE))
# whoa.

# who are these HR-hitting phenoms?
# we can sort the data using the arrange() command
# (desc() puts the largest values first):
df %>%
  arrange(desc(HR)) %>%
  head()

# we can look at a little more data by passing
# another argument (the number of rows) to the head() command:
df %>%
  arrange(desc(HR)) %>%
  head(20)

# does anyone remember 1998?
# why was 1998 so special? I'll tell you!

# let's calculate the maximal number of home runs
# hit in each year in the data.
# to do this, we can think about splitting our data
# up by year -- collecting all the 1871 observations
# together, and all the 1872 observations, etc.
# next, we're going to run the same function on
# each of the groups -- that function will ask what
# the max of the HR variable is.
# example (everything on one line):
df %>% group_by(yearID) %>% summarize(max.hr = max(HR, na.rm = TRUE))

# we can also write it like this (one step per line -- same result):
df %>%
  group_by(yearID) %>%
  summarize(max.hr = max(HR, na.rm = TRUE))

## ok. That's sort of interesting... can
# we look at these values on a plot?
df %>%
  group_by(yearID) %>%
  summarize(max.hr = max(HR, na.rm = TRUE)) %>%
  ggplot(aes(x = yearID, y = max.hr)) +
  geom_point() +
  theme_bw()

# if I want to save this plot (insert your own file path here)
# ggsave("C:/users/mpatters/desktop/HRs_by_year.png")

# what's interesting about this plot?
# 1920 is crazy!

## ok - back to the question about 1998
# let's highlight this point a bit.
# xintercept is a fixed value, not a column of the data, so it is passed
# directly rather than inside aes(); inside aes() the same line would be
# drawn once for every row of the summarized data.
df %>%
  group_by(yearID) %>%
  summarize(max.hr = max(HR, na.rm = TRUE)) %>%
  ggplot(aes(x = yearID, y = max.hr)) +
  geom_point() +
  theme_bw() +
  geom_vline(xintercept = 1998, col = "red")

# Let's look at what the record books were like
# in 1998 (note: MP was 13 years old):
df %>%
  filter(yearID < 1998) %>%
  select(playerID, yearID, HR) %>%
  arrange(desc(HR)) %>%
  head()

# the HR record had been the same since 1961
# (Roger Maris hit 61), and before that, it was Babe Ruth
# in 1927 with 60!
#
# There was speculation about whether anyone would
# ever pass Roger Maris...
#
# but let's look at the data from 1998:
df %>%
  filter(yearID == 1998) %>%
  select(playerID, yearID, HR) %>%
  arrange(desc(HR)) %>%
  head()
# what was crazy?

# what about 1999?
df %>%
  filter(yearID == 1999) %>%
  select(playerID, yearID, HR) %>%
  arrange(desc(HR)) %>%
  head()

# what about 2000?
df %>%
  filter(yearID == 2000) %>%
  select(playerID, yearID, HR) %>%
  arrange(desc(HR)) %>%
  head()

# what about 2001?
df %>%
  filter(yearID == 2001) %>%
  select(playerID, yearID, HR) %>%
  arrange(desc(HR)) %>%
  head()

# what about 2002?
df %>%
  filter(yearID == 2002) %>%
  select(playerID, yearID, HR) %>%
  arrange(desc(HR)) %>%
  head()
# Poor Sammy :(

## ok. A little more on HRs:
# we can modify the code above to look at the
# total number of home runs hit by year:
df %>%
  group_by(yearID) %>%
  summarize(tot.hr = sum(HR, na.rm = TRUE)) %>%
  ggplot(aes(x = yearID, y = tot.hr)) +
  geom_point() +
  theme_bw()

# We can also look at the number of strikeouts:
df %>%
  group_by(yearID) %>%
  summarize(tot.so = sum(SO, na.rm = TRUE)) %>%
  ggplot(aes(x = yearID, y = tot.so)) +
  geom_point() +
  theme_bw()
# whoa. (what's going on in 1900?? -- I have no idea)
# NOTE(review): presumably SO is missing (NA) for some seasons around then
# and na.rm = TRUE silently drops those records -- confirm by counting
# is.na(SO) by year.

# what about walks?
df %>%
  group_by(yearID) %>%
  summarize(tot.bb = sum(BB, na.rm = TRUE)) %>%
  ggplot(aes(x = yearID, y = tot.bb)) +
  geom_point() +
  theme_bw()

## all of that is sort of interesting, but I'd
# like to know about how the likelihoods of
# walks and strikeouts (maybe hits too?) are changing
# over time.

# to do this, for each player-row, I'll create a new variable
# PA ("plate appearances") which is ABs + BBs:
df <- df %>% mutate(PA = BB + AB)
df %>% head()
# this isn't perfect, but it'll give us a
# starting point.

# next, let's find the total BBs divided by plate
# appearances:
df <- df %>% mutate(BB.prob = BB / PA)

# let's do the same for strikeouts and hits.
# (rows with PA == 0 produce NaN here, because 0 / 0 is NaN in R;
# the PA filter below removes them.)
df <- df %>%
  mutate(
    SO.prob = SO / PA,
    H.prob = H / PA
  )

# ok. Some of the players only have a small
# number of plate appearances.. we'll remove these
# in our analysis using filter()...
df %>%
  filter(PA > 120) %>%
  head()

# ok. This won't be perfect, but I'd like to
# look at how the median BB, SO, and H probabilities
# are changing over time.
# One point per year and per statistic, all on the same axes:
#   blue = walks (BB), red = strikeouts (SO), purple = hits (H).
# The y axis is labelled explicitly; otherwise it would inherit the name
# "med.BB.prob" from the first layer even though three series are shown.
df %>%
  filter(PA > 120) %>%
  group_by(yearID) %>%
  summarize(
    med.BB.prob = median(BB.prob, na.rm = TRUE),
    med.SO.prob = median(SO.prob, na.rm = TRUE),
    med.H.prob = median(H.prob, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = yearID, y = med.BB.prob)) +
  geom_point(col = "blue") +
  theme_bw() +
  geom_point(aes(x = yearID, y = med.SO.prob), col = "red") +
  geom_point(aes(x = yearID, y = med.H.prob), col = "purple") +
  labs(x = "yearID", y = "median share of plate appearances")
# huh. interesting.

### Ok -- switching datasets a little bit --
# let's look at Fielding.

# set df to be the Fielding data
# (this replaces the Batting copy we were using above):
df <- Fielding

# look at the beginning of the data:
head(df)

# interesting that we have position information!
# another interesting thing -- we have repeated entries
# in the data. hmm.. it looks like players have separate
# entries when they play multiple positions.

# is this common?
# first, let's count how many distinct positions each player had
# in a given year:
df %>%
  group_by(yearID, playerID) %>%
  summarize(tot.pos = n_distinct(POS)) %>%
  head()
# whoa. 5 positions for ansonca01!

# let's add a variable which indicates whether the number
# of positions is more than 1 (1 = yes, 0 = no):
df %>%
  group_by(yearID, playerID) %>%
  summarize(
    tot.pos = n_distinct(POS),
    mult.pos = as.numeric(tot.pos > 1)
  ) %>%
  head()

# ok. Now, we can figure out what share of players
# play multiple positions in each year by taking
# the average of our 0/1 mult.pos variable in each year:
df %>%
  group_by(yearID, playerID) %>%
  summarize(
    tot.pos = n_distinct(POS),
    mult.pos = as.numeric(tot.pos > 1)
  ) %>%
  group_by(yearID) %>%
  summarize(share.mult = mean(mult.pos, na.rm = TRUE)) %>%
  ggplot(aes(x = yearID, y = share.mult)) +
  geom_point()

# ggsave("C:/Users/mpatters/desktop/multipos_by_year.png")

# more than half of the players played multiple positions circa 1880!
# fewer than 1 in 5 do now!