## first, we need to install a few packages.
## to do this, first run install.packages()
## and put the package name in quotes
install.packages("Lahman")
install.packages("dplyr")
install.packages("ggplot2")
## With these libraries installed, we can now call them
library(Lahman)
library(dplyr)
library(ggplot2)
## looking at some data:
## the Lahmann library comes with several in-built
## datasets -- let's look at whats available:
data(package = "Lahman")
# lots of interesting stuff here.
# we'll work with the Batting data.
# let's create a copy of this called df --
# this will make it easier to quickly refer to the
# data later on:
df = Batting
# note: make sure to match the case!
# ok. We've got data!
# first question -- how much data do we have?
nrow(df)
# whoah. there's a ton! We have 102816 observations.
# what variables do we observe?
names(df)
# player code
# what year
# "stint"
# league ID (e.g. AL / NL etc.)
# Games
# At bats
# Runs
# Hits
# Doubles
# Triples
# Home Runs
# Runs Batted in
# etc.
# we can gather a bit more information using
?Batting
## looking at the data:
head(Batting)
# whoa. This goes back to 1871!
## ok. let's browse the data a bit --
# in general, we'll use the summarize() command
# to conduct calculations. this works for
# many functions of the data:
# what is the average number of home runs?
df %>% summarize(mn.hr = mean(HR, na.rm = TRUE))
# what is the median number of home runs?
df %>% summarize(med.hr = median(HR, na.rm = TRUE))
# ha.
# what is the max number of home runs?
df %>% summarize(max.hr = max(HR, na.rm = TRUE))
# whoa.
# who are these HR-hitting phenoms?
# we can sort the data using the arrange() command:
df %>% arrange(desc(HR)) %>% head(.)
# we can look at a little more data by passing
# another attribute to the head() command:
df %>% arrange(desc(HR)) %>% head(.,20)
# does anyone remember 1998?
# why was 1998 so special? I'll tell you!
# let's calculate the maximal number of home runs
# hit in each year in the data.
# to do this, we can think about splitting our data
# up by year -- collecting all the 1871 observations
# together,
# and all the 1872 observations, etc.
# next, we're going to run the same function on
# each of the groups -- that function will ask what
# the max of the HR variable is.
# example:
df %>% group_by(yearID) %>% summarize(max.hr = max(HR, na.rm = TRUE))
# we can also write it like this:
df %>%
group_by(yearID) %>%
summarize(max.hr = max(HR, na.rm = TRUE))
## ok. That's sort of intersting... can
# we look at these values on a plot?
df %>%
group_by(yearID) %>%
summarize(max.hr = max(HR, na.rm = TRUE)) %>%
ggplot(.,aes(x = yearID, y = max.hr)) + geom_point() +
theme_bw()
# if I want to save this plot (insert your own file path here)
# ggsave("C:/users/mpatters/desktop/HRs_by_year.png")
# what's interesting about this plot?
# 1920 is crazy!
## ok - back to the question about 1998
# let's highlight this point a bit:
df %>%
group_by(yearID) %>%
summarize(max.hr = max(HR, na.rm = TRUE)) %>%
ggplot(.,aes(x = yearID, y = max.hr)) + geom_point() +
theme_bw() + geom_vline(aes(xintercept = 1998), col = "red")
# Let's look at what the record books were like
# in 1998 (note: MP was 13 years old):
df %>% filter(yearID < 1998) %>%
select(playerID,yearID,HR) %>%
arrange(desc(HR)) %>%
head(.)
# the HR record had been the same since 1961
# (Roger Maris hit 61), and before that, it was Babe Ruth
# in 1927 with 60!
#
# There was speculation about whether anyone would
# ever pass roger maris...
#
# but let's look at the data from 1998:
df %>% filter(yearID == 1998) %>%
select(playerID,yearID,HR) %>%
arrange(desc(HR)) %>%
head(.)
# what was crazy?
# what about 1999?
df %>% filter(yearID == 1999) %>%
select(playerID,yearID,HR) %>%
arrange(desc(HR)) %>%
head(.)
# what about 2000?
df %>% filter(yearID == 2001) %>%
select(playerID,yearID,HR) %>%
arrange(desc(HR)) %>%
head(.)
# what about 2001?
df %>% filter(yearID == 2001) %>%
select(playerID,yearID,HR) %>%
arrange(desc(HR)) %>%
head(.)
# what about 2002?
df %>% filter(yearID == 2002) %>%
select(playerID,yearID,HR) %>%
arrange(desc(HR)) %>%
head(.)
# Poor Sammy :(
## ok. A little more on HRs:
# we can modify the code above to look at the
# total number of homeruns hit by year:
df %>%
group_by(yearID) %>%
summarize(tot.hr = sum(HR, na.rm = TRUE)) %>%
ggplot(.,aes(x = yearID, y = tot.hr)) + geom_point() +
theme_bw()
# We can also look at the number of strikeouts:
df %>%
group_by(yearID) %>%
summarize(tot.so = sum(SO, na.rm = TRUE)) %>%
ggplot(.,aes(x = yearID, y = tot.so)) + geom_point() +
theme_bw()
# woah. (what's going on in 1900?? -- I have no idea)
# what about walks?
df %>%
group_by(yearID) %>%
summarize(tot.bb = sum(BB, na.rm = TRUE)) %>%
ggplot(.,aes(x = yearID, y = tot.bb)) + geom_point() +
theme_bw()
## all of that is sort of interesting, but I'd
# like to know about how the likelihoods of
# walks and strikes (maybe hits too?) are changing
# over time.
# to do this, let's do this:
# for each player, I'll create a new variable
# which is ABs + BBs:
df = df %>%
mutate(PA = BB + AB)
df %>% head(.)
# this isn't perfect, but it'll give us a
# starting point.
# next, let's find the total BBs divided by plate
# appearances:
df = df %>%
mutate(BB.prob = BB/PA)
# let's do the same for strike outs and hits:
df = df %>%
mutate(SO.prob = SO/PA) %>%
mutate(H.prob = H/PA)
# ok. Some of the players only have a small
# number of plate appearances.. we'll remove these
# in our analysis using filter()...
df %>% filter(PA > 120) %>% head(.)
# ok. This won't be perfect, but I'd like to
# look at how the median BB, SO, and H probabilities
# are changing over time.
df %>% filter(PA > 120) %>%
group_by(yearID) %>%
summarize(med.BB.prob = median(BB.prob, na.rm = TRUE),
med.SO.prob = median(SO.prob,na.rm = TRUE),
med.H.prob = median(H.prob,na.rm = TRUE)) %>%
ggplot(.,aes(x = yearID, y = med.BB.prob)) + geom_point(col = "blue") +
theme_bw() + geom_point(aes(x = yearID, y = med.SO.prob), col = "red") +
geom_point(aes(x = yearID, y = med.H.prob), col = "purple")
#huh. interesting.
### Ok -- switching datasets a little bit --
# let's look at Fielding.
# set df to be the Fielding data:
df = Fielding
# look at the beginning of the data:
head(df)
# interesting that we have position information!
# another interesting thing -- we have repeated entries
# in the data. hmm.. it looks like players have seperate
# entries when they play multiple positions.
# is this common?
# first, let's count how many positions each player had
# in a given year:
df %>%
group_by(yearID,playerID) %>%
summarize(tot.pos = length(unique(POS))) %>%
head(.)
# woah. 5 positions for ansonca01!
# let's add a variable which indicates whether the number
# of positions is more than 1:
df %>%
group_by(yearID,playerID) %>%
summarize(tot.pos = length(unique(POS)),
mult.pos = as.numeric(tot.pos > 1)) %>%
head(.)
# ok. Now, we can figure out what the average number of
# players who play multiple positions in each year is by taking
# the average of our mult.pos variable in each year:
df %>%
group_by(yearID,playerID) %>%
summarize(tot.pos = length(unique(POS)),
mult.pos = as.numeric(tot.pos > 1)) %>%
group_by(yearID) %>%
summarize(share.mult = mean(mult.pos, na.rm = TRUE)) %>%
ggplot(.,aes(x = yearID, y = share.mult)) + geom_point()
#ggsave("C:/Users/mpatters/desktop/multipos_by_year.png")
# more than half of the players played multiple positions circa 1880!
# fewer than 1 in 5 do now!