##### LOAD AND EXPLORE DATA #####

# Pick the CSV file in a file dialog; string columns are read as factors.
canada <- read.csv(file = file.choose(), stringsAsFactors = TRUE)

# Structure of the data frame first, then per-column summaries.
str(canada)
summary(canada)

##### PART 1: PRELIMINARY ANALYSIS #####

# Step 1: distribution of the vote share in the current elections.
# The plot has no title; the axes are fixed to 0-100 (x) and 0-120 (y).

hist(
  x = canada$current_vote,
  main = "",
  xlab = "Current vote (in %)",
  ylab = "Count",
  xlim = c(0, 100),
  ylim = c(0, 120),
  col = "deepskyblue4",
  border = "white"
)

# Step 2: compare two vote share distributions,
# for candidates with proposal power and without

# The formula names the columns directly and `data = canada` tells R
# where to find them, instead of repeating `canada$` on both sides.
boxplot(current_vote ~ p2p,
        data = canada,
        col = c("indianred3", "deepskyblue4"),
        xlab = "No power vs power",
        ylab = "Vote (in %)")

# Two-sample t-test of current vote share between the two p2p groups.
t.test(current_vote ~ p2p, data = canada)

# Step 3: do the same, but separately
# for government and opposition candidates

# which() keeps only rows where the condition is TRUE; plain logical
# indexing would also return an all-NA row for every missing
# `government` value.
government <- canada[which(canada$government == 1), ]
opposition <- canada[which(canada$government == 0), ]

# The two plots share axis labels, so each gets a title saying which
# group of candidates it shows.
boxplot(current_vote ~ p2p,
        data = government,
        col = c("indianred3", "deepskyblue4"),
        main = "Government candidates",
        xlab = "No power vs power",
        ylab = "Vote (in %)")

boxplot(current_vote ~ p2p,
        data = opposition,
        col = c("indianred3", "deepskyblue4"),
        main = "Opposition candidates",
        xlab = "No power vs power",
        ylab = "Vote (in %)")

# Two-sample t-tests of current vote share by p2p, one per group.
t.test(current_vote ~ p2p, data = government)
t.test(current_vote ~ p2p, data = opposition)

# Step 4: create a model that shows
# how the current vote share can be explained
# by proposal power and government membership

# we do not have the tools for this yet :(
# so we leave it for later and move on to another question

##### PART 2: EXPLORING RELATIONSHIPS #####

# Statement: the vote share in the current elections is correlated
# with the vote share in the previous elections

# Step 1: scatterplot, previous vote share on x, current on y

plot(
  x = canada$previous_vote,
  y = canada$current_vote,
  xlab = "Vote share (previous elections)",
  ylab = "Vote share (current elections)",
  col = "navy",
  pch = 16
)

# Step 2: Pearson's correlation coefficient, with its test

cor.test(x = canada$previous_vote, y = canada$current_vote)

# Step 3: run a linear model
# Current vote = beta0 + beta1 * Previous vote
#
# The model is fitted once and stored, so the printed coefficients,
# the line on the plot and the residuals all come from the same fit.

m1 <- lm(current_vote ~ previous_vote, data = canada)
m1

# add this line to the scatterplot

plot(canada$previous_vote, canada$current_vote,
     pch = 16, col = "navy",
     xlab = "Vote share (previous elections)",
     ylab = "Vote share (current elections)")

# abline() takes the intercept and slope from the fitted model, so no
# rounded coefficients have to be copied by hand.
abline(m1, col = "red")

# explore parts of the model

m1$fitted.values
m1$residuals

# squared residuals, and their sum
m1$residuals^2
sum(m1$residuals^2)