require(dplyr)
require(tidyr)
require(tibble)
require(stringr)

## Note that you will need to download the Senate vote data on your own as those files are very big
## In particular you will need to download the ZIP files from the "Formal Preferences" folder
## Extract the CSVs and place them into your working directory (edit code below to direct R accordingly)

BaseDir <- "ENTER WORKING DIRECTORY HERE"
States <- c("New South Wales","Victoria","Queensland","Western Australia","South Australia","Tasmania","Northern Territory","Australian Capital Territory")
StateAbs <- c("NSW","VIC","QLD","WA","SA","TAS","NT","ACT")

BallotFiles <- list.files(paste0(BaseDir))
Party1 <- c("Labor.Country.Labor","Labor","Australian.Labor.Party","A.L.P.","Australian.Labor.Party..Northern.Territory..Branch")
Party2 <- c("Liberal...Nationals","Liberal.The.Nationals","LIBERAL.THE.NATIONALS","Liberal.National.Party.of.Queensland","Liberal","Country.Liberals..NT.")
PartyAb1 <- "ALP"
PartyAb2 <- "LNC"
PrefBallotData <- data.frame(StateAb=character(),
                             DivisionID=integer(),
                             DivisionNm=character(),
                             PollingPlaceID=integer(),
                             PollingPlaceNm=character(),
                             Party1=character(),
                             Party2=character(),
                             RankParty1=integer(),
                             RankParty2=integer(),
                             Preference=character())

## Extracts highest voter ranking for each party listed (smallest number)
## This includes below-the-line votes
## So e.g. if a voter ranked candidate 2 of Party1 second, this is recorded as ranking Party1 second
for(File in BallotFiles){
  AllBallots <- read.csv(paste0(BaseDir,"Research/Federal-2019/BallotData/",File))
  
  Parties <- substr(colnames(AllBallots),3,nchar(colnames(AllBallots)))[-(1:6)]
  Group1 <- strsplit(colnames(AllBallots)[c(rep(FALSE,6),Parties %in% Party1)],".",fixed=TRUE)[[1]][1]
  Group2 <- strsplit(colnames(AllBallots)[c(rep(FALSE,6),Parties %in% Party2)],".",fixed=TRUE)[[1]][1]
  
  StatePrefData <- data.frame(StateAb=strsplit(strsplit(File,".",fixed=TRUE)[[1]][1],split="-",fixed=TRUE)[[1]][5],
                              DivisionID=NA,
                              DivisionNm=AllBallots$Division,
                              PollingPlaceID=NA,
                              PollingPlaceNm=AllBallots$Vote.Collection.Point.Name,
                              Party1=PartyAb1,
                              Party2=PartyAb2,
                              RankParty1=do.call(pmin,args=c(AllBallots[,c(rep(FALSE,6),substr(colnames(AllBallots)[-(1:6)],1,nchar(Group1)+1) == paste0(Group1,"."))],list(na.rm=TRUE))),
                              RankParty2=do.call(pmin,args=c(AllBallots[,c(rep(FALSE,6),substr(colnames(AllBallots)[-(1:6)],1,nchar(Group2)+1) == paste0(Group2,"."))],list(na.rm=TRUE))),
                              Preference=NA)
  
  PrefBallotData <- rbind(PrefBallotData,StatePrefData)
  
  cat(paste("Preference analysis complete for",States[StateAbs==strsplit(strsplit(File,".",fixed=TRUE)[[1]][1],"-",fixed=TRUE)[[1]][5]],"\n"))
  rm(list=c("AllBallots","StatePrefData","Parties","Group1","Group2"))
  invisible(gc())
}

PrefBallotData$DivisionID <- DivSenData2019$DivisionID[match(PrefBallotData$DivisionNm,DivSenData2019$DivisionNm)]
PrefBallotData$Preference[replace_na(PrefBallotData$RankParty1,999) < replace_na(PrefBallotData$RankParty2,999)] <- PrefBallotData$Party1[replace_na(PrefBallotData$RankParty1,999) < replace_na(PrefBallotData$RankParty2,999)]
PrefBallotData$Preference[replace_na(PrefBallotData$RankParty1,999) > replace_na(PrefBallotData$RankParty2,999)] <- PrefBallotData$Party2[replace_na(PrefBallotData$RankParty1,999) > replace_na(PrefBallotData$RankParty2,999)]
PrefBallotData$Preference[replace_na(PrefBallotData$RankParty1,999) == replace_na(PrefBallotData$RankParty2,999)] <- "EXH"

## Aggregates 2pp vote count by state
State2ppCount <- table(PrefBallotData$StateAb,PrefBallotData$Preference)
State2ppLNC <- State2ppCount[,3]/(State2ppCount[,1]+State2ppCount[,3])

write.csv(State2ppLNC,"Federal-2019-Senate-2pp-State.csv",row.names=FALSE)