# Where to find this file:
# https://raw.githubusercontent.com/KUBDatalab/beginning-R/main/files/20230925-intro-script.R


# Make data-directory
# dir.create("data")

# Additional directories/folders
# Not strictly necessary for this project.
# But good practice!
# dir.create("data_output")
# dir.create("fig_output")

# downloading data for later
# download.file("https://raw.githubusercontent.com/KUBDatalab/beginning-R/main/data/SAFI_clean.csv",
#              "data/SAFI_clean.csv", mode = "wb")

# Install packages
# Name of package should be in quotation marks
# install.packages is searching for the name
# install.packages("tidyverse")

# Note that lines 6, 11, 12, 15, 16 and 21 are commented out - we only need to
# run them once, so we turn them into comments after the first run. But
# consider keeping them in your script. That makes it easier for others to
# figure out what to do.
              
# Simple math in R:
# Running an expression on its own prints the result in the console.
3 + 5
12/7

# Creating objects using the assignment operator <-
# We store the value 1.0 in the object (sometimes called a variable)
# named on the left-hand side.
area_hectares <- 1.0

# We could use the = sign for assignment. Please do not: = is also used for
# other things (for example naming arguments in function calls), and keeping
# the use consistent makes the code easier to read.

# Naming objects? As short as possible, as long as necessary. Make sure
# the name describes what is stored in the object, but keep it short
# enough that you would like to type it more than once.
# Object names must never begin with a number, and must not contain white
# space or special characters like *, - or + that we use for math.

# We can do math on objects:
2.47 * area_hectares

# We can change the value of an object by assigning to it again:
area_hectares <- 2.5

# Using functions:
# NOTE: the next line fails on purpose, to show what an error looks like.
# Run this script line by line; running the whole file at once (for example
# with source()) stops at this line.
b <- sqrt(a) # note that the object a has not been defined. We will get an error!
round(3.14159)

# Getting help:
# A question mark in front of a function name opens its help page.
?round

# The help page shows that round() by default rounds to a whole number.
# We can adjust how many digits it rounds to with a second argument:
round(3.14159, 2)

# It is good practice to name your arguments:
round(3.14159, digits = 2)

# That makes it easier for other people to read and understand the code,
# without reading the documentation.

# Vectors!
# A series of values - all of the same type! No mix and match.
# c() combines the values given to it into one vector.

hh_members <- c(3, 7, 10, 6)

# Taking a look - typing the name of an object prints its content:
hh_members

# Vectors can contain text:
respondent_wall_type <- c("muddaub", "burntbricks", "sunbricks")

# Bonus info - muddaubed walls are a very sustainable building technique:
# https://en.wikipedia.org/wiki/Wattle_and_daub

# Getting information about objects.
# In this case the number of elements in the vectors:
length(hh_members)

length(respondent_wall_type)

# And we can get the type of data in a vector:
# "numeric" for hh_members, "character" for respondent_wall_type.

class(hh_members)
class(respondent_wall_type)

# And even more information using the str() function, which shows the
# type, the length and the first values in one line:
str(hh_members)
str(respondent_wall_type)

# Adding content to a vector:
# c() can combine an existing vector with new values. The new value is
# placed at the end, because it comes last in the call.
possesions <- c("bicycle", "radio", "television")

possesions <- c(possesions, "mobile phone")

# Looking at data/objects
# View() opens the object in a spreadsheet-style viewer.

View(possesions)

# Note the uppercase V!

# Adding content to the beginning of a vector - put the new value first:
possesions <- c("car", possesions)

# Another way of checking the datatype:
typeof(possesions)

# Trying to make a vector with mixed datatypes (numbers and text):

num_chars <- c(1, 2, 3, "a")

# Looking at the result - note the quotation marks around the numbers:
num_chars

# What type is this data?
typeof(num_chars)

# R can represent the number 1 as the character "1", but it cannot represent
# "a" as a number. Since all elements of a vector must have the same type, R
# stores every element as a character. That means we will not be able to do
# math on the vector!

# Subsetting vectors:
# Square brackets pick out elements by position. Positions start at 1.
respondent_wall_type[2]

# A vector of positions returns several elements, in the order requested:
respondent_wall_type[c(3, 2)]

# Positions can be repeated, so the result can be longer than the original:
more_respondent_wall_type <- respondent_wall_type[c(1, 2, 3, 2, 1, 3)]

more_respondent_wall_type

# Conditional subsetting

hh_members <- c(3, 7, 10, 6)
# Using a logical (boolean) vector to subset a vector.
# The result is only printed, not assigned back to hh_members, so the
# examples below still work on all four values.
hh_members[c(TRUE, FALSE, TRUE, TRUE)]

# The elements returned are the ones matching a TRUE value.

# A logical comparison returns a logical vector, with one TRUE or FALSE for
# each element, that can be used to subset a vector:
hh_members > 5

# That can be used for subsetting.
# | means "or": keep elements smaller than 4 or larger than 7.
hh_members[hh_members < 4 | hh_members > 7]

# & means "and": keep elements that are at least 4 and at most 7.
hh_members[hh_members >= 4 & hh_members <= 7]

# We can do the same on character vectors:

possesions <- c("car", "bicycle", "radio", "television", "mobile phone")
# Keep the elements that are equal to "car" or equal to "bicycle".
possesions[possesions == "car" | possesions == "bicycle"]

# %in% tests each element on the left for membership in the vector on the
# right. It returns one TRUE or FALSE per element of possesions:
possesions %in% c("car", "bicycle")

# The vector on the right may contain values that never occur on the left:
possesions %in% c("car", "bicycle", "motorcycle", "truck", "boat", "bus")

# Missing data
# NA marks a missing value.

rooms <- c(2, 1, 1, NA, 7)

# With a missing value in the vector, the result is also missing (NA):
mean(rooms)

max(rooms)

# Using the argument na.rm allows us to calculate the results anyway,
# by removing the missing values before the calculation:

mean(rooms, na.rm = TRUE)
# You will sometimes see T used instead of TRUE. Always write TRUE:
# T is an ordinary variable that can be overwritten, TRUE cannot.
max(rooms, na.rm = TRUE)

# is.na() returns TRUE for each missing value, and ! negates that,
# so this keeps only the values that are not missing:
rooms[!is.na(rooms)]
# TRUE counts as 1 when summed, so this counts the missing values:
sum(is.na(rooms))

# Load packages
# Now we do not need the quotation marks.
# After installation the package exists as an object in a package database,
# and can be loaded without quotation marks.
library(tidyverse)

# Read the data into an object.
# Note the difference between read_csv() and read_csv2(): read_csv() reads
# comma-separated files, while read_csv2() reads semicolon-separated files
# that use a comma as the decimal mark.
# The path is relative to the working directory, so the file must already
# have been downloaded to the data folder.
safi <- read_csv("data/SAFI_clean.csv")

# View the data frame (or tibble).
# The lowercase view() is provided by the tidyverse (the tibble package).
view(safi)

# Manipulating data
# The pipe %>% passes the result on its left as the first argument to the
# function on its right.
# select() keeps only the named columns:
safi %>% select(village, no_membrs, no_meals)


# A longer pipeline, one step per line:
# - select() with a minus sign drops the interview_date column
# - mutate() adds a new column, tot_meals, calculated from two others
# - filter() keeps only the rows where village is "Chirodzo"
# - summarise() collapses the rows to the mean and the standard deviation
safi %>% 
  select(-interview_date) %>%
  mutate(tot_meals = no_membrs * no_meals) %>% 
  filter(village == "Chirodzo") %>% 
  summarise(average_tot_meals = mean(tot_meals),
            std_dev_tot_meals = sd(tot_meals))

# group_by groups the data frame!
# Instead of filtering on one village, summarise() now returns one row
# per village.
safi %>% 
  select(-interview_date) %>%
  mutate(tot_meals = no_membrs * no_meals) %>% 
  group_by(village) %>% 
  summarise(average_tot_meals = mean(tot_meals),
            std_dev_tot_meals = sd(tot_meals))


# Can we do it separately?
# Yes - the same steps without the pipe, saving each intermediate result in
# its own object and passing it as the first argument to the next function.
# Note that only the mean is calculated here, not the standard deviation.
safi2 <- select(safi, -interview_date)
safi3 <- mutate(safi2, tot_meals = no_membrs * no_meals)
safi4 <- group_by(safi3, village)
summarise(safi4, average_tot_meals = mean(tot_meals))