# Cleaning data using R # based on https://tutorials.appliedepi.org/app/tutorial04_en # Peter van Heusden (SANBI) and Djibril Dione # October 2025 # environment set up ----- # install pacman in case it is not already installed install.packages("pacman") pacman::p_load( rio, here, skimr, tidyverse, janitor, epikit, lubridate ) # exploratory data analysis ---- # the data is in the data/raw folder in linelist_raw.csv linelist_raw <- import(here("data", "raw", "linelist_raw.csv")) # number of rows (observations) nrow(linelist_raw) # number of columns (variables) ncol(linelist_raw) # column names names(linelist_raw) # summarise data from sex column tabyl(linelist_raw, "sex") # summarise the whole dataset's numeric columns summary(linelist_raw) # what class ("type") is the linelist_raw object class(linelist_raw) # what class is the "onset date" column class(linelist_raw$`onset date`) # what class is the "date of report" column class(linelist_raw$`date of report`) # what are the values in the "fever" column # if the column name is a single name you can give it without quotes tabyl(linelist_raw, fever) # skim the data (from skimr package) skim(linelist_raw) # first version of a data cleaning pipe ---- # clean the column names and remove duplicate records linelist <- linelist_raw %>% clean_names() %>% distinct() # add a filter to select age > 10 linelist <- linelist_raw %>% clean_names() %>% distinct() %>% filter(age > 10) # view the data.frame View(linelist) # do some manual renaming of columns too linelist <- linelist_raw %>% clean_names() %>% distinct() %>% rename(date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) # selecting some columns for output ---- # select only some columns which columns to output linelist %>% select(case_id, age, sex, lat, lon) %>% filter(age > 65) # select everything except for epilink column linelist %>% select(-epilink) # select everything except for three columns # using the c() - compose - function linelist %>% select(-c(epilink, district_det, temp)) # the same thing done a different way linelist %>% select(-epilink, -district_det, -temp) # moving the data around using select() and everything() linelist %>% select(lat, lon, everything()) # how select work? use help help(select) # keep the columns from fever to vomit linelist %>% select(fever:vomit) # keep the columns from 1 through 5 linelist %>% select(1:5) # deduplication ---- # distinct() with empty parentheses will remove rows that are 100% identical # to each other linelist <- linelist_raw %>% clean_names() %>% rename(date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% select(-row_num) %>% distinct() # distinct(case_id) will consider a row identical if it has the same case_id linelist %>% distinct(case_id) # by default distinct(case_id) removes all other columns # distinct(case_id, .keep_all = TRUE) keeps those columns linelist %>% distinct(case_id, .keep_all = TRUE) # also see the Epi R handbook chapter on data cleaning: # https://epirhandbook.com/en/new_pages/cleaning.html # columns and their classes ---- # getting dates from "character" to "date" type # see what the dates look like - head() gives the first few examples head(linelist$date_onset) # it is in month/day/year format - use mdy on the date to test mdy(linelist$date_onset) # create a new column using mutate # in this example bmi is added using a formula # note that the result is not saved, so it does not change # the values in linelist # by default new columns appear on the far right of the data frame # so we use select to display just the ones of interest linelist %>% mutate(bmi = wt_kg / (ht_cm / 100) ^ 2) %>% select(case_id, wt_kg, ht_cm, bmi) # using a string function, toupper(), create an uppercase # column of district_res linelist %>% mutate(district_upper = toupper(district_res)) %>% select(case_id, district_res, district_upper) # use mdy() to convert dates from character to string linelist <- linelist_raw %>% clean_names() %>% distinct() %>% rename(date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) # show the class of the date_onset column class(linelist$date_onset) # illustration that this makes plotting an epidemic curve possible ggplot(data = linelist, mapping = aes(x = date_onset)) + geom_histogram() # one can also write multiple mutate operations in a single function # but keeping them separate can be easier to read # use mdy() to convert dates from character to string linelist <- linelist_raw %>% clean_names() %>% distinct() %>% rename(date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% mutate(date_onset = mdy(date_onset), date_report = mdy(date_report)) # cleaning up missing and incorrect data ---- # R expects missing values to be the special value NA # space and "Unknown" and other values are not NA, # e.g. in the "sex" column tabyl(linelist, "sex") # an illustration of changing all "Unknown" in the sex column to # NA linelist %>% mutate(sex = na_if(sex, "Unknown")) %>% tabyl(sex) # the same thing as part of our data cleaning pipeline linelist <- linelist_raw %>% clean_names() %>% distinct() %>% rename(date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% mutate(sex = na_if(sex, "Unknown")) # recode() lets us replace date e.g. data we know is incorrect # mutate(column_name = recode(column_name, "OLD value" = "NEW value")) # e.g. (in this example we don't save the new data) linelist %>% tabyl(hospital) # we see we have some misspellings - "Mitilary Hospital" # note that recode is different to rename in 2 ways: # 1. it is used within mutate() # 2. one writes "OLD value" = "NEW value" (in rename it is the other way round) linelist %>% mutate(hospital = recode(hospital, "Mitilary Hospital" = "Military Hospital", "Port Hopital" = "Port Hospital", "Port" = "Port Hospital")) %>% tabyl(hospital) # now as part of our data cleaning pipeline, saving results to a new column # hospital_clean (so that we don't lose the old data and can always see which # records had the mistakes before) linelist <- linelist_raw %>% clean_names() %>% distinct() %>% rename(date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% mutate(sex = na_if(sex, "Unknown")) %>% mutate(hospital_clean = recode(hospital, "Mitilary Hospital" = "Military Hospital", "Port Hopital" = "Port Hospital", "Port" = "Port Hospital")) # examine the changes - it creates a table summarising the different occurences # of the values linelist %>% tabyl(hospital, hospital_clean) # recording data according to logical statements # logical statements are ones about whether something is TRUE or FALSE # and use ifelse() # ifelse() has three parameters: # 1. test - the comparison to perform # 2. true value - the value to return if test evaluates to true # 3. false value - the value to return if test evaluates to false linelist %>% mutate(adult = ifelse(age >= 18, "yes", "no")) %>% select(case_id, age, adult) # you can also use named parameters linelist %>% mutate(adult = ifelse(test = age >= 18, yes = "yes", no = "no")) %>% select(case_id, age, adult) # note that in a logical test, == means "is equal to" # in this example & is a "logical operator" which means AND # learn more about logical operators in the section # "Review Logical Operators" which is in the Simple Recoding section # of Tutorial 4 linelist %>% mutate(sixty_year_old = ifelse(age == 60, "yes", "no")) %>% filter(age > 50 & age < 70) %>% select(case_id, age, sixty_year_old) # cleaning up weights # look at range of weight values summary(linelist$wt_kg) # some are negative - this must be a mistake linelist %>% mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% select(wt_kg) %>% summary() # adding it to our pipeline linelist <- linelist_raw %>% clean_names() %>% distinct() %>% rename(date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% mutate(sex = na_if(sex, "Unknown")) %>% mutate(hospital_clean = recode(hospital, "Mitilary Hospital" = "Military Hospital", "Port Hopital" = "Port Hospital", "Port" = "Port Hospital")) %>% mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) # more complex recoding with case_when # mutate(NEW_COLUMN = case_when( # LOGICAL TEST 1 ~ VALUE when LOGICAL TEST 1 is true, # LOGICAL TEST 2 ~ VALUE when LOGICAL TEST 2 is true # )) # if you use TRUE as a logical test, this is always TRUE, so # you can use TRUE to mean "for all other rows", as in the # example below linelist %>% mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate" )) %>% tabyl(case_def) # using it in a pipe where we take linelist and update its value - # using and update a value is an alternative to making one long # chained pipe but it *destroys* the values that were previously in # the variable - this is why our pipe has been going from linelist_raw # to linelist. # # this is given as an example because sometimes you are happy to # discard the previous values and replace them linelist <- linelist %>% mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate" )) # adding the case_when to our pipe linelist <- linelist_raw %>% clean_names() %>% distinct() %>% rename(date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% mutate(sex = na_if(sex, "Unknown")) %>% mutate(hospital_clean = recode(hospital, "Mitilary Hospital" = "Military Hospital", "Port Hopital" = "Port Hospital", "Port" = "Port Hospital")) %>% mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate" )) # another case_when example linelist %>% mutate(age_years = case_when( age_unit == "months" ~ age / 12, age_unit == "years" ~ age, TRUE ~ age )) # and adding this to the pipe: linelist <- linelist_raw %>% clean_names() %>% distinct() %>% rename(date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% mutate(sex = na_if(sex, "Unknown")) %>% mutate(hospital_clean = recode(hospital, "Mitilary Hospital" = "Military Hospital", "Port Hopital" = "Port Hospital", "Port" = "Port Hospital")) %>% mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate" )) %>% mutate(age_years = case_when( age_unit == "months" ~ age / 12, age_unit == "years" ~ age, TRUE ~ age )) # age binning with age_categories() from the epikit package linelist %>% mutate(age_group = age_categories(age_years, breakers = c(0, 5, 10, 15, 20, 30, 40, 50, 60, 70))) %>% select(case_id, age_years, age_group) %>% head(5) # another way of using age_categories() linelist %>% mutate(age_group = age_categories(age_years, upper = 70, by = 10)) %>% select(case_id, age_years, age_group) %>% head(5) # add age_categories linelist <- linelist_raw %>% clean_names() %>% distinct() %>% rename(date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% mutate(sex = na_if(sex, "Unknown")) %>% mutate(hospital_clean = recode(hospital, "Mitilary Hospital" = "Military Hospital", "Port Hopital" = "Port Hospital", "Port" = "Port Hospital")) %>% mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate" )) %>% mutate(age_years = case_when( age_unit == "months" ~ age / 12, age_unit == "years" ~ age, TRUE ~ age )) %>% mutate(age_group = age_categories(age_years, breakers = c(0, 5, 10, 15, 20, 30, 40, 50, 60, 70))) # prioritizing values ---- # sometimes we have missing data but we can compensate with # data from another column. # take the case where you have the location where as case was recorded # but not the home location of the person involved, you can use the # coalesce() function from the dplyr package (part of tidyverse) # to use the detection location where the home location is missing. # here is an example data frame # create the demo dataset case_locations <- data.frame( case = c("A", "B", "C", "D"), detection = c("Ankara", "Gaziantep", NA, "Ankara"), residence = c("Istanbul", "Gaziantep", "Istanbul", NA)) # print the demo dataset case_locations # to be able to make a map we need to have a location for every case, # so we can use the residence where that is available, otherwise we # can use the detection location # re-define case_locations data frame with extra column case_locations <- case_locations %>% mutate(map_location = coalesce(detection, residence)) # print data frame case_locations # the results will show the new column with a location that we can # use when mapping data # case detection residence map_location # 1 A Ankara Istanbul Ankara # 2 B Gaziantep Gaziantep Gaziantep # 3 C Istanbul Istanbul # 4 D Ankara Ankara # in linelist we have two columns with the name "district" in them, # district_res and district_det linelist %>% select(contains("district")) %>% head(5) # we can use coalesce() and mutate() to create a new district # column that uses the value from district_res and if that doesn't # exist, uses that from district_det # here it is at the end of our final data cleaning pipe # this pipe is long, and it is useful while developing it to test # it piece by piece but once complete it gives a reproducible # pipeline that captures all our data cleaning decisions in one # place linelist <- linelist_raw %>% clean_names() %>% distinct() %>% rename(date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% mutate(sex = na_if(sex, "Unknown")) %>% mutate(hospital_clean = recode(hospital, "Mitilary Hospital" = "Military Hospital", "Port Hopital" = "Port Hospital", "Port" = "Port Hospital")) %>% mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate" )) %>% mutate(age_years = case_when( age_unit == "months" ~ age / 12, age_unit == "years" ~ age, TRUE ~ age )) %>% mutate(age_group = age_categories(age_years, breakers = c(0, 5, 10, 15, 20, 30, 40, 50, 60, 70))) %>% mutate(district = coalesce(district_det, district_res)) # let's save the cleaned data to disk # the export() function will guess out format from the # file ending we use # .csv - comma separated value format # .tsv - tab separated value format # .rds - serialised R objects # # Which one you use depends on where you want to use the data # next. Do you want to load it in a spreadsheet? Import into # a database? Load into R again for more processing or # visualisation? # here we use CSV and data to data/linelist_clean.csv export(linelist, here("data", "linelist_clean.csv")) # not included here but also worth reading is the section # on filtering in https://tutorials.appliedepi.org/app/tutorial04_en