---
title: "Reproducible Research"
author: 
date: "`r Sys.Date()`"
output: 
  html_document:
    code_folding: show
---

*Make sure you've followed the instructions under "Setting up your API key" in
the ipumsr API vignette (`vignette("ipums-api", package = "ipumsr")`) before
running this template.*

# {.tabset}

## Delete this section before sharing

This template is for two types of IPUMS users:

1. The user creating a new analysis that they'd like to share. We'll call this
   user the **analyst**.
2. The user with whom the analyst shares their analysis, and who wants to run,
   and perhaps modify, that analysis themselves. We'll call this user the
   **collaborator**.

This template uses the IPUMS API to help the analyst and collaborator work with
the same dataset. It helps the analyst by including code to download their
IPUMS data extract and save the extract definition in a shareable format, and
it helps the collaborator by including code to create and download a new
extract matching that shared definition.

If you're reading this, you are probably the analyst, because we recommend that
the analyst delete this section before sharing their analysis. If you are the
analyst, follow these steps to make your analysis shareable:

1. Submit the extract you want to analyze using the online extract system or
   API functions.
2. Fill in the parameters in the first code chunk below.
3. Click the RStudio `Knit` button or use `rmarkdown::render()` to run the
   template. Repeat this step until your extract is ready.
4. Once your extract is downloaded and the HTML report is created, update the
   file paths in the "Define File Paths" section below by copying the code
   generated at the bottom of this section in the HTML report.
5. Delete this section and proceed with your analysis.

```{r project-parameters}
#### Key Parameters #####
# If you change any of these parameters after running the template, delete all
# files in `data_dir` to ensure a fresh start

# The IPUMS data collection of your extract; run `ipums_data_collections()`
# for a list of supported collections
collection <- "usa"

# The extract number, or leave as `NULL` for your most recent extract
extract_num <- NULL

# A descriptive label for your extract; used to rename your data files
descriptive_name <- "my_ipums_extract"

# The folder in which to save data, codebook, and .json files
data_dir <- "data"
```

This next code chunk pulls down your extract definition and saves it to a JSON
file in `data_dir`. If the extract is ready, it downloads the data and codebook
files to `data_dir` and renames them according to `descriptive_name`. If the
extract is not ready, the code throws an error to inform you that your extract
is not ready yet, and that you should try re-running this template again later.

```{r analyst-check-for-data, class.source = "fold-hide", results = "asis"}
# Load ipumsr
suppressPackageStartupMessages(
  library(ipumsr)
)

# Create data_dir if it doesn't exist; `recursive = TRUE` lets a nested
# `data_dir` (e.g. "data/ipums") be created in one step
if (!dir.exists(data_dir)) {
  dir.create(data_dir, recursive = TRUE)
}

# Define file paths
json_path <- file.path(data_dir, paste0(descriptive_name, ".json"))
renamed_data_path <- file.path(data_dir, paste0(descriptive_name, ".dat.gz"))
renamed_ddi_path <- file.path(data_dir, paste0(descriptive_name, ".xml"))
gitignore_path <- file.path(data_dir, ".gitignore")

# Get info on the designated extract (most recent extract if extract_num is
# NULL)
if (is.null(extract_num)) {
  extract_definition <- get_last_extract_info(collection)
} else {
  extract_definition <- get_extract_info(c(collection, extract_num))
}

# Do we already have the data and/or the JSON? These are scalar flags, so use
# the short-circuiting `&&` rather than elementwise `&`
no_json <- !file.exists(json_path)
no_data_yes_json <- !file.exists(renamed_data_path) && file.exists(json_path)
yes_data_yes_json <- file.exists(renamed_data_path) && file.exists(json_path)

# If no JSON file, create it; the download branch below is then always taken
if (no_json) {
  save_extract_as_json(extract_definition, file = json_path)
  no_data_yes_json <- TRUE
}

# If we don't yet have the data, check whether the extract is ready
if (no_data_yes_json) {
  extract_is_ready <- is_extract_ready(extract_definition)

  # An extract that is "completed" but not ready is treated as stale, i.e. its
  # files are no longer available for download
  extract_is_stale <- !extract_is_ready &&
    extract_definition$status == "completed"

  if (extract_is_stale) {
    stop(
      paste0(
        "The data files for ", collection, " extract number ",
        extract_definition$number, " have been removed from IPUMS servers. ",
        "Resubmit this extract by running `submit_extract(get_extract_info(\"",
        collection, ":", extract_definition$number, "\"))` and update the ",
        "`extract_num` parameter before re-running the template."
      ),
      call. = FALSE
    )
  }

  # If extract is ready, download files and rename according to
  # `descriptive_name`
  if (extract_is_ready) {
    ddi_file <- download_extract(extract_definition, download_dir = data_dir)
    # The data file path is derived from the DDI path by swapping extensions
    data_file <- sub("\\.xml$", ".dat.gz", ddi_file)

    ddi_file_successfully_renamed <- file.rename(ddi_file, renamed_ddi_path)
    data_file_successfully_renamed <- file.rename(data_file, renamed_data_path)

    if (!ddi_file_successfully_renamed || !data_file_successfully_renamed) {
      stop(
        "Problem renaming DDI and/or data file; please report bug at ",
        "https://github.com/ipums/ipumsr/issues, including a copy of this ",
        "file if possible.",
        call. = FALSE
      )
    }

    # Add the data and codebook files to .gitignore. Existing lines are kept
    # as-is and only entries not already listed are appended, so re-running
    # the template does not accumulate duplicate entries.
    files_to_gitignore <- c(
      basename(renamed_data_path),
      basename(renamed_ddi_path)
    )
    if (file.exists(gitignore_path)) {
      existing_gitignore_lines <- readLines(gitignore_path)
      files_to_gitignore <- c(
        existing_gitignore_lines,
        setdiff(files_to_gitignore, existing_gitignore_lines)
      )
    }
    writeLines(files_to_gitignore, con = gitignore_path)

    yes_data_yes_json <- TRUE
  } else {
    # If extract isn't ready, stop execution
    stop(
      "NOT AN ERROR: ", collection, " extract number ",
      extract_definition$number, " is not yet ready to download. Try ",
      "re-running again later.",
      call. = FALSE
    )
  }
}

# If data are downloaded, copy file paths, then delete this section
if (yes_data_yes_json) {
  cat(
    paste0(
      "```\n",
      "Data, codebook, and .json extract definition files have been saved to ",
      "folder \"", data_dir, "\".\n\nNext, copy the code below into the ",
      "\"Define File Paths\" code chunk, overwriting the existing code:\n\n",
      "extract_definition_path <- \"", json_path, "\"\n",
      "data_path <- \"", renamed_data_path, "\"\n",
      "ddi_path <- \"", renamed_ddi_path, "\"\n\n",
      "Finally, delete all text and code in the section \"Delete this section ",
      "before sharing\"\n",
      "```"
    )
  )
}
```

## Load Packages

```{r load-packages}
suppressPackageStartupMessages({
  library(ipumsr)
  # library() additional packages as necessary
})
```

## Define File Paths

```{r define-file-paths}
extract_definition_path <- json_path
data_path <- gsub("\\.json$", ".dat.gz", extract_definition_path)
ddi_path <- gsub("\\.json$", ".xml", extract_definition_path)
```

## Load your IPUMS Data

This analysis of IPUMS data is designed to be shared, and thus does not assume
that you have already downloaded the data used in the analysis. The code below
checks whether the data are already downloaded, and if they aren't, it submits
a new IPUMS extract request according to the specifications in the included
extract definition JSON file.

```{r check-for-data, class.source = "fold-hide"}
# Define path to "waiting_for_extract" flag file, which lives next to the JSON
# extract definition
data_dir <- dirname(extract_definition_path)
waiting_for_extract_path <- file.path(data_dir, "waiting_for_extract.txt")

# Ensure the JSON extract definition is present
json_file_exists <- file.exists(extract_definition_path)
if (!json_file_exists) {
  stop(
    "File '", extract_definition_path, "' not found; make sure that ",
    "`extract_definition_path` is the path to the .json extract definition ",
    "file.",
    call. = FALSE
  )
}

# Are the data downloaded, or are we waiting for an extract?
data_downloaded <- file.exists(data_path)
data_not_downloaded <- !data_downloaded
waiting_for_extract <- file.exists(waiting_for_extract_path)

# Ensure that IPUMS_API_KEY environment variable is defined; it is only needed
# when the data still have to be fetched
ipums_api_key_undefined <- !nzchar(Sys.getenv("IPUMS_API_KEY"))
if (data_not_downloaded && ipums_api_key_undefined) {
  stop(
    "Environment variable 'IPUMS_API_KEY' is undefined. Make sure you've ",
    "followed the instructions under 'Setting up your API key' in the ",
    "ipumsr API vignette (`vignette(\"ipums-api\", package = \"ipumsr\")`) ",
    "before running this script.",
    call. = FALSE
  )
}

# If not yet waiting for extract, create and submit a new extract and create
# the "waiting_for_extract" flag file, which records "collection:number"
if (data_not_downloaded && !waiting_for_extract) {
  extract_definition <- define_extract_from_json(extract_definition_path)
  submitted_extract <- submit_extract(extract_definition)
  writeLines(
    paste0(submitted_extract$collection, ":", submitted_extract$number),
    con = waiting_for_extract_path
  )
  waiting_for_extract <- TRUE
}

# If waiting for an extract, read extract ID from flag file and check the
# status
if (data_not_downloaded && waiting_for_extract) {
  extract_id <- trimws(readLines(waiting_for_extract_path, warn = FALSE))
  extract_id <- extract_id[nzchar(extract_id)]

  # Guard against an empty or blank flag file, which would otherwise surface
  # as an opaque error when looking up the extract
  if (length(extract_id) == 0L) {
    stop(
      "File '", waiting_for_extract_path, "' does not contain an extract ID. ",
      "Please delete the file and re-run the template.",
      call. = FALSE
    )
  }

  extract_info <- get_extract_info(extract_id)
  extract_is_ready <- is_extract_ready(extract_info)

  # An extract that is "completed" but not ready is treated as stale, i.e. its
  # files are no longer available for download
  extract_is_stale <- !extract_is_ready && extract_info$status == "completed"

  if (extract_is_stale) {
    stop(
      paste0(
        "The data files for ", extract_info$collection, " extract number ",
        extract_info$number, " have been removed from IPUMS servers. ",
        "Please delete the file '", waiting_for_extract_path, "' and re-run ",
        "the template."
      ),
      call. = FALSE
    )
  }

  # If the extract is ready, download files and rename to match the JSON file,
  # then delete the waiting_for_extract flag file
  if (extract_is_ready) {
    orig_ddi_path <- download_extract(extract_info, download_dir = data_dir)
    # The data file path is derived from the DDI path by swapping extensions
    orig_data_path <- sub("\\.xml$", ".dat.gz", orig_ddi_path)

    ddi_file_successfully_renamed <- file.rename(orig_ddi_path, ddi_path)
    data_file_successfully_renamed <- file.rename(orig_data_path, data_path)

    if (!ddi_file_successfully_renamed || !data_file_successfully_renamed) {
      stop(
        "Problem renaming DDI and/or data file; please report bug at ",
        "https://github.com/ipums/ipumsr/issues, including a copy of this ",
        "file if possible.",
        call. = FALSE
      )
    }

    # Downstream chunks use this flag in their `eval` option
    data_downloaded <- TRUE

    waiting_file_successfully_removed <- file.remove(waiting_for_extract_path)

    if (!waiting_file_successfully_removed) {
      stop(
        "Unable to remove 'waiting_for_extract.txt'; please report bug at ",
        "https://github.com/ipums/ipumsr/issues, including a copy of this ",
        "file if possible.",
        call. = FALSE
      )
    }
  } else {
    # If extract is not ready, stop execution
    stop(
      "NOT AN ERROR: ", extract_info$collection, " extract number ",
      extract_info$number, " is not yet ready to download. Try ",
      "re-running again later.",
      call. = FALSE
    )
  }
}
```

```{r load-data, eval = data_downloaded}
ddi <- read_ipums_ddi(ddi_path)
data <- read_ipums_micro(ddi, data_file = data_path)
```

## Analysis Awaits {.active}

```{r analyze-data, eval = data_downloaded}
data
```