{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This file is part of the Minnesota Population Center's NHGISXWALK.\n", "# For copyright and licensing information, see the NOTICE and LICENSE files\n", "# in this project's top-level directory, and also on-line at:\n", "# https://github.com/ipums/nhgisxwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generate national and state-level crosswalks\n", "## 2000 block group parts to 2010 county\n", "\n", "### NHGIS [block crosswalks](https://www.nhgis.org/user-resources/geographic-crosswalks)\n", "\n", "**James D. Gaboardi, 06/2020**" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:19.285459Z", "start_time": "2020-10-01T21:53:19.140633Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-10-01T17:53:19-04:00\n", "\n", "CPython 3.8.5\n", "IPython 7.18.1\n", "\n", "compiler : Clang 10.0.1 \n", "system : Darwin\n", "release : 19.6.0\n", "machine : x86_64\n", "processor : i386\n", "CPU cores : 8\n", "interpreter: 64bit\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:19.555408Z", "start_time": "2020-10-01T21:53:19.287875Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "watermark 2.0.2\n", "pandas 1.1.1\n", "numpy 1.19.1\n", "nhgisxwalk 0.0.9post1\n", "\n" ] } ], "source": [ "import nhgisxwalk\n", "import inspect\n", "import numpy\n", "import pandas\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source and target years for the crosswalk" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:19.574264Z", "start_time": "2020-10-01T21:53:19.558483Z" } },
"outputs": [], "source": [ "# Census years of the source (from) and target (to) geographies\n", "source_year = \"2000\"\n", "target_year = \"2010\"\n", "\n", "# Column names of the year-specific GJOIN identifiers in the block crosswalk\n", "gj_src = f\"GJOIN{source_year}\"\n", "gj_trg = f\"GJOIN{target_year}\"" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:19.592374Z", "start_time": "2020-10-01T21:53:19.576945Z" } }, "outputs": [], "source": [ "# Placeholder paths: point both at local directories before running.\n", "# - `data_in` is given to `nhgisxwalk.xwalk_df_from_csv` (as `path`) and to\n", "#   `nhgisxwalk.generate_data_product`.\n", "# - `data_tab` is prefixed as-is onto the block summary file name, so keep\n", "#   its trailing slash.\n", "data_in = \"path/to/data/\"\n", "data_tab = \"path/to/data/\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:19.608098Z", "start_time": "2020-10-01T21:53:19.594045Z" } }, "outputs": [], "source": [ "# Stem reused for both the folder and the CSV name of the block summary table\n", "block_file = f\"{source_year}_block\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source-target building base" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:33.317361Z", "start_time": "2020-10-01T21:53:19.609436Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>GJOIN2000</th>\n", " <th>GJOIN2010</th>\n", " <th>WEIGHT</th>\n", " <th>PAREA</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>G01000100201001000</td>\n", " <td>G01000100201002000</td>\n", " <td>0.035897</td>\n", " <td>0.008988</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>G01000100201001000</td>\n", " <td>G01000100201002001</td>\n", " <td>0.253330</td>\n", " <td>0.263725</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>G01000100201001000</td>\n", " <td>G01000100201002002</td>\n", " <td>0.000000</td>\n", " <td>0.000385</td>\n", " </tr>\n", " <tr>\n", "
<th>3</th>\n", " <td>G01000100201001000</td>\n", " <td>G01000100201002003</td>\n", " <td>0.076297</td>\n", " <td>0.055430</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>G01000100201001000</td>\n", " <td>G01000100201002004</td>\n", " <td>0.032441</td>\n", " <td>0.007543</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " GJOIN2000 GJOIN2010 WEIGHT PAREA\n", "0 G01000100201001000 G01000100201002000 0.035897 0.008988\n", "1 G01000100201001000 G01000100201002001 0.253330 0.263725\n", "2 G01000100201001000 G01000100201002002 0.000000 0.000385\n", "3 G01000100201001000 G01000100201002003 0.076297 0.055430\n", "4 G01000100201001000 G01000100201002004 0.032441 0.007543" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Name of the base block-to-block crosswalk to load\n", "base_xwalk_name = f\"nhgis_blk{source_year}_blk{target_year}_gj\"\n", "\n", "# dtype spec for the two GJOIN columns, built by `nhgisxwalk.str_types`\n", "data_types = nhgisxwalk.str_types([gj_src, gj_trg])\n", "\n", "# Both keyword sets are unpacked into the same `xwalk_df_from_csv` call\n", "from_csv_kws = dict(path=data_in, archived=True, remove_unpacked=True)\n", "read_csv_kws = dict(dtype=data_types)\n", "base_xwalk = nhgisxwalk.xwalk_df_from_csv(\n", "    base_xwalk_name,\n", "    **from_csv_kws,\n", "    **read_csv_kws,\n", ")\n", "base_xwalk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source summary data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:33.333064Z", "start_time": "2020-10-01T21:53:33.319487Z" } }, "outputs": [], "source": [ "# Location of the block summary CSV relative to `data_tab`, then its full path\n", "base_source_name = f\"{block_file}/{block_file}.csv\"\n", "base_source_file = f\"{data_tab}{base_source_name}\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convenience code shorthand/lookup" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:33.350663Z", "start_time": "2020-10-01T21:53:33.336507Z" } }, "outputs": [ { "data": { "text/plain": [ "{'block': 'blk',\n", " 'block group part': 'bgp',\n", " 'block group':
'bg',\n", " 'tract': 'tr',\n", " 'county': 'co'}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Geography names mapped to their shorthand codes\n", "nhgisxwalk.valid_geo_shorthand(shorthand_name=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the `nhgisxwalk.GeoCrossWalk` parameters\n", "##### see [nhgisxwalk.GeoCrossWalk](https://github.com/ipums/nhgisxwalk/blob/92b4fe55de0a9c53d0315dcda8ec121faaf20aef/nhgisxwalk/geocrosswalk.py#L19) for full details" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:33.367949Z", "start_time": "2020-10-01T21:53:33.352761Z" } }, "outputs": [ { "data": { "text/plain": [ "{'Persons': {'Persons': 'Universe',\n", " 'NP001A': 'Source code',\n", " 'FXS': 'NHGIS code',\n", " 'Total': 'FXS001'},\n", " 'Families': {'Families': 'Universe',\n", " 'NP031A': 'Source code',\n", " 'F2V': 'NHGIS code',\n", " 'Total': 'F2V001'},\n", " 'Households': {'Households': 'Universe',\n", " 'NP010A': 'Source code',\n", " 'FY4': 'NHGIS code',\n", " 'Total': 'FY4001'},\n", " 'Housing Units': {'Housing Units': 'Universe',\n", " 'NH001A': 'Source code',\n", " 'FV5': 'NHGIS code',\n", " 'Total': 'FV5001'}}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Per-universe code lookup; its \"Total\" entries are the input variables below\n", "nhgisxwalk.desc_code_2000_SF1b" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:33.384148Z", "start_time": "2020-10-01T21:53:33.369508Z" } }, "outputs": [ { "data": { "text/plain": [ "['FXS001', 'F2V001', 'FY4001', 'FV5001']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# \"Total\" code of each universe, in the order the universes are listed here\n", "input_vars = [\n", "    nhgisxwalk.desc_code_2000_SF1b[universe][\"Total\"]\n", "    for universe in (\"Persons\", \"Families\", \"Households\", \"Housing Units\")\n", "]\n", "input_vars" ] }, {
"cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:33.399417Z", "start_time": "2020-10-01T21:53:33.385512Z" } }, "outputs": [], "source": [ "# Weight tags passed as `weight_var`; listed in the same order as `input_vars`\n", "input_var_tags = [\"pop\", \"fam\", \"hh\", \"hu\"]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:53:33.414885Z", "start_time": "2020-10-01T21:53:33.400734Z" } }, "outputs": [], "source": [ "# Keyword arguments handed to `nhgisxwalk.generate_data_product` below.\n", "# `bgp` and `co` are the shorthand codes for block group part and county.\n", "xwalk_args = dict(\n", "    source_year=source_year,\n", "    target_year=target_year,\n", "    source_geo=\"bgp\",\n", "    target_geo=\"co\",\n", "    base_source_table=base_source_file,\n", "    input_var=input_vars,\n", "    weight_var=input_var_tags,\n", "    keep_base=False,\n", "    add_geoid=True,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Generate data product\n", "1. Create a national crosswalk then split by state \n", "2. Write out all products with `README.txt` files" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:55:39.351727Z", "start_time": "2020-10-01T21:53:33.416175Z" } }, "outputs": [], "source": [ "# Slow step: the recorded run of this cell took about two minutes\n", "nhgisxwalk.generate_data_product(base_xwalk, xwalk_args, data_in)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------------------------------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2" }, "gist": { "data": { "description": "sample-workflow.ipynb", "public": true }, "id": "9f47e4ec2cc37bce83acf20abfca69d2" }, "kernelspec": { "display_name": "Python [conda env:nhgis] *", "language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }