{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This file is part of the Minnesota Population Center's NHGISXWALK.\n", "# For copyright and licensing information, see the NOTICE and LICENSE files\n", "# in this project's top-level directory, and also on-line at:\n", "# https://github.com/ipums/nhgisxwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Sample workflow: 1990 block group parts to 2010 block groups\n", "\n", "## Starting from a subset of 2010 Delaware blocks\n", "\n", "This notebook builds a 1990 block group part to 2010 block group crosswalk with `nhgisxwalk.GeoCrossWalk`, writes it out as a single data product, and reads it back from the `.zip` archive.\n", "\n", "For further background information, see:\n", "\n", "* **Schroeder, J. P.** 2007. *Target-density weighting interpolation and uncertainty evaluation for temporal analysis of census data*. Geographical Analysis 39 (3):311–335.\n", "\n", "#### NHGIS [block crosswalks](https://www.nhgis.org/user-resources/geographic-crosswalks)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:38.723987Z", "start_time": "2020-08-19T22:06:38.608913Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-08-19T18:06:38-04:00\n", "\n", "CPython 3.8.5\n", "IPython 7.16.1\n", "\n", "compiler : Clang 10.0.1 \n", "system : Darwin\n", "release : 19.6.0\n", "machine : x86_64\n", "processor : i386\n", "CPU cores : 8\n", "interpreter: 64bit\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:38.996511Z", "start_time": "2020-08-19T22:06:38.726409Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "watermark 2.0.2\n", "nhgisxwalk 0.0.9\n", "pandas 1.1.0\n", "numpy 1.19.1\n", "\n" ] } ], "source": [ "import nhgisxwalk\n", "import inspect\n", "import numpy\n", "import pandas\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"### Source and target years for the crosswalk" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.018534Z", "start_time": "2020-08-19T22:06:38.999226Z" } }, "outputs": [], "source": [ "source_year, target_year = \"1990\", \"2010\"\n", "gj_src, gj_trg = \"GJOIN%s\"%source_year, \"GJOIN%s\"%target_year" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Base crosswalk: 1990 blocks to 2010 blocks" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.097345Z", "start_time": "2020-08-19T22:06:39.022010Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN1990GJOIN2010WEIGHTPAREA_VIA_BLK00
0G10000100401101G100001004010010001.0000001.000000
1G10000100401102G100001004010010010.9217500.976774
2G10000100401102G100001004010010020.0782190.023215
3G10000100401102G100001004010010030.0000310.000012
4G10000100401103G100001004010010031.0000001.000000
\n", "
" ], "text/plain": [ " GJOIN1990 GJOIN2010 WEIGHT PAREA_VIA_BLK00\n", "0 G10000100401101 G10000100401001000 1.000000 1.000000\n", "1 G10000100401102 G10000100401001001 0.921750 0.976774\n", "2 G10000100401102 G10000100401001002 0.078219 0.023215\n", "3 G10000100401102 G10000100401001003 0.000031 0.000012\n", "4 G10000100401103 G10000100401001003 1.000000 1.000000" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_data_dir = \"../testing_data_subsets/\"\n", "base_xwalk_name = \"nhgis_blk%s_blk%s_gj\" % (source_year, target_year)\n", "data_types = nhgisxwalk.str_types([gj_src, gj_trg])\n", "from_csv_kws = {\"path\": subset_data_dir, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "base_xwalk = nhgisxwalk.xwalk_df_from_csv(\n", " base_xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "base_xwalk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the base (source) summary file name" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.119768Z", "start_time": "2020-08-19T22:06:39.099351Z" } }, "outputs": [], "source": [ "base_source_name = \"%s_block.csv.zip\" % source_year\n", "base_source_file = \"%s%s\" % (subset_data_dir, base_source_name)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source supplementary summary data (special case for 1990)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.141025Z", "start_time": "2020-08-19T22:06:39.121076Z" } }, "outputs": [], "source": [ "supp_source_name = \"%s_blck_grp_598.csv.zip\" % source_year\n", "supp_source_file = subset_data_dir + supp_source_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convenience code shorthand/lookup" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": 
"2020-08-19T22:06:39.164448Z", "start_time": "2020-08-19T22:06:39.142893Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "def valid_geo_shorthand(shorthand_name=True):\n", " \"\"\"Shorthand lookups for census geographies.\"\"\"\n", " lookup = {\n", " \"blk\": \"block\",\n", " \"bgp\": \"block group part\",\n", " \"bg\": \"block group\",\n", " \"tr\": \"tract\",\n", " \"co\": \"county\",\n", " }\n", " if not shorthand_name:\n", " lookup = {v: k for k, v in lookup.items()}\n", " return lookup\n", "\n" ] } ], "source": [ "print(inspect.getsource(nhgisxwalk.valid_geo_shorthand))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.189374Z", "start_time": "2020-08-19T22:06:39.167620Z" } }, "outputs": [ { "data": { "text/plain": [ "{'block': 'blk',\n", " 'block group part': 'bgp',\n", " 'block group': 'bg',\n", " 'tract': 'tr',\n", " 'county': 'co'}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.valid_geo_shorthand(shorthand_name=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Instantiate an `nhgisxwalk.GeoCrossWalk` object\n", "##### see [nhgisxwalk.GeoCrossWalk](https://github.com/ipums/nhgisxwalk/blob/92b4fe55de0a9c53d0315dcda8ec121faaf20aef/nhgisxwalk/geocrosswalk.py#L19) for full details" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.213948Z", "start_time": "2020-08-19T22:06:39.191694Z" } }, "outputs": [ { "data": { "text/plain": [ "{'Persons': {'Persons': 'Universe',\n", " 'NP1': 'Source code',\n", " 'ET1': 'NHGIS code',\n", " 'Total': 'ET1001'},\n", " 'Families': {'Families': 'Universe',\n", " 'NP2': 'Source code',\n", " 'EUD': 'NHGIS code',\n", " 'Total': 'EUD001'},\n", " 'Households': {'Households': 'Universe',\n", " 'NP3': 'Source code',\n", " 'EUO': 'NHGIS code',\n", " 'Total': 'EUO001'},\n", " 'Housing Units': {'Housing 
Units': 'Universe',\n", " 'NH1': 'Source code',\n", " 'ESA': 'NHGIS code',\n", " 'Total': 'ESA001'}}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.desc_code_1990" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.239819Z", "start_time": "2020-08-19T22:06:39.215859Z" } }, "outputs": [ { "data": { "text/plain": [ "['ET1001', 'EUD001', 'EUO001', 'ESA001']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "input_vars = [\n", " nhgisxwalk.desc_code_1990[\"Persons\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Families\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Households\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Housing Units\"][\"Total\"]\n", "]\n", "input_vars" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.262756Z", "start_time": "2020-08-19T22:06:39.241779Z" } }, "outputs": [], "source": [ "input_var_tags = [\"pop\", \"fam\", \"hh\", \"hu\"]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.730547Z", "start_time": "2020-08-19T22:06:39.264209Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjbg2010gjbg2010gewt_popwt_famwt_hhwt_hu
0G100001090444072500423009999999999921G100001004320231000104320231.01.01.01.0
1G100001090444444300422009999999999926G100001004220211000104220211.01.01.01.0
2G100001090444612650422009999999219011G100001004120021000104120020.00.00.00.0
3G100001090444612650422009999999219011G100001004220111000104220111.01.01.01.0
4G100001090444612650422009999999219012G100001004220121000104220121.01.01.01.0
........................
1402G100005093552999990515009999999999923G100005005150031000505150031.01.01.01.0
1403G100005093552999990515009999999999924G100005005150041000505150041.01.01.01.0
1404G100005093552999990516009999999999921G100005005170211000505170211.01.01.01.0
1405G10000509900000G100005099000001000599000000.00.00.00.0
1406G340033010610106000204029999999916014G100003099010001000399010000.00.00.00.0
\n", "

1407 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj bg2010gj bg2010ge \\\n", "0 G100001090444072500423009999999999921 G10000100432023 100010432023 \n", "1 G100001090444444300422009999999999926 G10000100422021 100010422021 \n", "2 G100001090444612650422009999999219011 G10000100412002 100010412002 \n", "3 G100001090444612650422009999999219011 G10000100422011 100010422011 \n", "4 G100001090444612650422009999999219012 G10000100422012 100010422012 \n", "... ... ... ... \n", "1402 G100005093552999990515009999999999923 G10000500515003 100050515003 \n", "1403 G100005093552999990515009999999999924 G10000500515004 100050515004 \n", "1404 G100005093552999990516009999999999921 G10000500517021 100050517021 \n", "1405 G10000509900000 G10000509900000 100059900000 \n", "1406 G340033010610106000204029999999916014 G10000309901000 100039901000 \n", "\n", " wt_pop wt_fam wt_hh wt_hu \n", "0 1.0 1.0 1.0 1.0 \n", "1 1.0 1.0 1.0 1.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 1.0 1.0 1.0 1.0 \n", "4 1.0 1.0 1.0 1.0 \n", "... ... ... ... ... 
\n", "1402 1.0 1.0 1.0 1.0 \n", "1403 1.0 1.0 1.0 1.0 \n", "1404 1.0 1.0 1.0 1.0 \n", "1405 0.0 0.0 0.0 0.0 \n", "1406 0.0 0.0 0.0 0.0 \n", "\n", "[1407 rows x 7 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_state = \"10\"\n", "bgp1990_to_bg2010 = nhgisxwalk.GeoCrossWalk(\n", " base_xwalk,\n", " source_year=source_year,\n", " target_year=target_year,\n", " source_geo=\"bgp\",\n", " target_geo=\"bg\",\n", " base_source_table=base_source_file,\n", " supp_source_table=supp_source_file,\n", " input_var=input_vars,\n", " weight_var=input_var_tags,\n", " stfips=subset_state,\n", " keep_base=True,\n", " add_geoid=True\n", ")\n", "bgp1990_to_bg2010.xwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Prepare a single data product with a `README.txt`" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.753073Z", "start_time": "2020-08-19T22:06:39.732989Z" } }, "outputs": [], "source": [ "xwalk, xwalk_name = bgp1990_to_bg2010.xwalk, bgp1990_to_bg2010.xwalk_name\n", "xwalk_name_base = \"_\".join(xwalk_name.split(\"_\")[:-1])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.788390Z", "start_time": "2020-08-19T22:06:39.754809Z" } }, "outputs": [], "source": [ "out_data_dir = \"../../crosswalks/\"\n", "out_path = \"%s%s%s/%s\" % (out_data_dir, xwalk_name_base, \"_state\", xwalk_name)\n", "nhgisxwalk.prepare_data_product(xwalk, xwalk_name, out_path, remove=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read crosswalk from a `.zip` archive" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:06:39.824178Z", "start_time": "2020-08-19T22:06:39.790162Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjbg2010gjbg2010gewt_popwt_famwt_hhwt_hu
0G100001090444072500423009999999999921G100001004320231000104320231.01.01.01.0
1G100001090444444300422009999999999926G100001004220211000104220211.01.01.01.0
2G100001090444612650422009999999219011G100001004120021000104120020.00.00.00.0
3G100001090444612650422009999999219011G100001004220111000104220111.01.01.01.0
4G100001090444612650422009999999219012G100001004220121000104220121.01.01.01.0
........................
1402G100005093552999990515009999999999923G100005005150031000505150031.01.01.01.0
1403G100005093552999990515009999999999924G100005005150041000505150041.01.01.01.0
1404G100005093552999990516009999999999921G100005005170211000505170211.01.01.01.0
1405G10000509900000G100005099000001000599000000.00.00.00.0
1406G340033010610106000204029999999916014G100003099010001000399010000.00.00.00.0
\n", "

1407 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj bg2010gj bg2010ge \\\n", "0 G100001090444072500423009999999999921 G10000100432023 100010432023 \n", "1 G100001090444444300422009999999999926 G10000100422021 100010422021 \n", "2 G100001090444612650422009999999219011 G10000100412002 100010412002 \n", "3 G100001090444612650422009999999219011 G10000100422011 100010422011 \n", "4 G100001090444612650422009999999219012 G10000100422012 100010422012 \n", "... ... ... ... \n", "1402 G100005093552999990515009999999999923 G10000500515003 100050515003 \n", "1403 G100005093552999990515009999999999924 G10000500515004 100050515004 \n", "1404 G100005093552999990516009999999999921 G10000500517021 100050517021 \n", "1405 G10000509900000 G10000509900000 100059900000 \n", "1406 G340033010610106000204029999999916014 G10000309901000 100039901000 \n", "\n", " wt_pop wt_fam wt_hh wt_hu \n", "0 1.0 1.0 1.0 1.0 \n", "1 1.0 1.0 1.0 1.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 1.0 1.0 1.0 1.0 \n", "4 1.0 1.0 1.0 1.0 \n", "... ... ... ... ... 
\n", "1402 1.0 1.0 1.0 1.0 \n", "1403 1.0 1.0 1.0 1.0 \n", "1404 1.0 1.0 1.0 1.0 \n", "1405 0.0 0.0 0.0 0.0 \n", "1406 0.0 0.0 0.0 0.0 \n", "\n", "[1407 rows x 7 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "in_path = \"%s%s%s\" % (out_data_dir, xwalk_name_base, \"_state/\")\n", "id_cols = [c for c in xwalk.columns if not c.startswith(\"wt\")]\n", "data_types = nhgisxwalk.str_types(id_cols)\n", "from_csv_kws = {\"path\": in_path, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "bgp1990_to_bg2010_df = nhgisxwalk.xwalk_df_from_csv(\n", " xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "bgp1990_to_bg2010_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------------------------------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2" }, "gist": { "data": { "description": "sample-workflow.ipynb", "public": true }, "id": "9f47e4ec2cc37bce83acf20abfca69d2" }, "kernelspec": { "display_name": "Python [conda env:nhgis] *", "language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }