{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This file is part of the Minnesota Population Center's NHGISXWALK.\n", "# For copyright and licensing information, see the NOTICE and LICENSE files\n", "# in this project's top-level directory, and also on-line at:\n", "# https://github.com/ipums/nhgisxwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Sample workflow: 2000 block group parts to 2010 counties\n", "\n", "## Starting from a subset of 2010 Delaware blocks\n", "\n", "For further background information see:\n", "\n", "* **Schroeder, J. P**. 2007. *Target-density weighting interpolation and uncertainty evaluation for temporal analysis of census data*. Geographical Analysis 39 (3):311–335.\n", "\n", "#### NHGIS [block crosswalks](https://www.nhgis.org/user-resources/geographic-crosswalks)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.079308Z", "start_time": "2020-08-19T22:07:52.962003Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-08-19T18:07:53-04:00\n", "\n", "CPython 3.8.5\n", "IPython 7.16.1\n", "\n", "compiler : Clang 10.0.1 \n", "system : Darwin\n", "release : 19.6.0\n", "machine : x86_64\n", "processor : i386\n", "CPU cores : 8\n", "interpreter: 64bit\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.366088Z", "start_time": "2020-08-19T22:07:53.082279Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "watermark 2.0.2\n", "pandas 1.1.0\n", "nhgisxwalk 0.0.9\n", "\n" ] } ], "source": [ "import nhgisxwalk\n", "import inspect\n", "import pandas\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source and target years for the 
crosswalk" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.385820Z", "start_time": "2020-08-19T22:07:53.368554Z" } }, "outputs": [], "source": [ "source_year, target_year = \"2000\", \"2010\"\n", "gj_src, gj_trg = \"GJOIN%s\"%source_year, \"GJOIN%s\"%target_year" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load the base block-to-block crosswalk (source year to target year)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.452406Z", "start_time": "2020-08-19T22:07:53.387660Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN2000GJOIN2010WEIGHTPAREA
0G10000100401001000G100001004010010001.0000001.000000
1G10000100401001001G100001004010010010.9999810.999988
2G10000100401001001G100001004010010030.0000190.000012
3G10000100401001002G100001004010010021.0000001.000000
4G10000100401001003G100001004010010031.0000001.000000
\n", "
" ], "text/plain": [ " GJOIN2000 GJOIN2010 WEIGHT PAREA\n", "0 G10000100401001000 G10000100401001000 1.000000 1.000000\n", "1 G10000100401001001 G10000100401001001 0.999981 0.999988\n", "2 G10000100401001001 G10000100401001003 0.000019 0.000012\n", "3 G10000100401001002 G10000100401001002 1.000000 1.000000\n", "4 G10000100401001003 G10000100401001003 1.000000 1.000000" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_data_dir = \"../testing_data_subsets/\"\n", "base_xwalk_name = \"nhgis_blk%s_blk%s_gj\" % (source_year, target_year)\n", "data_types = nhgisxwalk.str_types([gj_src, gj_trg])\n", "from_csv_kws = {\"path\": subset_data_dir, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "base_xwalk = nhgisxwalk.xwalk_df_from_csv(\n", " base_xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "base_xwalk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convenience code shorthand/lookup" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.472938Z", "start_time": "2020-08-19T22:07:53.453766Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "def valid_geo_shorthand(shorthand_name=True):\n", " \"\"\"Shorthand lookups for census geographies.\"\"\"\n", " lookup = {\n", " \"blk\": \"block\",\n", " \"bgp\": \"block group part\",\n", " \"bg\": \"block group\",\n", " \"tr\": \"tract\",\n", " \"co\": \"county\",\n", " }\n", " if not shorthand_name:\n", " lookup = {v: k for k, v in lookup.items()}\n", " return lookup\n", "\n" ] } ], "source": [ "print(inspect.getsource(nhgisxwalk.valid_geo_shorthand))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.493316Z", "start_time": "2020-08-19T22:07:53.474804Z" } }, "outputs": [ { "data": { "text/plain": [ "{'block': 'blk',\n", " 'block group part': 'bgp',\n", " 'block 
group': 'bg',\n", " 'tract': 'tr',\n", " 'county': 'co'}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.valid_geo_shorthand(shorthand_name=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Instantiate an `nhgisxwalk.GeoCrossWalk` object\n", "##### See [nhgisxwalk.GeoCrossWalk](https://github.com/ipums/nhgisxwalk/blob/92b4fe55de0a9c53d0315dcda8ec121faaf20aef/nhgisxwalk/geocrosswalk.py#L19) for full details" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.516112Z", "start_time": "2020-08-19T22:07:53.495167Z" } }, "outputs": [ { "data": { "text/plain": [ "{'Persons': {'Persons': 'Universe',\n", " 'NP001A': 'Source code',\n", " 'FXS': 'NHGIS code',\n", " 'Total': 'FXS001'},\n", " 'Families': {'Families': 'Universe',\n", " 'NP031A': 'Source code',\n", " 'F2V': 'NHGIS code',\n", " 'Total': 'F2V001'},\n", " 'Households': {'Households': 'Universe',\n", " 'NP010A': 'Source code',\n", " 'FY4': 'NHGIS code',\n", " 'Total': 'FY4001'},\n", " 'Housing Units': {'Housing Units': 'Universe',\n", " 'NH001A': 'Source code',\n", " 'FV5': 'NHGIS code',\n", " 'Total': 'FV5001'}}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.desc_code_2000_SF1b" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.538624Z", "start_time": "2020-08-19T22:07:53.519468Z" } }, "outputs": [ { "data": { "text/plain": [ "['FXS001', 'F2V001', 'FY4001', 'FV5001']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "input_vars = [\n", " nhgisxwalk.desc_code_2000_SF1b[\"Persons\"][\"Total\"],\n", " nhgisxwalk.desc_code_2000_SF1b[\"Families\"][\"Total\"],\n", " nhgisxwalk.desc_code_2000_SF1b[\"Households\"][\"Total\"],\n", " nhgisxwalk.desc_code_2000_SF1b[\"Housing Units\"][\"Total\"]\n", "]\n", "input_vars" ] }, { 
"cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.559398Z", "start_time": "2020-08-19T22:07:53.540898Z" } }, "outputs": [], "source": [ "input_var_tags = [\"pop\", \"fam\", \"hh\", \"hu\"]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.834370Z", "start_time": "2020-08-19T22:07:53.560797Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp2000gjco2010gjco2010gewt_popwt_famwt_hhwt_hu
0G10000109044444430042202U1G1000010100011.01.01.01.0
1G10000109044461265042201R1G1000010100011.01.01.01.0
2G10000109044461265042201U1G1000010100011.01.01.01.0
3G10000109044461265042201U2G1000010100011.01.01.01.0
4G10000109044461480042202R2G1000010100011.01.01.01.0
........................
903G10000509355299999051500R4G1000050100051.01.01.01.0
904G10000509355299999051500U1G1000050100051.01.01.01.0
905G10000509355299999051500U3G1000050100051.01.01.01.0
906G10000509355299999051500U4G1000050100051.01.01.01.0
907G34003301061010600020400U2G1000030100030.00.00.00.0
\n", "

908 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp2000gj co2010gj co2010ge wt_pop wt_fam wt_hh \\\n", "0 G10000109044444430042202U1 G1000010 10001 1.0 1.0 1.0 \n", "1 G10000109044461265042201R1 G1000010 10001 1.0 1.0 1.0 \n", "2 G10000109044461265042201U1 G1000010 10001 1.0 1.0 1.0 \n", "3 G10000109044461265042201U2 G1000010 10001 1.0 1.0 1.0 \n", "4 G10000109044461480042202R2 G1000010 10001 1.0 1.0 1.0 \n", ".. ... ... ... ... ... ... \n", "903 G10000509355299999051500R4 G1000050 10005 1.0 1.0 1.0 \n", "904 G10000509355299999051500U1 G1000050 10005 1.0 1.0 1.0 \n", "905 G10000509355299999051500U3 G1000050 10005 1.0 1.0 1.0 \n", "906 G10000509355299999051500U4 G1000050 10005 1.0 1.0 1.0 \n", "907 G34003301061010600020400U2 G1000030 10003 0.0 0.0 0.0 \n", "\n", " wt_hu \n", "0 1.0 \n", "1 1.0 \n", "2 1.0 \n", "3 1.0 \n", "4 1.0 \n", ".. ... \n", "903 1.0 \n", "904 1.0 \n", "905 1.0 \n", "906 1.0 \n", "907 0.0 \n", "\n", "[908 rows x 7 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_state = \"10\"\n", "bgp2000_to_co2010 = nhgisxwalk.GeoCrossWalk(\n", " base_xwalk,\n", " source_year=source_year,\n", " target_year=target_year,\n", " source_geo=\"bgp\",\n", " target_geo=\"co\",\n", " # subset_data_dir already ends with a path separator, so none is added here\n", " base_source_table=subset_data_dir + \"2000_block.csv.zip\",\n", " input_var=input_vars,\n", " weight_var=input_var_tags,\n", " stfips=subset_state,\n", " keep_base=True,\n", " add_geoid=True\n", ")\n", "bgp2000_to_co2010.xwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Prepare a single data product with a `README.txt`" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.857127Z", "start_time": "2020-08-19T22:07:53.835988Z" } }, "outputs": [], "source": [ "xwalk, xwalk_name = bgp2000_to_co2010.xwalk, bgp2000_to_co2010.xwalk_name\n", "xwalk_name_base = \"_\".join(xwalk_name.split(\"_\")[:-1])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { 
"ExecuteTime": { "end_time": "2020-08-19T22:07:53.886011Z", "start_time": "2020-08-19T22:07:53.859105Z" } }, "outputs": [], "source": [ "out_data_dir = \"../../crosswalks/\"\n", "out_path = \"%s%s%s/%s\" % (out_data_dir, xwalk_name_base, \"_state\", xwalk_name)\n", "nhgisxwalk.prepare_data_product(xwalk, xwalk_name, out_path, remove=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read crosswalk from a `.zip` archive" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:53.918720Z", "start_time": "2020-08-19T22:07:53.887487Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp2000gjco2010gjco2010gewt_popwt_famwt_hhwt_hu
0G10000109044444430042202U1G1000010100011.01.01.01.0
1G10000109044461265042201R1G1000010100011.01.01.01.0
2G10000109044461265042201U1G1000010100011.01.01.01.0
3G10000109044461265042201U2G1000010100011.01.01.01.0
4G10000109044461480042202R2G1000010100011.01.01.01.0
........................
903G10000509355299999051500R4G1000050100051.01.01.01.0
904G10000509355299999051500U1G1000050100051.01.01.01.0
905G10000509355299999051500U3G1000050100051.01.01.01.0
906G10000509355299999051500U4G1000050100051.01.01.01.0
907G34003301061010600020400U2G1000030100030.00.00.00.0
\n", "

908 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp2000gj co2010gj co2010ge wt_pop wt_fam wt_hh \\\n", "0 G10000109044444430042202U1 G1000010 10001 1.0 1.0 1.0 \n", "1 G10000109044461265042201R1 G1000010 10001 1.0 1.0 1.0 \n", "2 G10000109044461265042201U1 G1000010 10001 1.0 1.0 1.0 \n", "3 G10000109044461265042201U2 G1000010 10001 1.0 1.0 1.0 \n", "4 G10000109044461480042202R2 G1000010 10001 1.0 1.0 1.0 \n", ".. ... ... ... ... ... ... \n", "903 G10000509355299999051500R4 G1000050 10005 1.0 1.0 1.0 \n", "904 G10000509355299999051500U1 G1000050 10005 1.0 1.0 1.0 \n", "905 G10000509355299999051500U3 G1000050 10005 1.0 1.0 1.0 \n", "906 G10000509355299999051500U4 G1000050 10005 1.0 1.0 1.0 \n", "907 G34003301061010600020400U2 G1000030 10003 0.0 0.0 0.0 \n", "\n", " wt_hu \n", "0 1.0 \n", "1 1.0 \n", "2 1.0 \n", "3 1.0 \n", "4 1.0 \n", ".. ... \n", "903 1.0 \n", "904 1.0 \n", "905 1.0 \n", "906 1.0 \n", "907 0.0 \n", "\n", "[908 rows x 7 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "in_path = \"%s%s%s\" % (out_data_dir, xwalk_name_base, \"_state/\")\n", "id_cols = [c for c in xwalk.columns if not c.startswith(\"wt\")]\n", "data_types = nhgisxwalk.str_types(id_cols)\n", "from_csv_kws = {\"path\": in_path, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "bgp2000_to_co2010_df = nhgisxwalk.xwalk_df_from_csv(\n", " xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "bgp2000_to_co2010_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------------------------------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2" }, "gist": { "data": { "description": "sample-workflow.ipynb", "public": true }, "id": "9f47e4ec2cc37bce83acf20abfca69d2" }, "kernelspec": { "display_name": "Python [conda env:nhgis] *", "language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": 
"ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }