{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This file is part of the Minnesota Population Center's NHGISXWALK.\n", "# For copyright and licensing information, see the NOTICE and LICENSE files\n", "# in this project's top-level directory, and also on-line at:\n", "# https://github.com/ipums/nhgisxwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Sample workflow: 2000 block group parts to 2010 block groups\n", "\n", "## Starting from a subset of 2010 Delaware blocks\n", "\n", "For further background information see:\n", "\n", "* **Schroeder, J. P.** 2007. *Target-density weighting interpolation and uncertainty evaluation for temporal analysis of census data*. Geographical Analysis 39 (3):311–335.\n", "\n", "#### NHGIS [block crosswalks](https://www.nhgis.org/user-resources/geographic-crosswalks)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:25.359511Z", "start_time": "2020-08-19T22:07:25.243071Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-08-19T18:07:25-04:00\n", "\n", "CPython 3.8.5\n", "IPython 7.16.1\n", "\n", "compiler : Clang 10.0.1 \n", "system : Darwin\n", "release : 19.6.0\n", "machine : x86_64\n", "processor : i386\n", "CPU cores : 8\n", "interpreter: 64bit\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:25.648513Z", "start_time": "2020-08-19T22:07:25.361589Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "watermark 2.0.2\n", "pandas 1.1.0\n", "nhgisxwalk 0.0.9\n", "\n" ] } ], "source": [ "import nhgisxwalk\n", "import inspect\n", "import pandas\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source and target years for the 
crosswalk" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:25.671220Z", "start_time": "2020-08-19T22:07:25.651159Z" } }, "outputs": [], "source": [ "source_year, target_year = \"2000\", \"2010\"\n", "gj_src, gj_trg = \"GJOIN%s\"%source_year, \"GJOIN%s\"%target_year" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load the base block-to-block crosswalk (source to target)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:25.740179Z", "start_time": "2020-08-19T22:07:25.673969Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN2000GJOIN2010WEIGHTPAREA
0G10000100401001000G100001004010010001.0000001.000000
1G10000100401001001G100001004010010010.9999810.999988
2G10000100401001001G100001004010010030.0000190.000012
3G10000100401001002G100001004010010021.0000001.000000
4G10000100401001003G100001004010010031.0000001.000000
\n", "
" ], "text/plain": [ " GJOIN2000 GJOIN2010 WEIGHT PAREA\n", "0 G10000100401001000 G10000100401001000 1.000000 1.000000\n", "1 G10000100401001001 G10000100401001001 0.999981 0.999988\n", "2 G10000100401001001 G10000100401001003 0.000019 0.000012\n", "3 G10000100401001002 G10000100401001002 1.000000 1.000000\n", "4 G10000100401001003 G10000100401001003 1.000000 1.000000" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_data_dir = \"../testing_data_subsets/\"\n", "base_xwalk_name = \"nhgis_blk%s_blk%s_gj\" % (source_year, target_year)\n", "data_types = nhgisxwalk.str_types([gj_src, gj_trg])\n", "from_csv_kws = {\"path\": subset_data_dir, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "base_xwalk = nhgisxwalk.xwalk_df_from_csv(\n", " base_xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "base_xwalk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convenience code shorthand/lookup" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:25.764562Z", "start_time": "2020-08-19T22:07:25.742008Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "def valid_geo_shorthand(shorthand_name=True):\n", " \"\"\"Shorthand lookups for census geographies.\"\"\"\n", " lookup = {\n", " \"blk\": \"block\",\n", " \"bgp\": \"block group part\",\n", " \"bg\": \"block group\",\n", " \"tr\": \"tract\",\n", " \"co\": \"county\",\n", " }\n", " if not shorthand_name:\n", " lookup = {v: k for k, v in lookup.items()}\n", " return lookup\n", "\n" ] } ], "source": [ "print(inspect.getsource(nhgisxwalk.valid_geo_shorthand))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:25.786672Z", "start_time": "2020-08-19T22:07:25.766202Z" } }, "outputs": [ { "data": { "text/plain": [ "{'block': 'blk',\n", " 'block group part': 'bgp',\n", " 'block 
group': 'bg',\n", " 'tract': 'tr',\n", " 'county': 'co'}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.valid_geo_shorthand(shorthand_name=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Instantiate an `nhgisxwalk.GeoCrossWalk` object\n", "##### see [nhgisxwalk.GeoCrossWalk](https://github.com/ipums/nhgisxwalk/blob/92b4fe55de0a9c53d0315dcda8ec121faaf20aef/nhgisxwalk/geocrosswalk.py#L19) for full details" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:25.810746Z", "start_time": "2020-08-19T22:07:25.788752Z" } }, "outputs": [ { "data": { "text/plain": [ "{'Persons': {'Persons': 'Universe',\n", " 'NP001A': 'Source code',\n", " 'FXS': 'NHGIS code',\n", " 'Total': 'FXS001'},\n", " 'Families': {'Families': 'Universe',\n", " 'NP031A': 'Source code',\n", " 'F2V': 'NHGIS code',\n", " 'Total': 'F2V001'},\n", " 'Households': {'Households': 'Universe',\n", " 'NP010A': 'Source code',\n", " 'FY4': 'NHGIS code',\n", " 'Total': 'FY4001'},\n", " 'Housing Units': {'Housing Units': 'Universe',\n", " 'NH001A': 'Source code',\n", " 'FV5': 'NHGIS code',\n", " 'Total': 'FV5001'}}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.desc_code_2000_SF1b" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:25.834599Z", "start_time": "2020-08-19T22:07:25.814093Z" } }, "outputs": [ { "data": { "text/plain": [ "['FXS001', 'F2V001', 'FY4001', 'FV5001']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "input_vars = [\n", " nhgisxwalk.desc_code_2000_SF1b[\"Persons\"][\"Total\"],\n", " nhgisxwalk.desc_code_2000_SF1b[\"Families\"][\"Total\"],\n", " nhgisxwalk.desc_code_2000_SF1b[\"Households\"][\"Total\"],\n", " nhgisxwalk.desc_code_2000_SF1b[\"Housing Units\"][\"Total\"]\n", "]\n", "input_vars" ] }, { 
"cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:25.856409Z", "start_time": "2020-08-19T22:07:25.836915Z" } }, "outputs": [], "source": [ "input_var_tags = [\"pop\", \"fam\", \"hh\", \"hu\"]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:26.159122Z", "start_time": "2020-08-19T22:07:25.857972Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp2000gjbg2010gjbg2010gewt_popwt_famwt_hhwt_hu
0G10000109044444430042202U1G100001004220211000104220211.01.01.01.0
1G10000109044461265042201R1G100001004220111000104220111.01.01.01.0
2G10000109044461265042201U1G100001004220111000104220111.01.01.01.0
3G10000109044461265042201U2G100001004220121000104220121.01.01.01.0
4G10000109044461480042202R2G100001004220221000104220221.01.01.01.0
........................
1220G10000509355299999051500R4G100005005150041000505150041.01.01.01.0
1221G10000509355299999051500U1G100005005150011000505150011.01.01.01.0
1222G10000509355299999051500U3G100005005150031000505150031.01.01.01.0
1223G10000509355299999051500U4G100005005150041000505150041.01.01.01.0
1224G34003301061010600020400U2G100003099010001000399010000.00.00.00.0
\n", "

1225 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp2000gj bg2010gj bg2010ge wt_pop \\\n", "0 G10000109044444430042202U1 G10000100422021 100010422021 1.0 \n", "1 G10000109044461265042201R1 G10000100422011 100010422011 1.0 \n", "2 G10000109044461265042201U1 G10000100422011 100010422011 1.0 \n", "3 G10000109044461265042201U2 G10000100422012 100010422012 1.0 \n", "4 G10000109044461480042202R2 G10000100422022 100010422022 1.0 \n", "... ... ... ... ... \n", "1220 G10000509355299999051500R4 G10000500515004 100050515004 1.0 \n", "1221 G10000509355299999051500U1 G10000500515001 100050515001 1.0 \n", "1222 G10000509355299999051500U3 G10000500515003 100050515003 1.0 \n", "1223 G10000509355299999051500U4 G10000500515004 100050515004 1.0 \n", "1224 G34003301061010600020400U2 G10000309901000 100039901000 0.0 \n", "\n", " wt_fam wt_hh wt_hu \n", "0 1.0 1.0 1.0 \n", "1 1.0 1.0 1.0 \n", "2 1.0 1.0 1.0 \n", "3 1.0 1.0 1.0 \n", "4 1.0 1.0 1.0 \n", "... ... ... ... \n", "1220 1.0 1.0 1.0 \n", "1221 1.0 1.0 1.0 \n", "1222 1.0 1.0 1.0 \n", "1223 1.0 1.0 1.0 \n", "1224 0.0 0.0 0.0 \n", "\n", "[1225 rows x 7 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_state = \"10\"\n", "bgp2000_to_bg2010 = nhgisxwalk.GeoCrossWalk(\n", " base_xwalk,\n", " source_year=source_year,\n", " target_year=target_year,\n", " source_geo=\"bgp\",\n", " target_geo=\"bg\",\n", " base_source_table=subset_data_dir+\"/2000_block.csv.zip\",\n", " input_var=input_vars,\n", " weight_var=input_var_tags,\n", " stfips=subset_state,\n", " keep_base=True,\n", " add_geoid=True\n", ")\n", "bgp2000_to_bg2010.xwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Prepare a single data product with a `README.txt`" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:26.181895Z", "start_time": "2020-08-19T22:07:26.161187Z" } }, "outputs": [], "source": [ "xwalk, xwalk_name = bgp2000_to_bg2010.xwalk, 
bgp2000_to_bg2010.xwalk_name\n", "xwalk_name_base = \"_\".join(xwalk_name.split(\"_\")[:-1])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:26.218005Z", "start_time": "2020-08-19T22:07:26.183262Z" } }, "outputs": [], "source": [ "out_data_dir = \"../../crosswalks/\"\n", "out_path = \"%s%s%s/%s\" % (out_data_dir, xwalk_name_base, \"_state\", xwalk_name)\n", "nhgisxwalk.prepare_data_product(xwalk, xwalk_name, out_path, remove=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read crosswalk from a `.zip` archive" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:26.254819Z", "start_time": "2020-08-19T22:07:26.219857Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp2000gjbg2010gjbg2010gewt_popwt_famwt_hhwt_hu
0G10000109044444430042202U1G100001004220211000104220211.01.01.01.0
1G10000109044461265042201R1G100001004220111000104220111.01.01.01.0
2G10000109044461265042201U1G100001004220111000104220111.01.01.01.0
3G10000109044461265042201U2G100001004220121000104220121.01.01.01.0
4G10000109044461480042202R2G100001004220221000104220221.01.01.01.0
........................
1220G10000509355299999051500R4G100005005150041000505150041.01.01.01.0
1221G10000509355299999051500U1G100005005150011000505150011.01.01.01.0
1222G10000509355299999051500U3G100005005150031000505150031.01.01.01.0
1223G10000509355299999051500U4G100005005150041000505150041.01.01.01.0
1224G34003301061010600020400U2G100003099010001000399010000.00.00.00.0
\n", "

1225 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp2000gj bg2010gj bg2010ge wt_pop \\\n", "0 G10000109044444430042202U1 G10000100422021 100010422021 1.0 \n", "1 G10000109044461265042201R1 G10000100422011 100010422011 1.0 \n", "2 G10000109044461265042201U1 G10000100422011 100010422011 1.0 \n", "3 G10000109044461265042201U2 G10000100422012 100010422012 1.0 \n", "4 G10000109044461480042202R2 G10000100422022 100010422022 1.0 \n", "... ... ... ... ... \n", "1220 G10000509355299999051500R4 G10000500515004 100050515004 1.0 \n", "1221 G10000509355299999051500U1 G10000500515001 100050515001 1.0 \n", "1222 G10000509355299999051500U3 G10000500515003 100050515003 1.0 \n", "1223 G10000509355299999051500U4 G10000500515004 100050515004 1.0 \n", "1224 G34003301061010600020400U2 G10000309901000 100039901000 0.0 \n", "\n", " wt_fam wt_hh wt_hu \n", "0 1.0 1.0 1.0 \n", "1 1.0 1.0 1.0 \n", "2 1.0 1.0 1.0 \n", "3 1.0 1.0 1.0 \n", "4 1.0 1.0 1.0 \n", "... ... ... ... \n", "1220 1.0 1.0 1.0 \n", "1221 1.0 1.0 1.0 \n", "1222 1.0 1.0 1.0 \n", "1223 1.0 1.0 1.0 \n", "1224 0.0 0.0 0.0 \n", "\n", "[1225 rows x 7 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "in_path = \"%s%s%s\" % (out_data_dir, xwalk_name_base, \"_state/\")\n", "id_cols = [c for c in xwalk.columns if not c.startswith(\"wt\")]\n", "data_types = nhgisxwalk.str_types(id_cols)\n", "from_csv_kws = {\"path\": in_path, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "bgp2000_to_bg2010_df = nhgisxwalk.xwalk_df_from_csv(\n", " xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "bgp2000_to_bg2010_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------------------------------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2" }, "gist": { "data": { "description": "sample-workflow.ipynb", "public": true }, "id": "9f47e4ec2cc37bce83acf20abfca69d2" }, "kernelspec": { 
"display_name": "Python [conda env:nhgis] *", "language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }