{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sample workflow: 1990 block group parts to 2010 counties\n", "\n", "## Starting from a subset of 2010 Delaware blocks\n", "\n", "For further background information see:\n", "\n", "* **Schroeder, J. P.** 2007. *Target-density weighting interpolation and uncertainty evaluation for temporal analysis of census data*. Geographical Analysis 39 (3):311–335.\n", "\n", "#### NHGIS [block crosswalks](https://www.nhgis.org/user-resources/geographic-crosswalks)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:08.088361Z", "start_time": "2020-08-19T22:07:07.960850Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-08-19T18:07:08-04:00\n", "\n", "CPython 3.8.5\n", "IPython 7.16.1\n", "\n", "compiler : Clang 10.0.1 \n", "system : Darwin\n", "release : 19.6.0\n", "machine : x86_64\n", "processor : i386\n", "CPU cores : 8\n", "interpreter: 64bit\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:08.375817Z", "start_time": "2020-08-19T22:07:08.091617Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "watermark 2.0.2\n", "nhgisxwalk 0.0.9\n", "numpy 1.19.1\n", "pandas 1.1.0\n", "\n" ] } ], "source": [ "import nhgisxwalk\n", "import inspect\n", "import numpy\n", "import pandas\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source and target years for the crosswalk" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:08.397900Z", "start_time": "2020-08-19T22:07:08.378466Z" } }, "outputs": [], "source": [ "source_year, target_year = \"1990\", \"2010\"\n", "gj_src, gj_trg = \"GJOIN%s\"%source_year, 
\"GJOIN%s\"%target_year" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source-target building base" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:08.478500Z", "start_time": "2020-08-19T22:07:08.400236Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN1990GJOIN2010WEIGHTPAREA_VIA_BLK00
0G10000100401101G100001004010010001.0000001.000000
1G10000100401102G100001004010010010.9217500.976774
2G10000100401102G100001004010010020.0782190.023215
3G10000100401102G100001004010010030.0000310.000012
4G10000100401103G100001004010010031.0000001.000000
\n", "
" ], "text/plain": [ " GJOIN1990 GJOIN2010 WEIGHT PAREA_VIA_BLK00\n", "0 G10000100401101 G10000100401001000 1.000000 1.000000\n", "1 G10000100401102 G10000100401001001 0.921750 0.976774\n", "2 G10000100401102 G10000100401001002 0.078219 0.023215\n", "3 G10000100401102 G10000100401001003 0.000031 0.000012\n", "4 G10000100401103 G10000100401001003 1.000000 1.000000" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_data_dir = \"../testing_data_subsets/\"\n", "base_xwalk_name = \"nhgis_blk%s_blk%s_gj\" % (source_year, target_year)\n", "data_types = nhgisxwalk.str_types([gj_src, gj_trg])\n", "from_csv_kws = {\"path\": subset_data_dir, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "base_xwalk = nhgisxwalk.xwalk_df_from_csv(\n", " base_xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "base_xwalk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the base (source) summary file name" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:08.504710Z", "start_time": "2020-08-19T22:07:08.482163Z" } }, "outputs": [], "source": [ "base_source_name = \"%s_block.csv.zip\" % source_year\n", "base_source_file = \"%s%s\" % (subset_data_dir, base_source_name)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source supplementary summary data (special case for 1990)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:08.525465Z", "start_time": "2020-08-19T22:07:08.506738Z" } }, "outputs": [], "source": [ "supp_source_name = \"%s_blck_grp_598.csv.zip\" % source_year\n", "supp_source_file = subset_data_dir + supp_source_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convenience code shorthand/lookup" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": 
"2020-08-19T22:07:08.547460Z", "start_time": "2020-08-19T22:07:08.526993Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "def valid_geo_shorthand(shorthand_name=True):\n", " \"\"\"Shorthand lookups for census geographies.\"\"\"\n", " lookup = {\n", " \"blk\": \"block\",\n", " \"bgp\": \"block group part\",\n", " \"bg\": \"block group\",\n", " \"tr\": \"tract\",\n", " \"co\": \"county\",\n", " }\n", " if not shorthand_name:\n", " lookup = {v: k for k, v in lookup.items()}\n", " return lookup\n", "\n" ] } ], "source": [ "print(inspect.getsource(nhgisxwalk.valid_geo_shorthand))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:08.571138Z", "start_time": "2020-08-19T22:07:08.550923Z" } }, "outputs": [ { "data": { "text/plain": [ "{'block': 'blk',\n", " 'block group part': 'bgp',\n", " 'block group': 'bg',\n", " 'tract': 'tr',\n", " 'county': 'co'}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.valid_geo_shorthand(shorthand_name=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Instantiate an `nhgisxwalk.GeoCrossWalk` object\n", "##### see [nhgisxwalk.GeoCrossWalk](https://github.com/jGaboardi/nhgisxwalk/blob/92b4fe55de0a9c53d0315dcda8ec121faaf20aef/nhgisxwalk/geocrosswalk.py#L19) for full details" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:08.592668Z", "start_time": "2020-08-19T22:07:08.573502Z" } }, "outputs": [ { "data": { "text/plain": [ "{'Persons': {'Persons': 'Universe',\n", " 'NP1': 'Source code',\n", " 'ET1': 'NHGIS code',\n", " 'Total': 'ET1001'},\n", " 'Families': {'Families': 'Universe',\n", " 'NP2': 'Source code',\n", " 'EUD': 'NHGIS code',\n", " 'Total': 'EUD001'},\n", " 'Households': {'Households': 'Universe',\n", " 'NP3': 'Source code',\n", " 'EUO': 'NHGIS code',\n", " 'Total': 'EUO001'},\n", " 'Housing Units': {'Housing 
Units': 'Universe',\n", " 'NH1': 'Source code',\n", " 'ESA': 'NHGIS code',\n", " 'Total': 'ESA001'}}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.desc_code_1990" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:08.614325Z", "start_time": "2020-08-19T22:07:08.594789Z" } }, "outputs": [ { "data": { "text/plain": [ "['ET1001', 'EUD001', 'EUO001', 'ESA001']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "input_vars = [\n", " nhgisxwalk.desc_code_1990[\"Persons\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Families\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Households\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Housing Units\"][\"Total\"]\n", "]\n", "input_vars" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:08.634374Z", "start_time": "2020-08-19T22:07:08.615903Z" } }, "outputs": [], "source": [ "input_var_tags = [\"pop\", \"fam\", \"hh\", \"hu\"]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:09.065023Z", "start_time": "2020-08-19T22:07:08.636210Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjco2010gjco2010gewt_popwt_famwt_hhwt_hu
0G100001090444072500423009999999999921G1000010100011.01.01.01.0
1G100001090444444300422009999999999926G1000010100011.01.01.01.0
2G100001090444612650422009999999219011G1000010100011.01.01.01.0
3G100001090444612650422009999999219012G1000010100011.01.01.01.0
4G100001090444614800422009999999999924G1000010100011.01.01.01.0
........................
772G100005093552999990515009999999999922G1000050100051.01.01.01.0
773G100005093552999990515009999999999923G1000050100051.01.01.01.0
774G100005093552999990515009999999999924G1000050100051.01.01.01.0
775G100005093552999990516009999999999921G1000050100051.01.01.01.0
776G340033010610106000204029999999916014G1000030100030.00.00.00.0
\n", "

777 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj co2010gj co2010ge wt_pop wt_fam \\\n", "0 G100001090444072500423009999999999921 G1000010 10001 1.0 1.0 \n", "1 G100001090444444300422009999999999926 G1000010 10001 1.0 1.0 \n", "2 G100001090444612650422009999999219011 G1000010 10001 1.0 1.0 \n", "3 G100001090444612650422009999999219012 G1000010 10001 1.0 1.0 \n", "4 G100001090444614800422009999999999924 G1000010 10001 1.0 1.0 \n", ".. ... ... ... ... ... \n", "772 G100005093552999990515009999999999922 G1000050 10005 1.0 1.0 \n", "773 G100005093552999990515009999999999923 G1000050 10005 1.0 1.0 \n", "774 G100005093552999990515009999999999924 G1000050 10005 1.0 1.0 \n", "775 G100005093552999990516009999999999921 G1000050 10005 1.0 1.0 \n", "776 G340033010610106000204029999999916014 G1000030 10003 0.0 0.0 \n", "\n", " wt_hh wt_hu \n", "0 1.0 1.0 \n", "1 1.0 1.0 \n", "2 1.0 1.0 \n", "3 1.0 1.0 \n", "4 1.0 1.0 \n", ".. ... ... \n", "772 1.0 1.0 \n", "773 1.0 1.0 \n", "774 1.0 1.0 \n", "775 1.0 1.0 \n", "776 0.0 0.0 \n", "\n", "[777 rows x 7 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_state = \"10\"\n", "bgp1990_to_co2010 = nhgisxwalk.GeoCrossWalk(\n", " base_xwalk,\n", " source_year=source_year,\n", " target_year=target_year,\n", " source_geo=\"bgp\",\n", " target_geo=\"co\",\n", " base_source_table=base_source_file,\n", " supp_source_table=supp_source_file,\n", " input_var=input_vars,\n", " weight_var=input_var_tags,\n", " stfips=subset_state,\n", " keep_base=True,\n", " add_geoid=True\n", ")\n", "bgp1990_to_co2010.xwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Prepare a single data product with a `README.txt`" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:09.086097Z", "start_time": "2020-08-19T22:07:09.066828Z" } }, "outputs": [], "source": [ "xwalk, xwalk_name = bgp1990_to_co2010.xwalk, bgp1990_to_co2010.xwalk_name\n", 
"xwalk_name_base = \"_\".join(xwalk_name.split(\"_\")[:-1])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:09.113915Z", "start_time": "2020-08-19T22:07:09.087948Z" } }, "outputs": [], "source": [ "out_data_dir = \"../../crosswalks/\"\n", "out_path = \"%s%s%s/%s\" % (out_data_dir, xwalk_name_base, \"_state\", xwalk_name)\n", "nhgisxwalk.prepare_data_product(xwalk, xwalk_name, out_path, remove=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read crosswalk from a `.zip` archive" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:09.147217Z", "start_time": "2020-08-19T22:07:09.115305Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjco2010gjco2010gewt_popwt_famwt_hhwt_hu
0G100001090444072500423009999999999921G1000010100011.01.01.01.0
1G100001090444444300422009999999999926G1000010100011.01.01.01.0
2G100001090444612650422009999999219011G1000010100011.01.01.01.0
3G100001090444612650422009999999219012G1000010100011.01.01.01.0
4G100001090444614800422009999999999924G1000010100011.01.01.01.0
........................
772G100005093552999990515009999999999922G1000050100051.01.01.01.0
773G100005093552999990515009999999999923G1000050100051.01.01.01.0
774G100005093552999990515009999999999924G1000050100051.01.01.01.0
775G100005093552999990516009999999999921G1000050100051.01.01.01.0
776G340033010610106000204029999999916014G1000030100030.00.00.00.0
\n", "

777 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj co2010gj co2010ge wt_pop wt_fam \\\n", "0 G100001090444072500423009999999999921 G1000010 10001 1.0 1.0 \n", "1 G100001090444444300422009999999999926 G1000010 10001 1.0 1.0 \n", "2 G100001090444612650422009999999219011 G1000010 10001 1.0 1.0 \n", "3 G100001090444612650422009999999219012 G1000010 10001 1.0 1.0 \n", "4 G100001090444614800422009999999999924 G1000010 10001 1.0 1.0 \n", ".. ... ... ... ... ... \n", "772 G100005093552999990515009999999999922 G1000050 10005 1.0 1.0 \n", "773 G100005093552999990515009999999999923 G1000050 10005 1.0 1.0 \n", "774 G100005093552999990515009999999999924 G1000050 10005 1.0 1.0 \n", "775 G100005093552999990516009999999999921 G1000050 10005 1.0 1.0 \n", "776 G340033010610106000204029999999916014 G1000030 10003 0.0 0.0 \n", "\n", " wt_hh wt_hu \n", "0 1.0 1.0 \n", "1 1.0 1.0 \n", "2 1.0 1.0 \n", "3 1.0 1.0 \n", "4 1.0 1.0 \n", ".. ... ... \n", "772 1.0 1.0 \n", "773 1.0 1.0 \n", "774 1.0 1.0 \n", "775 1.0 1.0 \n", "776 0.0 0.0 \n", "\n", "[777 rows x 7 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "in_path = \"%s%s%s\" % (out_data_dir, xwalk_name_base, \"_state/\")\n", "id_cols = [c for c in xwalk.columns if not c.startswith(\"wt\")]\n", "data_types = nhgisxwalk.str_types(id_cols)\n", "from_csv_kws = {\"path\": in_path, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "bgp1990_to_co2010_df = nhgisxwalk.xwalk_df_from_csv(\n", " xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "bgp1990_to_co2010_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------------------------------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2" }, "gist": { "data": { "description": "sample-workflow.ipynb", "public": true }, "id": "9f47e4ec2cc37bce83acf20abfca69d2" }, "kernelspec": { "display_name": "Python [conda env:nhgis]", 
"language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }