{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sample workflow: 1990 block group parts to 2010 counties\n", "\n", "## Starting from a subset of 2010 Delaware blocks\n", "\n", "For further background information see:\n", "* **Schroeder, J. P**. 2007. *Target-density weighting interpolation and uncertainty evaluation for temporal analysis of census data*. Geographical Analysis 39 (3):311–335.\n", "\n", "#### NHGIS [block crosswalks](https://www.nhgis.org/user-resources/geographic-crosswalks)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:09:36.595166Z", "start_time": "2020-06-22T00:09:36.562841Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-06-21T20:09:36-04:00\n", "\n", "CPython 3.7.6\n", "IPython 7.15.0\n", "\n", "compiler : Clang 9.0.1 \n", "system : Darwin\n", "release : 19.5.0\n", "machine : x86_64\n", "processor : i386\n", "CPU cores : 8\n", "interpreter: 64bit\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:09:37.352276Z", "start_time": "2020-06-22T00:09:37.073590Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "watermark 2.0.2\n", "nhgisxwalk 0.0.5\n", "numpy 1.18.5\n", "pandas 1.0.4\n", "\n" ] } ], "source": [ "import nhgisxwalk\n", "import inspect\n", "import numpy\n", "import pandas\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source and target years for the crosswalk" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:09:38.160877Z", "start_time": "2020-06-22T00:09:38.130399Z" } }, "outputs": [], "source": [ "source_year, target_year = \"1990\", \"2010\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source-target building base" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:09:38.965841Z", "start_time": "2020-06-22T00:09:38.874865Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN1990GJOIN2010WEIGHTPAREA_VIA_BLK00
0NaNG100001004320210780.00.0
1NaNG100001004320230140.00.0
2NaNG100001004320230150.00.0
3NaNG100001099000000110.00.0
4NaNG100001099000000120.00.0
\n", "
" ], "text/plain": [ " GJOIN1990 GJOIN2010 WEIGHT PAREA_VIA_BLK00\n", "0 NaN G10000100432021078 0.0 0.0\n", "1 NaN G10000100432023014 0.0 0.0\n", "2 NaN G10000100432023015 0.0 0.0\n", "3 NaN G10000109900000011 0.0 0.0\n", "4 NaN G10000109900000012 0.0 0.0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_data_dir = \"../testing_data_subsets\"\n", "base_xwalk_name = \"/nhgis_blk%s_blk%s_gj.csv.zip\" % (source_year, target_year)\n", "base_xwalk_file = subset_data_dir + base_xwalk_name\n", "data_types = nhgisxwalk.str_types([\"GJOIN%s\"%source_year, \"GJOIN%s\"%target_year])\n", "base_xwalk = pandas.read_csv(base_xwalk_file, index_col=0, dtype=data_types)\n", "base_xwalk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source supplementary summary data (special case for 1990)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:09:40.983368Z", "start_time": "2020-06-22T00:09:40.953058Z" } }, "outputs": [], "source": [ "supp_source_name = \"%s_blck_grp_598_103.csv.zip\" % source_year\n", "supp_source_file = subset_data_dir + supp_source_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convenience code shorthand/lookup" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:09:41.794199Z", "start_time": "2020-06-22T00:09:41.761857Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "def valid_geo_shorthand(shorthand_name=True):\n", " \"\"\"Shorthand lookups for census geographies.\"\"\"\n", " lookup = {\n", " \"blk\": \"block\",\n", " \"bgp\": \"block group part\",\n", " \"bkg\": \"block group\",\n", " \"trt\": \"tract\",\n", " \"cty\": \"county\",\n", " }\n", " if not shorthand_name:\n", " lookup = {v: k for k, v in lookup.items()}\n", " return lookup\n", "\n" ] } ], "source": [ "print(inspect.getsource(nhgisxwalk.valid_geo_shorthand))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:09:42.534098Z", "start_time": "2020-06-22T00:09:42.502108Z" } }, "outputs": [ { "data": { "text/plain": [ "{'block': 'blk',\n", " 'block group part': 'bgp',\n", " 'block group': 'bkg',\n", " 'tract': 'trt',\n", " 'county': 'cty'}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.valid_geo_shorthand(shorthand_name=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Instantiate an `nhgisxwalk.GeoCrossWalk` object\n", "##### see [nhgisxwalk.GeoCrossWalk](https://github.com/jGaboardi/nhgisxwalk/blob/92b4fe55de0a9c53d0315dcda8ec121faaf20aef/nhgisxwalk/geocrosswalk.py#L19) for full details" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:09:43.218813Z", "start_time": "2020-06-22T00:09:43.185876Z" } }, "outputs": [ { "data": { "text/plain": [ "{'Persons': {'Persons': 'Universe',\n", " 'NP1': 'Source code',\n", " 'ET1': 'NHGIS code',\n", " 'Total': 'ET1001'},\n", " 'Families': {'Families': 'Universe',\n", " 'NP2': 'Source code',\n", " 'EUD': 'NHGIS code',\n", " 'Total': 'EUD001'},\n", " 'Households': {'Households': 'Universe',\n", " 'NP3': 'Source code',\n", " 'EUO': 'NHGIS code',\n", " 'Total': 'EUO001'},\n", " 'Housing Units': {'Housing Units': 'Universe',\n", " 'NH1': 'Source code',\n", " 'ESA': 'NHGIS code',\n", " 'Total': 'ESA001'}}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.desc_code_1990" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:09:43.651933Z", "start_time": "2020-06-22T00:09:43.617962Z" } }, "outputs": [ { "data": { "text/plain": [ "['ET1001', 'EUD001', 'EUO001', 'ESA001']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "input_vars = [\n", " nhgisxwalk.desc_code_1990[\"Persons\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Families\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Households\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Housing Units\"][\"Total\"]\n", "]\n", "input_vars" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:09:44.595756Z", "start_time": "2020-06-22T00:09:44.565838Z" } }, "outputs": [], "source": [ "input_var_tags = [\"pop\", \"fam\", \"hh\", \"hu\"]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:09:46.260049Z", "start_time": "2020-06-22T00:09:45.799249Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjcty2010gjcty2010gewt_popwt_famwt_hhwt_hu
0G100001090444072500423009999999999921G1000010100011.01.01.01.0
1G100001090444444300422009999999999926G1000010100011.01.01.01.0
2G100001090444612650422009999999219011G1000010100011.01.01.01.0
3G100001090444612650422009999999219012G1000010100011.01.01.01.0
4G100001090444614800422009999999999924G1000010100011.01.01.01.0
........................
772G100005093552999990515009999999999922G1000050100051.01.01.01.0
773G100005093552999990515009999999999923G1000050100051.01.01.01.0
774G100005093552999990515009999999999924G1000050100051.01.01.01.0
775G100005093552999990516009999999999921G1000050100051.01.01.01.0
776G340033010610106000204029999999916014G1000030100030.00.00.00.0
\n", "

777 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj cty2010gj cty2010ge wt_pop \\\n", "0 G100001090444072500423009999999999921 G1000010 10001 1.0 \n", "1 G100001090444444300422009999999999926 G1000010 10001 1.0 \n", "2 G100001090444612650422009999999219011 G1000010 10001 1.0 \n", "3 G100001090444612650422009999999219012 G1000010 10001 1.0 \n", "4 G100001090444614800422009999999999924 G1000010 10001 1.0 \n", ".. ... ... ... ... \n", "772 G100005093552999990515009999999999922 G1000050 10005 1.0 \n", "773 G100005093552999990515009999999999923 G1000050 10005 1.0 \n", "774 G100005093552999990515009999999999924 G1000050 10005 1.0 \n", "775 G100005093552999990516009999999999921 G1000050 10005 1.0 \n", "776 G340033010610106000204029999999916014 G1000030 10003 0.0 \n", "\n", " wt_fam wt_hh wt_hu \n", "0 1.0 1.0 1.0 \n", "1 1.0 1.0 1.0 \n", "2 1.0 1.0 1.0 \n", "3 1.0 1.0 1.0 \n", "4 1.0 1.0 1.0 \n", ".. ... ... ... \n", "772 1.0 1.0 1.0 \n", "773 1.0 1.0 1.0 \n", "774 1.0 1.0 1.0 \n", "775 1.0 1.0 1.0 \n", "776 0.0 0.0 0.0 \n", "\n", "[777 rows x 7 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_state = \"10\"\n", "bgp1990_to_cty2010 = nhgisxwalk.GeoCrossWalk(\n", " base_xwalk,\n", " source_year=source_year,\n", " target_year=target_year,\n", " source_geo=\"bgp\",\n", " target_geo=\"cty\",\n", " base_source_table=subset_data_dir+\"/1990_block.csv.zip\",\n", " supp_source_table=subset_data_dir+\"/1990_blck_grp_598_103.csv.zip\",\n", " input_var=input_vars,\n", " weight_var=input_var_tags,\n", " stfips=subset_state,\n", " keep_base=True,\n", " add_geoid=True\n", ")\n", "bgp1990_to_cty2010.xwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Write crosswalk to a `.csv`" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:26:46.621188Z", "start_time": "2020-06-22T00:26:46.509756Z" } }, "outputs": [], "source": [ "state_dir = \"../../crosswalks/nhgis_bgp1990_cty2010_state/\"\n", "nhgisxwalk.xwalk_df_to_csv(\n", " cls=bgp1990_to_cty2010,\n", " path=state_dir\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read crosswalk from a `.csv`" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2020-06-22T00:26:49.866489Z", "start_time": "2020-06-22T00:26:49.833544Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjcty2010gjcty2010gewt_popwt_famwt_hhwt_hu
0G100001090444072500423009999999999921G1000010100011.01.01.01.0
1G100001090444444300422009999999999926G1000010100011.01.01.01.0
2G100001090444612650422009999999219011G1000010100011.01.01.01.0
3G100001090444612650422009999999219012G1000010100011.01.01.01.0
4G100001090444614800422009999999999924G1000010100011.01.01.01.0
........................
772G100005093552999990515009999999999922G1000050100051.01.01.01.0
773G100005093552999990515009999999999923G1000050100051.01.01.01.0
774G100005093552999990515009999999999924G1000050100051.01.01.01.0
775G100005093552999990516009999999999921G1000050100051.01.01.01.0
776G340033010610106000204029999999916014G1000030100030.00.00.00.0
\n", "

777 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj cty2010gj cty2010ge wt_pop \\\n", "0 G100001090444072500423009999999999921 G1000010 10001 1.0 \n", "1 G100001090444444300422009999999999926 G1000010 10001 1.0 \n", "2 G100001090444612650422009999999219011 G1000010 10001 1.0 \n", "3 G100001090444612650422009999999219012 G1000010 10001 1.0 \n", "4 G100001090444614800422009999999999924 G1000010 10001 1.0 \n", ".. ... ... ... ... \n", "772 G100005093552999990515009999999999922 G1000050 10005 1.0 \n", "773 G100005093552999990515009999999999923 G1000050 10005 1.0 \n", "774 G100005093552999990515009999999999924 G1000050 10005 1.0 \n", "775 G100005093552999990516009999999999921 G1000050 10005 1.0 \n", "776 G340033010610106000204029999999916014 G1000030 10003 0.0 \n", "\n", " wt_fam wt_hh wt_hu \n", "0 1.0 1.0 1.0 \n", "1 1.0 1.0 1.0 \n", "2 1.0 1.0 1.0 \n", "3 1.0 1.0 1.0 \n", "4 1.0 1.0 1.0 \n", ".. ... ... ... \n", "772 1.0 1.0 1.0 \n", "773 1.0 1.0 1.0 \n", "774 1.0 1.0 1.0 \n", "775 1.0 1.0 1.0 \n", "776 0.0 0.0 0.0 \n", "\n", "[777 rows x 7 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fname = state_dir + bgp1990_to_cty2010.xwalk_name\n", "bgp1990_to_bkg2010_df = nhgisxwalk.xwalk_df_from_csv(fname)\n", "bgp1990_to_bkg2010_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------------------------------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2" }, "gist": { "data": { "description": "sample-workflow.ipynb", "public": true }, "id": "9f47e4ec2cc37bce83acf20abfca69d2" }, "kernelspec": { "display_name": "Python [conda env:nhgis]", "language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }