{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This file is part of the Minnesota Population Center's NHGISXWALK.\n", "# For copyright and licensing information, see the NOTICE and LICENSE files\n", "# in this project's top-level directory, and also on-line at:\n", "# https://github.com/ipums/nhgisxwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Sample workflow: 1990 block group parts to 2010 tracts\n", "\n", "## Starting from a subset of 2010 Delaware blocks\n", "\n", "For further background information see:\n", "\n", "* **Schroeder, J. P.** 2007. *Target-density weighting interpolation and uncertainty evaluation for temporal analysis of census data*. Geographical Analysis 39 (3):311–335.\n", "\n", "#### NHGIS [block crosswalks](https://www.nhgis.org/user-resources/geographic-crosswalks)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:22.191149Z", "start_time": "2020-08-19T22:07:22.064267Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-08-19T18:07:22-04:00\n", "\n", "CPython 3.8.5\n", "IPython 7.16.1\n", "\n", "compiler : Clang 10.0.1 \n", "system : Darwin\n", "release : 19.6.0\n", "machine : x86_64\n", "processor : i386\n", "CPU cores : 8\n", "interpreter: 64bit\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:22.478158Z", "start_time": "2020-08-19T22:07:22.193759Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "watermark 2.0.2\n", "numpy 1.19.1\n", "pandas 1.1.0\n", "nhgisxwalk 0.0.9\n", "\n" ] } ], "source": [ "import nhgisxwalk\n", "import inspect\n", "import numpy\n", "import pandas\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 
Source and target years for the crosswalk" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:22.498507Z", "start_time": "2020-08-19T22:07:22.480973Z" } }, "outputs": [], "source": [ "source_year, target_year = \"1990\", \"2010\"\n", "gj_src, gj_trg = \"GJOIN%s\"%source_year, \"GJOIN%s\"%target_year" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Base block-level crosswalk (source blocks to target blocks)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:22.571834Z", "start_time": "2020-08-19T22:07:22.500302Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN1990GJOIN2010WEIGHTPAREA_VIA_BLK00
0G10000100401101G100001004010010001.0000001.000000
1G10000100401102G100001004010010010.9217500.976774
2G10000100401102G100001004010010020.0782190.023215
3G10000100401102G100001004010010030.0000310.000012
4G10000100401103G100001004010010031.0000001.000000
\n", "
" ], "text/plain": [ " GJOIN1990 GJOIN2010 WEIGHT PAREA_VIA_BLK00\n", "0 G10000100401101 G10000100401001000 1.000000 1.000000\n", "1 G10000100401102 G10000100401001001 0.921750 0.976774\n", "2 G10000100401102 G10000100401001002 0.078219 0.023215\n", "3 G10000100401102 G10000100401001003 0.000031 0.000012\n", "4 G10000100401103 G10000100401001003 1.000000 1.000000" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_data_dir = \"../testing_data_subsets/\"\n", "base_xwalk_name = \"nhgis_blk%s_blk%s_gj\" % (source_year, target_year)\n", "data_types = nhgisxwalk.str_types([gj_src, gj_trg])\n", "from_csv_kws = {\"path\": subset_data_dir, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "base_xwalk = nhgisxwalk.xwalk_df_from_csv(\n", " base_xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "base_xwalk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the base (source) summary file name" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:22.591531Z", "start_time": "2020-08-19T22:07:22.573773Z" } }, "outputs": [], "source": [ "base_source_name = \"%s_block.csv.zip\" % source_year\n", "base_source_file = \"%s%s\" % (subset_data_dir, base_source_name)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source supplementary summary data (special case for 1990)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:22.609948Z", "start_time": "2020-08-19T22:07:22.592899Z" } }, "outputs": [], "source": [ "supp_source_name = \"%s_blck_grp_598.csv.zip\" % source_year\n", "supp_source_file = \"%s%s\" % (subset_data_dir, supp_source_name)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convenience code shorthand/lookup" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": 
"2020-08-19T22:07:22.629510Z", "start_time": "2020-08-19T22:07:22.611398Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "def valid_geo_shorthand(shorthand_name=True):\n", " \"\"\"Shorthand lookups for census geographies.\"\"\"\n", " lookup = {\n", " \"blk\": \"block\",\n", " \"bgp\": \"block group part\",\n", " \"bg\": \"block group\",\n", " \"tr\": \"tract\",\n", " \"co\": \"county\",\n", " }\n", " if not shorthand_name:\n", " lookup = {v: k for k, v in lookup.items()}\n", " return lookup\n", "\n" ] } ], "source": [ "print(inspect.getsource(nhgisxwalk.valid_geo_shorthand))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:22.649827Z", "start_time": "2020-08-19T22:07:22.632253Z" } }, "outputs": [ { "data": { "text/plain": [ "{'block': 'blk',\n", " 'block group part': 'bgp',\n", " 'block group': 'bg',\n", " 'tract': 'tr',\n", " 'county': 'co'}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.valid_geo_shorthand(shorthand_name=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Instantiate an `nhgisxwalk.GeoCrossWalk` object\n", "##### see [nhgisxwalk.GeoCrossWalk](https://github.com/ipums/nhgisxwalk/blob/92b4fe55de0a9c53d0315dcda8ec121faaf20aef/nhgisxwalk/geocrosswalk.py#L19) for full details" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:22.670875Z", "start_time": "2020-08-19T22:07:22.651614Z" } }, "outputs": [ { "data": { "text/plain": [ "{'Persons': {'Persons': 'Universe',\n", " 'NP1': 'Source code',\n", " 'ET1': 'NHGIS code',\n", " 'Total': 'ET1001'},\n", " 'Families': {'Families': 'Universe',\n", " 'NP2': 'Source code',\n", " 'EUD': 'NHGIS code',\n", " 'Total': 'EUD001'},\n", " 'Households': {'Households': 'Universe',\n", " 'NP3': 'Source code',\n", " 'EUO': 'NHGIS code',\n", " 'Total': 'EUO001'},\n", " 'Housing Units': {'Housing 
Units': 'Universe',\n", " 'NH1': 'Source code',\n", " 'ESA': 'NHGIS code',\n", " 'Total': 'ESA001'}}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.desc_code_1990" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:22.691725Z", "start_time": "2020-08-19T22:07:22.672883Z" } }, "outputs": [ { "data": { "text/plain": [ "['ET1001', 'EUD001', 'EUO001', 'ESA001']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "input_vars = [\n", " nhgisxwalk.desc_code_1990[\"Persons\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Families\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Households\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Housing Units\"][\"Total\"]\n", "]\n", "input_vars" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:22.710638Z", "start_time": "2020-08-19T22:07:22.692955Z" } }, "outputs": [], "source": [ "input_var_tags = [\"pop\", \"fam\", \"hh\", \"hu\"]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:23.149076Z", "start_time": "2020-08-19T22:07:22.711943Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjtr2010gjtr2010gewt_popwt_famwt_hhwt_hu
0G100001090444072500423009999999999921G1000010043202100010432021.01.01.01.0
1G100001090444444300422009999999999926G1000010042202100010422021.01.01.01.0
2G100001090444612650422009999999219011G1000010041200100010412000.00.00.00.0
3G100001090444612650422009999999219011G1000010042201100010422011.01.01.01.0
4G100001090444612650422009999999219012G1000010042201100010422011.01.01.01.0
........................
1058G100005093552999990515009999999999923G1000050051500100050515001.01.01.01.0
1059G100005093552999990515009999999999924G1000050051500100050515001.01.01.01.0
1060G100005093552999990516009999999999921G1000050051702100050517021.01.01.01.0
1061G340033010610106000204029999999916014G1000030990100100039901000.00.00.00.0
1062NaNG1000050990000100059900000.00.00.00.0
\n", "

1063 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj tr2010gj tr2010ge \\\n", "0 G100001090444072500423009999999999921 G1000010043202 10001043202 \n", "1 G100001090444444300422009999999999926 G1000010042202 10001042202 \n", "2 G100001090444612650422009999999219011 G1000010041200 10001041200 \n", "3 G100001090444612650422009999999219011 G1000010042201 10001042201 \n", "4 G100001090444612650422009999999219012 G1000010042201 10001042201 \n", "... ... ... ... \n", "1058 G100005093552999990515009999999999923 G1000050051500 10005051500 \n", "1059 G100005093552999990515009999999999924 G1000050051500 10005051500 \n", "1060 G100005093552999990516009999999999921 G1000050051702 10005051702 \n", "1061 G340033010610106000204029999999916014 G1000030990100 10003990100 \n", "1062 NaN G1000050990000 10005990000 \n", "\n", " wt_pop wt_fam wt_hh wt_hu \n", "0 1.0 1.0 1.0 1.0 \n", "1 1.0 1.0 1.0 1.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 1.0 1.0 1.0 1.0 \n", "4 1.0 1.0 1.0 1.0 \n", "... ... ... ... ... \n", "1058 1.0 1.0 1.0 1.0 \n", "1059 1.0 1.0 1.0 1.0 \n", "1060 1.0 1.0 1.0 1.0 \n", "1061 0.0 0.0 0.0 0.0 \n", "1062 0.0 0.0 0.0 0.0 \n", "\n", "[1063 rows x 7 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset_state = \"10\"\n", "bgp1990_to_tr2010 = nhgisxwalk.GeoCrossWalk(\n", " base_xwalk,\n", " source_year=source_year,\n", " target_year=target_year,\n", " source_geo=\"bgp\",\n", " target_geo=\"tr\",\n", " base_source_table=base_source_file,\n", " supp_source_table=supp_source_file,\n", " input_var=input_vars,\n", " weight_var=input_var_tags,\n", " stfips=subset_state,\n", " keep_base=True,\n", " add_geoid=True\n", ")\n", "bgp1990_to_tr2010.xwalk" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-08-10T18:49:59.203300Z", "start_time": "2020-08-10T18:49:59.184356Z" } }, "source": [ "### Prepare a single data product with a `README.txt`" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { 
"end_time": "2020-08-19T22:07:23.168951Z", "start_time": "2020-08-19T22:07:23.150700Z" } }, "outputs": [], "source": [ "xwalk, xwalk_name = bgp1990_to_tr2010.xwalk, bgp1990_to_tr2010.xwalk_name\n", "xwalk_name_base = \"_\".join(xwalk_name.split(\"_\")[:-1])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:23.200102Z", "start_time": "2020-08-19T22:07:23.170495Z" } }, "outputs": [], "source": [ "out_data_dir = \"../../crosswalks/\"\n", "out_path = \"%s%s%s/%s\" % (out_data_dir, xwalk_name_base, \"_state\", xwalk_name)\n", "nhgisxwalk.prepare_data_product(xwalk, xwalk_name, out_path, remove=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read crosswalk from a `.zip` archive" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2020-08-19T22:07:23.234234Z", "start_time": "2020-08-19T22:07:23.201433Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjtr2010gjtr2010gewt_popwt_famwt_hhwt_hu
0G100001090444072500423009999999999921G1000010043202100010432021.01.01.01.0
1G100001090444444300422009999999999926G1000010042202100010422021.01.01.01.0
2G100001090444612650422009999999219011G1000010041200100010412000.00.00.00.0
3G100001090444612650422009999999219011G1000010042201100010422011.01.01.01.0
4G100001090444612650422009999999219012G1000010042201100010422011.01.01.01.0
........................
1058G100005093552999990515009999999999923G1000050051500100050515001.01.01.01.0
1059G100005093552999990515009999999999924G1000050051500100050515001.01.01.01.0
1060G100005093552999990516009999999999921G1000050051702100050517021.01.01.01.0
1061G340033010610106000204029999999916014G1000030990100100039901000.00.00.00.0
1062NaNG1000050990000100059900000.00.00.00.0
\n", "

1063 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj tr2010gj tr2010ge \\\n", "0 G100001090444072500423009999999999921 G1000010043202 10001043202 \n", "1 G100001090444444300422009999999999926 G1000010042202 10001042202 \n", "2 G100001090444612650422009999999219011 G1000010041200 10001041200 \n", "3 G100001090444612650422009999999219011 G1000010042201 10001042201 \n", "4 G100001090444612650422009999999219012 G1000010042201 10001042201 \n", "... ... ... ... \n", "1058 G100005093552999990515009999999999923 G1000050051500 10005051500 \n", "1059 G100005093552999990515009999999999924 G1000050051500 10005051500 \n", "1060 G100005093552999990516009999999999921 G1000050051702 10005051702 \n", "1061 G340033010610106000204029999999916014 G1000030990100 10003990100 \n", "1062 NaN G1000050990000 10005990000 \n", "\n", " wt_pop wt_fam wt_hh wt_hu \n", "0 1.0 1.0 1.0 1.0 \n", "1 1.0 1.0 1.0 1.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 1.0 1.0 1.0 1.0 \n", "4 1.0 1.0 1.0 1.0 \n", "... ... ... ... ... \n", "1058 1.0 1.0 1.0 1.0 \n", "1059 1.0 1.0 1.0 1.0 \n", "1060 1.0 1.0 1.0 1.0 \n", "1061 0.0 0.0 0.0 0.0 \n", "1062 0.0 0.0 0.0 0.0 \n", "\n", "[1063 rows x 7 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "in_path = \"%s%s%s\" % (out_data_dir, xwalk_name_base, \"_state/\")\n", "id_cols = [c for c in xwalk.columns if not c.startswith(\"wt\")]\n", "data_types = nhgisxwalk.str_types(id_cols)\n", "from_csv_kws = {\"path\": in_path, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "bgp1990_to_tr2010_df = nhgisxwalk.xwalk_df_from_csv(\n", " xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "bgp1990_to_tr2010_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------------------------------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2" }, "gist": { "data": { "description": "sample-workflow.ipynb", "public": true }, "id": 
"9f47e4ec2cc37bce83acf20abfca69d2" }, "kernelspec": { "display_name": "Python [conda env:nhgis] *", "language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }