{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Generate national and state-level crosswalks\n", "## 1990 block group parts to 2010 counties\n", "\n", "### NHGIS [block crosswalks](https://www.nhgis.org/user-resources/geographic-crosswalks)\n", "\n", "**James D. Gaboardi, 06/2020**" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:31:43.451105Z", "start_time": "2020-07-29T20:31:43.426413Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-07-29T16:31:43-04:00\n", "\n", "CPython 3.7.6\n", "IPython 7.15.0\n", "\n", "compiler : Clang 9.0.1 \n", "system : Darwin\n", "release : 19.6.0\n", "machine : x86_64\n", "processor : i386\n", "CPU cores : 8\n", "interpreter: 64bit\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:31:44.002560Z", "start_time": "2020-07-29T20:31:43.453062Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "watermark 2.0.2\n", "pandas 1.0.4\n", "numpy 1.18.5\n", "nhgisxwalk 0.0.6\n", "\n" ] } ], "source": [ "import nhgisxwalk\n", "import inspect\n", "import numpy\n", "import pandas\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source and target years for the crosswalk" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:31:44.025223Z", "start_time": "2020-07-29T20:31:44.005734Z" } }, "outputs": [], "source": [ "# Census vintages bounding the crosswalk; kept as strings because they are\n", "# interpolated with %s formatting.\n", "source_year = \"1990\"\n", "target_year = \"2010\"\n", "\n", "# GJOIN<year> identifier column labels, one per vintage\n", "gj_src = \"GJOIN%s\" % source_year\n", "gj_trg = \"GJOIN%s\" % target_year" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source-target building base" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": 
"2020-07-29T20:32:01.160049Z", "start_time": "2020-07-29T20:31:44.027008Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN1990GJOIN2010WEIGHTPAREA_VIA_BLK00
0G01000100201101AG010001002010020040.0007530.014284
1G01000100201101AG010001002010020050.0420200.109618
2G01000100201101AG010001002010020060.2621460.498133
3G01000100201101AG010001002010020160.2371870.218109
4G01000100201101AG010001002010020230.0990970.012864
\n", "
" ], "text/plain": [ " GJOIN1990 GJOIN2010 WEIGHT PAREA_VIA_BLK00\n", "0 G01000100201101A G01000100201002004 0.000753 0.014284\n", "1 G01000100201101A G01000100201002005 0.042020 0.109618\n", "2 G01000100201101A G01000100201002006 0.262146 0.498133\n", "3 G01000100201101A G01000100201002016 0.237187 0.218109\n", "4 G01000100201101A G01000100201002023 0.099097 0.012864" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dtype mapping for the two GJOIN columns, as returned by `nhgisxwalk.str_types`\n", "data_types = nhgisxwalk.str_types([gj_src, gj_trg])\n", "\n", "# Zipped base crosswalk for the chosen source/target years\n", "base_xwalk_name = \"nhgis_blk%s_blk%s_gj.zip\" % (source_year, target_year)\n", "base_xwalk_file = \"../../crosswalks/%s\" % base_xwalk_name\n", "\n", "base_xwalk = pandas.read_csv(base_xwalk_file, dtype=data_types)\n", "base_xwalk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source summary data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:32:01.179780Z", "start_time": "2020-07-29T20:32:01.161490Z" } }, "outputs": [], "source": [ "# Path of the source-year block table, relative to this notebook\n", "base_source_name = \"%s_block/%s_block.csv\" % (source_year, source_year)\n", "base_source_file = \"../../tabular_data/%s\" % base_source_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Source supplementary summary data (special case for 1990)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:32:01.198813Z", "start_time": "2020-07-29T20:32:01.182057Z" } }, "outputs": [], "source": [ "# Path of the supplementary source-year table, relative to this notebook\n", "supp_source_name = \"%s_blck_grp_598_103/%s_blck_grp_598_103.csv\" % (\n", "    source_year,\n", "    source_year,\n", ")\n", "supp_source_file = \"../../tabular_data/%s\" % supp_source_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convenience code shorthand/lookup" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:32:01.218175Z", "start_time": "2020-07-29T20:32:01.200662Z" } }, "outputs": [ { "data": { 
"text/plain": [ "{'block': 'blk',\n", " 'block group part': 'bgp',\n", " 'block group': 'bg',\n", " 'tract': 'tr',\n", " 'county': 'co'}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.valid_geo_shorthand(shorthand_name=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Instantiate an `nhgisxwalk.GeoCrossWalk` object\n", "##### see [nhgisxwalk.GeoCrossWalk](https://github.com/jGaboardi/nhgisxwalk/blob/92b4fe55de0a9c53d0315dcda8ec121faaf20aef/nhgisxwalk/geocrosswalk.py#L19) for full details" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:32:01.238774Z", "start_time": "2020-07-29T20:32:01.220999Z" } }, "outputs": [ { "data": { "text/plain": [ "{'Persons': {'Persons': 'Universe',\n", " 'NP1': 'Source code',\n", " 'ET1': 'NHGIS code',\n", " 'Total': 'ET1001'},\n", " 'Families': {'Families': 'Universe',\n", " 'NP2': 'Source code',\n", " 'EUD': 'NHGIS code',\n", " 'Total': 'EUD001'},\n", " 'Households': {'Households': 'Universe',\n", " 'NP3': 'Source code',\n", " 'EUO': 'NHGIS code',\n", " 'Total': 'EUO001'},\n", " 'Housing Units': {'Housing Units': 'Universe',\n", " 'NH1': 'Source code',\n", " 'ESA': 'NHGIS code',\n", " 'Total': 'ESA001'}}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nhgisxwalk.desc_code_1990" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:32:01.258729Z", "start_time": "2020-07-29T20:32:01.240560Z" } }, "outputs": [ { "data": { "text/plain": [ "['ET1001', 'EUD001', 'EUO001', 'ESA001']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# \"Total\" code of each universe in `nhgisxwalk.desc_code_1990`, in this order\n", "input_vars = [\n", "    nhgisxwalk.desc_code_1990[universe][\"Total\"]\n", "    for universe in (\"Persons\", \"Families\", \"Households\", \"Housing Units\")\n", "]\n", "input_vars" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:32:01.277089Z", "start_time": "2020-07-29T20:32:01.260111Z" } }, "outputs": [], "source": [ "# One tag per entry of `input_vars`; passed as `weight_var` when the crosswalk is built\n", "input_var_tags = [\"pop\", \"fam\", \"hh\", \"hu\"]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:35:35.280538Z", "start_time": "2020-07-29T20:32:01.278452Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjco2010gjco2010gewt_popwt_famwt_hhwt_hu
0G010001090171032200211039999999999922G0100010010011.01.01.01.0
1G010001090171032200211039999999999923G0100010010011.01.01.01.0
2G010001090171999990211039999999999921G0100010010011.01.01.01.0
3G010001090171999990211039999999999922G0100010010011.01.01.01.0
4G010001090171999990211039999999999923G0100010010011.01.01.01.0
........................
375950G560045093520999999512009999999999923G5600450560451.01.01.01.0
375951G560045093520999999512009999999999924G5600450560451.01.01.01.0
375952G560045093520999999512009999999999925G5600450560451.01.01.01.0
375953G560045093520999999512009999999999926G5600450560451.01.01.01.0
375954G560045093520999999512009999999999927G5600450560451.01.01.01.0
\n", "

375955 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj co2010gj co2010ge wt_pop \\\n", "0 G010001090171032200211039999999999922 G0100010 01001 1.0 \n", "1 G010001090171032200211039999999999923 G0100010 01001 1.0 \n", "2 G010001090171999990211039999999999921 G0100010 01001 1.0 \n", "3 G010001090171999990211039999999999922 G0100010 01001 1.0 \n", "4 G010001090171999990211039999999999923 G0100010 01001 1.0 \n", "... ... ... ... ... \n", "375950 G560045093520999999512009999999999923 G5600450 56045 1.0 \n", "375951 G560045093520999999512009999999999924 G5600450 56045 1.0 \n", "375952 G560045093520999999512009999999999925 G5600450 56045 1.0 \n", "375953 G560045093520999999512009999999999926 G5600450 56045 1.0 \n", "375954 G560045093520999999512009999999999927 G5600450 56045 1.0 \n", "\n", " wt_fam wt_hh wt_hu \n", "0 1.0 1.0 1.0 \n", "1 1.0 1.0 1.0 \n", "2 1.0 1.0 1.0 \n", "3 1.0 1.0 1.0 \n", "4 1.0 1.0 1.0 \n", "... ... ... ... \n", "375950 1.0 1.0 1.0 \n", "375951 1.0 1.0 1.0 \n", "375952 1.0 1.0 1.0 \n", "375953 1.0 1.0 1.0 \n", "375954 1.0 1.0 1.0 \n", "\n", "[375955 rows x 7 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bgp1990_to_co2010 = nhgisxwalk.GeoCrossWalk(\n", "    base_xwalk,\n", "    # endpoints: source geography/year and target geography/year\n", "    source_geo=\"bgp\",\n", "    source_year=source_year,\n", "    target_geo=\"co\",\n", "    target_year=target_year,\n", "    # source-year tabular inputs (base table plus the supplementary table)\n", "    base_source_table=base_source_file,\n", "    supp_source_table=supp_source_file,\n", "    # variables used for weighting and the tags that label them\n", "    input_var=input_vars,\n", "    weight_var=input_var_tags,\n", "    # output options\n", "    keep_base=False,\n", "    add_geoid=True,\n", ")\n", "\n", "# Drop the reference to the base crosswalk now that the object is built.\n", "# NOTE(review): re-running this cell alone raises NameError; re-read `base_xwalk` first.\n", "del base_xwalk\n", "\n", "bgp1990_to_co2010.xwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Write crosswalk to a `.csv`" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:35:36.969158Z", "start_time": "2020-07-29T20:35:35.282376Z" } }, "outputs": [], "source": [ "nat_dir = \"../../crosswalks/\"\n", "nhgisxwalk.xwalk_df_to_csv(\n", " dfkwds={\n", " \"df\": 
bgp1990_to_co2010.xwalk,\n", " \"xwalk_name\": bgp1990_to_co2010.xwalk_name\n", " },\n", " path=nat_dir\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Split by (target) state and write out" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:35:37.173270Z", "start_time": "2020-07-29T20:35:36.970560Z" } }, "outputs": [ { "data": { "text/plain": [ "['01',\n", " '02',\n", " '04',\n", " '05',\n", " '06',\n", " '08',\n", " '09',\n", " '10',\n", " '11',\n", " '12',\n", " '13',\n", " '15',\n", " '16',\n", " '17',\n", " '18',\n", " '19',\n", " '20',\n", " '21',\n", " '22',\n", " '23',\n", " '24',\n", " '25',\n", " '26',\n", " '27',\n", " '28',\n", " '29',\n", " '30',\n", " '31',\n", " '32',\n", " '33',\n", " '34',\n", " '35',\n", " '36',\n", " '37',\n", " '38',\n", " '39',\n", " '40',\n", " '41',\n", " '42',\n", " '44',\n", " '45',\n", " '46',\n", " '47',\n", " '48',\n", " '49',\n", " '50',\n", " '51',\n", " '53',\n", " '54',\n", " '55',\n", " '56',\n", " 'nan']" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Unique state FIPS codes found in the target ID column of the crosswalk.\n", "# NOTE(review): the saved output ends with a 'nan' code; confirm that a\n", "# separate 'nan' state file is intended when the loop below writes them out.\n", "stfips_codes = nhgisxwalk.extract_unique_stfips(\n", "    df=bgp1990_to_co2010.xwalk, endpoint=bgp1990_to_co2010.target\n", ")\n", "stfips_codes = sorted(stfips_codes)\n", "stfips_codes" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2020-07-29T20:35:47.663279Z", "start_time": "2020-07-29T20:35:37.175005Z" } }, "outputs": [], "source": [ "state_dir = nat_dir + \"nhgis_bgp1990_co2010_state/\"\n", "\n", "# Loop invariants: crosswalk name plus the source and target ID columns.\n", "# `source` must come from `.source`; binding it to `.target` made\n", "# `sort_by` order on the target column twice and ignore the source IDs.\n", "xwalk_name = bgp1990_to_co2010.xwalk_name\n", "source, target = bgp1990_to_co2010.source, bgp1990_to_co2010.target\n", "\n", "for stfips in stfips_codes:\n", "    # Subset for this state code, selected on the target ID column\n", "    _stxwalk = nhgisxwalk.extract_state(\n", "        bgp1990_to_co2010.xwalk,\n", "        stfips,\n", "        xwalk_name,\n", "        target,\n", "        sort_by=[source, target],\n", "    )\n", "    dfkwds = {\"df\": _stxwalk, \"stfips\": stfips, \"xwalk_name\": xwalk_name}\n", "    nhgisxwalk.xwalk_df_to_csv(dfkwds=dfkwds, path=state_dir)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------------------------------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2" }, "gist": { "data": { "description": "sample-workflow.ipynb", "public": true }, "id": "9f47e4ec2cc37bce83acf20abfca69d2" }, "kernelspec": { "display_name": "Python [conda env:nhgis]", "language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }