{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Generating Subsets for testing: 1990\n", "## blocks, block groups parts, and blocks\n", "\n", "\n", "1. From a national crosswalk: \n", " 1. Create target state-level subsets for NHGIS base crosswalks\n", " 1. Create target state-level subsets for NHGIS base tabular data\n", " 1. Record unit tests values for posterity\n", "\n", "\n", "\n", "**This is currently only intended for use with block-level data as base units.**\n", "\n", "\n", "**James Gaboardi** **(), 2020-05**" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:08:18.097159Z", "start_time": "2020-06-13T16:08:18.063400Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-06-13T12:08:18-04:00\n", "\n", "CPython 3.7.6\n", "IPython 7.15.0\n", "\n", "compiler : Clang 9.0.1 \n", "system : Darwin\n", "release : 19.5.0\n", "machine : x86_64\n", "processor : i386\n", "CPU cores : 8\n", "interpreter: 64bit\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:08:18.655723Z", "start_time": "2020-06-13T16:08:18.100806Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "watermark 2.0.2\n", "numpy 1.18.5\n", "nhgisxwalk 0.0.2\n", "pandas 1.0.4\n", "\n" ] } ], "source": [ "import inspect\n", "import nhgisxwalk\n", "import numpy\n", "import pandas\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the state (for subsetting), source & target, and year & geography" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:08:18.681204Z", "start_time": "2020-06-13T16:08:18.660642Z" } }, "outputs": [], "source": [ "subset_state = \"10\" # Delaware\n", "#subset_state = \"11\" # DC\n", "#subset_state = \"15\" # Hawaii\n", "source_year, target_year = \"1990\", \"2010\"\n", "gj_src, gj_trg = \"GJOIN%s\"%source_year, \"GJOIN%s\"%target_year" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the base-level crosswalk file name" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:08:18.712195Z", "start_time": "2020-06-13T16:08:18.684191Z" } }, "outputs": [ { "data": { "text/plain": [ "'../../crosswalks/nhgis_blk1990_blk2010_gj.csv.zip'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "base_xwalk_name = \"nhgis_blk%s_blk%s_gj.csv.zip\" % (source_year, target_year)\n", "base_xwalk_file = \"../../crosswalks/%s\" % base_xwalk_name\n", "base_xwalk_file" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the base (source) summary file name" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:08:18.736919Z", "start_time": "2020-06-13T16:08:18.714332Z" } }, "outputs": [ { "data": { "text/plain": [ "'../../tabular_data/1990_block/1990_block.csv'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "base_source_name = \"%s_block/%s_block.csv\" % (source_year, source_year)\n", "base_source_file = \"../../tabular_data/%s\" % base_source_name\n", "base_source_file" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the supplementary summary file name" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:08:18.759514Z", "start_time": "2020-06-13T16:08:18.739423Z" } }, "outputs": [ { "data": { "text/plain": [ "'../../tabular_data/1990_blck_grp_598_103/1990_blck_grp_598_103.csv'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "supp_source_name = \"%s_blck_grp_598_103/%s_blck_grp_598_103.csv\" % (\n", " source_year, source_year\n", ")\n", "supp_source_file = \"../../tabular_data/%s\" % supp_source_name\n", "supp_source_file" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read in the national the base-level crosswalk" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:08:38.509144Z", "start_time": "2020-06-13T16:08:18.761736Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/the-gaboardi/miniconda3/envs/nhgis/lib/python3.7/site-packages/numpy/lib/arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", " mask |= (ar1 == a)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN1990GJOIN2010WEIGHTPAREA_VIA_BLK00
0NaNG010003001070140850.00.0
1NaNG010003001070140860.00.0
2NaNG010003001070140890.00.0
3NaNG010003001070140910.00.0
4NaNG010003001070141090.00.0
\n", "
" ], "text/plain": [ " GJOIN1990 GJOIN2010 WEIGHT PAREA_VIA_BLK00\n", "0 NaN G01000300107014085 0.0 0.0\n", "1 NaN G01000300107014086 0.0 0.0\n", "2 NaN G01000300107014089 0.0 0.0\n", "3 NaN G01000300107014091 0.0 0.0\n", "4 NaN G01000300107014109 0.0 0.0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_types = nhgisxwalk.str_types([gj_src, gj_trg])\n", "base_xwalk = pandas.read_csv(base_xwalk_file, index_col=0, dtype=data_types)\n", "base_xwalk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create the state subset of the base-level crosswalk (for use in GH testing)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:08:41.774582Z", "start_time": "2020-06-13T16:08:38.512453Z" } }, "outputs": [], "source": [ "ss_base = base_xwalk[\n", " base_xwalk[\"GJOIN2010\"].map(lambda x: x[1:3] == subset_state)\n", "].copy()\n", "ss_base.reset_index(drop=True, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Declare input variable\n", "**not needed for creating a subset perse, but should do regardless**" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:08:41.798065Z", "start_time": "2020-06-13T16:08:41.779252Z" } }, "outputs": [], "source": [ "input_vars = [\n", " nhgisxwalk.desc_code_1990[\"Persons\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Families\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Households\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Housing Units\"][\"Total\"]\n", "]\n", "input_var_tags = [\"pop\", \"fam\", \"hh\", \"hu\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Generate the desired crosswalk and subset down to the target state" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:25.339479Z", "start_time": "2020-06-13T16:08:41.801063Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjtrt2010gjtrt2010gewt_popwt_famwt_hhwt_hu
80076G100001090444072500423009999999999921G1000010043202100010432021.01.01.01.0
80077G100001090444444300422009999999999926G1000010042202100010422021.01.01.01.0
80078G100001090444612650422009999999219011G1000010041200100010412000.00.00.00.0
80079G100001090444612650422009999999219011G1000010042201100010422011.01.01.01.0
80080G100001090444612650422009999999219012G1000010042201100010422011.01.01.01.0
........................
81134G100005093552999990515009999999999923G1000050051500100050515001.01.01.01.0
81135G100005093552999990515009999999999924G1000050051500100050515001.01.01.01.0
81136G100005093552999990516009999999999921G1000050051702100050517021.01.01.01.0
284765G340033010610106000204029999999916014G1000030990100100039901000.00.00.00.0
507805NaNG1000050990000100059900000.00.00.00.0
\n", "

1063 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj trt2010gj trt2010ge \\\n", "80076 G100001090444072500423009999999999921 G1000010043202 10001043202 \n", "80077 G100001090444444300422009999999999926 G1000010042202 10001042202 \n", "80078 G100001090444612650422009999999219011 G1000010041200 10001041200 \n", "80079 G100001090444612650422009999999219011 G1000010042201 10001042201 \n", "80080 G100001090444612650422009999999219012 G1000010042201 10001042201 \n", "... ... ... ... \n", "81134 G100005093552999990515009999999999923 G1000050051500 10005051500 \n", "81135 G100005093552999990515009999999999924 G1000050051500 10005051500 \n", "81136 G100005093552999990516009999999999921 G1000050051702 10005051702 \n", "284765 G340033010610106000204029999999916014 G1000030990100 10003990100 \n", "507805 NaN G1000050990000 10005990000 \n", "\n", " wt_pop wt_fam wt_hh wt_hu \n", "80076 1.0 1.0 1.0 1.0 \n", "80077 1.0 1.0 1.0 1.0 \n", "80078 0.0 0.0 0.0 0.0 \n", "80079 1.0 1.0 1.0 1.0 \n", "80080 1.0 1.0 1.0 1.0 \n", "... ... ... ... ... \n", "81134 1.0 1.0 1.0 1.0 \n", "81135 1.0 1.0 1.0 1.0 \n", "81136 1.0 1.0 1.0 1.0 \n", "284765 0.0 0.0 0.0 0.0 \n", "507805 0.0 0.0 0.0 0.0 \n", "\n", "[1063 rows x 7 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_bgp1990trt2010 = nhgisxwalk.GeoCrossWalk(\n", " base_xwalk,\n", " source_year=source_year,\n", " target_year=target_year,\n", " source_geo=\"bgp\",\n", " target_geo=\"trt\",\n", " base_source_table=base_source_file,\n", " supp_source_table=supp_source_file,\n", " input_var=input_vars,\n", " weight_var=input_var_tags,\n", " keep_base=True,\n", " add_geoid=True,\n", " stfips=subset_state\n", ")\n", "del base_xwalk\n", "state_bgp1990trt2010.xwalk" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:25.374825Z", "start_time": "2020-06-13T16:12:25.341380Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjtrt2010gjtrt2010gewt_popwt_famwt_hhwt_hu
80076G100001090444072500423009999999999921G1000010043202100010432021.01.01.01.0
80077G100001090444444300422009999999999926G1000010042202100010422021.01.01.01.0
80078G100001090444612650422009999999219011G1000010041200100010412000.00.00.00.0
80079G100001090444612650422009999999219011G1000010042201100010422011.01.01.01.0
80080G100001090444612650422009999999219012G1000010042201100010422011.01.01.01.0
........................
81134G100005093552999990515009999999999923G1000050051500100050515001.01.01.01.0
81135G100005093552999990515009999999999924G1000050051500100050515001.01.01.01.0
81136G100005093552999990516009999999999921G1000050051702100050517021.01.01.01.0
284765G340033010610106000204029999999916014G1000030990100100039901000.00.00.00.0
507805NaNG1000050990000100059900000.00.00.00.0
\n", "

1063 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj trt2010gj trt2010ge \\\n", "80076 G100001090444072500423009999999999921 G1000010043202 10001043202 \n", "80077 G100001090444444300422009999999999926 G1000010042202 10001042202 \n", "80078 G100001090444612650422009999999219011 G1000010041200 10001041200 \n", "80079 G100001090444612650422009999999219011 G1000010042201 10001042201 \n", "80080 G100001090444612650422009999999219012 G1000010042201 10001042201 \n", "... ... ... ... \n", "81134 G100005093552999990515009999999999923 G1000050051500 10005051500 \n", "81135 G100005093552999990515009999999999924 G1000050051500 10005051500 \n", "81136 G100005093552999990516009999999999921 G1000050051702 10005051702 \n", "284765 G340033010610106000204029999999916014 G1000030990100 10003990100 \n", "507805 NaN G1000050990000 10005990000 \n", "\n", " wt_pop wt_fam wt_hh wt_hu \n", "80076 1.0 1.0 1.0 1.0 \n", "80077 1.0 1.0 1.0 1.0 \n", "80078 0.0 0.0 0.0 0.0 \n", "80079 1.0 1.0 1.0 1.0 \n", "80080 1.0 1.0 1.0 1.0 \n", "... ... ... ... ... \n", "81134 1.0 1.0 1.0 1.0 \n", "81135 1.0 1.0 1.0 1.0 \n", "81136 1.0 1.0 1.0 1.0 \n", "284765 0.0 0.0 0.0 0.0 \n", "507805 0.0 0.0 0.0 0.0 \n", "\n", "[1063 rows x 7 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_bgp1990trt2010.xwalk.drop_duplicates(subset=[\"bgp1990gj\", \"trt2010gj\"])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:25.396155Z", "start_time": "2020-06-13T16:12:25.376480Z" } }, "outputs": [ { "data": { "text/plain": [ "777" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_bgp1990trt2010.xwalk[\"bgp1990gj\"].nunique()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:25.417031Z", "start_time": "2020-06-13T16:12:25.397911Z" } }, "outputs": [ { "data": { "text/plain": [ "218" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_bgp1990trt2010.xwalk[\"trt2010gj\"].nunique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Write out the state subset of the base-level crosswalk (for use in GH testing)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:25.657736Z", "start_time": "2020-06-13T16:12:25.418416Z" } }, "outputs": [], "source": [ "ss_base.to_csv(\"../testing_data_subsets/%s\" % base_xwalk_name)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:25.685418Z", "start_time": "2020-06-13T16:12:25.661110Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN1990GJOIN2010WEIGHTPAREA_VIA_BLK00
0NaNG100001004320210780.00.0
1NaNG100001004320230140.00.0
2NaNG100001004320230150.00.0
3NaNG100001099000000110.00.0
4NaNG100001099000000120.00.0
...............
38292G10000500519289G100005005190021251.01.0
38293G34003300204401AG100003099010000070.00.0
38294G34003300204418G100003099010000070.00.0
38295G34003300204419G100003099010000070.00.0
38296G34003300204420G100003099010000070.00.0
\n", "

38297 rows × 4 columns

\n", "
" ], "text/plain": [ " GJOIN1990 GJOIN2010 WEIGHT PAREA_VIA_BLK00\n", "0 NaN G10000100432021078 0.0 0.0\n", "1 NaN G10000100432023014 0.0 0.0\n", "2 NaN G10000100432023015 0.0 0.0\n", "3 NaN G10000109900000011 0.0 0.0\n", "4 NaN G10000109900000012 0.0 0.0\n", "... ... ... ... ...\n", "38292 G10000500519289 G10000500519002125 1.0 1.0\n", "38293 G34003300204401A G10000309901000007 0.0 0.0\n", "38294 G34003300204418 G10000309901000007 0.0 0.0\n", "38295 G34003300204419 G10000309901000007 0.0 0.0\n", "38296 G34003300204420 G10000309901000007 0.0 0.0\n", "\n", "[38297 rows x 4 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ss_base" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Record, subset, and write out the 1990 BLKs (sf1) needed to create this subset" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:25.711134Z", "start_time": "2020-06-13T16:12:25.687027Z" } }, "outputs": [ { "data": { "text/plain": [ "149 G10000100401101\n", "150 G10000100401102\n", "151 G10000100401102\n", "152 G10000100401102\n", "153 G10000100401103\n", " ... \n", "38292 G10000500519289\n", "38293 G34003300204401A\n", "38294 G34003300204418\n", "38295 G34003300204419\n", "38296 G34003300204420\n", "Name: GJOIN1990, Length: 38148, dtype: object" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "blk1990 = ss_base[~ss_base[\"GJOIN1990\"].isna()][\"GJOIN1990\"]\n", "blk1990" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:25.733615Z", "start_time": "2020-06-13T16:12:25.712781Z" } }, "outputs": [ { "data": { "text/plain": [ "(38148,)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "blk1990.shape" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:25.758780Z", "start_time": "2020-06-13T16:12:25.736130Z" } }, "outputs": [ { "data": { "text/plain": [ "15538" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "blk1990.nunique()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:39.342441Z", "start_time": "2020-06-13T16:12:25.760405Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GISJOINYEARANRCAAIANHHARES_ONLYATRUSTARES_TRSTABLOCKABLCK_GRPATRACTA...STATESTATEAURBRURALAURB_AREAACD103AANPSADPIET1001EUD001EUO001ESA001
0G01000100201101A1990999999999999999101A10201...Alabama0115240NaNBlock 101A33291107112
1G01000100201101B1990999999999999999101B10201...Alabama0129999NaNBlock 101B14367
2G01000100201102A1990999999999999999102A10201...Alabama0115240NaNBlock 102A248748889
3G01000100201103199099999999999999910310201...Alabama0115240NaNBlock 10349151516
4G01000100201104199099999999999999910410201...Alabama0115240NaNBlock 10412344
..................................................................
4934102G56004509513577B1990999999999999999577B59513...Wyoming5629999NaNBlock 577B5133
4934103G56004509513578199099999999999999957859513...Wyoming5619999NaNBlock 57859202122
4934104G56004509513579199099999999999999957959513...Wyoming5619999NaNBlock 5793081011
4934105G56004509513580199099999999999999958059513...Wyoming5629999NaNBlock 5803591111
4934106G56004509513597199099999999999999959759513...Wyoming5629999NaNBlock 59739101313
\n", "

4934107 rows × 30 columns

\n", "
" ], "text/plain": [ " GISJOIN YEAR ANRCA AIANHHA RES_ONLYA TRUSTA RES_TRSTA \\\n", "0 G01000100201101A 1990 99 9999 9999 9999 9 \n", "1 G01000100201101B 1990 99 9999 9999 9999 9 \n", "2 G01000100201102A 1990 99 9999 9999 9999 9 \n", "3 G01000100201103 1990 99 9999 9999 9999 9 \n", "4 G01000100201104 1990 99 9999 9999 9999 9 \n", "... ... ... ... ... ... ... ... \n", "4934102 G56004509513577B 1990 99 9999 9999 9999 9 \n", "4934103 G56004509513578 1990 99 9999 9999 9999 9 \n", "4934104 G56004509513579 1990 99 9999 9999 9999 9 \n", "4934105 G56004509513580 1990 99 9999 9999 9999 9 \n", "4934106 G56004509513597 1990 99 9999 9999 9999 9 \n", "\n", " BLOCKA BLCK_GRPA TRACTA ... STATE STATEA URBRURALA URB_AREAA \\\n", "0 101A 1 0201 ... Alabama 01 1 5240 \n", "1 101B 1 0201 ... Alabama 01 2 9999 \n", "2 102A 1 0201 ... Alabama 01 1 5240 \n", "3 103 1 0201 ... Alabama 01 1 5240 \n", "4 104 1 0201 ... Alabama 01 1 5240 \n", "... ... ... ... ... ... ... ... ... \n", "4934102 577B 5 9513 ... Wyoming 56 2 9999 \n", "4934103 578 5 9513 ... Wyoming 56 1 9999 \n", "4934104 579 5 9513 ... Wyoming 56 1 9999 \n", "4934105 580 5 9513 ... Wyoming 56 2 9999 \n", "4934106 597 5 9513 ... Wyoming 56 2 9999 \n", "\n", " CD103A ANPSADPI ET1001 EUD001 EUO001 ESA001 \n", "0 NaN Block 101A 332 91 107 112 \n", "1 NaN Block 101B 14 3 6 7 \n", "2 NaN Block 102A 248 74 88 89 \n", "3 NaN Block 103 49 15 15 16 \n", "4 NaN Block 104 12 3 4 4 \n", "... ... ... ... ... ... ... \n", "4934102 NaN Block 577B 5 1 3 3 \n", "4934103 NaN Block 578 59 20 21 22 \n", "4934104 NaN Block 579 30 8 10 11 \n", "4934105 NaN Block 580 35 9 11 11 \n", "4934106 NaN Block 597 39 10 13 13 \n", "\n", "[4934107 rows x 30 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read in base source file\n", "base_source_df = pandas.read_csv(base_source_file, dtype=str)\n", "base_source_df" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:39.679913Z", "start_time": "2020-06-13T16:12:39.344164Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GISJOINYEARANRCAAIANHHARES_ONLYATRUSTARES_TRSTABLOCKABLCK_GRPATRACTA...STATESTATEAURBRURALAURB_AREAACD103AANPSADPIET1001EUD001EUO001ESA001
0G10000100401101199099999999999999910110401...Delaware1029999NaNBlock 10124689
1G10000100401102199099999999999999910210401...Delaware1029999NaNBlock 102145395058
2G10000100401103199099999999999999910310401...Delaware1029999NaNBlock 10375232729
3G10000100401104199099999999999999910410401...Delaware1029999NaNBlock 10469192122
4G10000100401105199099999999999999910510401...Delaware1029999NaNBlock 1052023
..................................................................
11935G10000500519289199099999999999999928920519...Delaware1029999NaNBlock 2898166
11936G34003300204401A1990999999999999999401A40204...New Jersey3419160NaNBlock 401A122315152
11937G34003300204418199099999999999999941840204...New Jersey3419160NaNBlock 41886253536
11938G34003300204419199099999999999999941940204...New Jersey3419160NaNBlock 41920750114123
11939G34003300204420199099999999999999942040204...New Jersey3419160NaNBlock 420101195356
\n", "

11940 rows × 30 columns

\n", "
" ], "text/plain": [ " GISJOIN YEAR ANRCA AIANHHA RES_ONLYA TRUSTA RES_TRSTA BLOCKA \\\n", "0 G10000100401101 1990 99 9999 9999 9999 9 101 \n", "1 G10000100401102 1990 99 9999 9999 9999 9 102 \n", "2 G10000100401103 1990 99 9999 9999 9999 9 103 \n", "3 G10000100401104 1990 99 9999 9999 9999 9 104 \n", "4 G10000100401105 1990 99 9999 9999 9999 9 105 \n", "... ... ... ... ... ... ... ... ... \n", "11935 G10000500519289 1990 99 9999 9999 9999 9 289 \n", "11936 G34003300204401A 1990 99 9999 9999 9999 9 401A \n", "11937 G34003300204418 1990 99 9999 9999 9999 9 418 \n", "11938 G34003300204419 1990 99 9999 9999 9999 9 419 \n", "11939 G34003300204420 1990 99 9999 9999 9999 9 420 \n", "\n", " BLCK_GRPA TRACTA ... STATE STATEA URBRURALA URB_AREAA CD103A \\\n", "0 1 0401 ... Delaware 10 2 9999 NaN \n", "1 1 0401 ... Delaware 10 2 9999 NaN \n", "2 1 0401 ... Delaware 10 2 9999 NaN \n", "3 1 0401 ... Delaware 10 2 9999 NaN \n", "4 1 0401 ... Delaware 10 2 9999 NaN \n", "... ... ... ... ... ... ... ... ... \n", "11935 2 0519 ... Delaware 10 2 9999 NaN \n", "11936 4 0204 ... New Jersey 34 1 9160 NaN \n", "11937 4 0204 ... New Jersey 34 1 9160 NaN \n", "11938 4 0204 ... New Jersey 34 1 9160 NaN \n", "11939 4 0204 ... New Jersey 34 1 9160 NaN \n", "\n", " ANPSADPI ET1001 EUD001 EUO001 ESA001 \n", "0 Block 101 24 6 8 9 \n", "1 Block 102 145 39 50 58 \n", "2 Block 103 75 23 27 29 \n", "3 Block 104 69 19 21 22 \n", "4 Block 105 2 0 2 3 \n", "... ... ... ... ... ... \n", "11935 Block 289 8 1 6 6 \n", "11936 Block 401A 122 31 51 52 \n", "11937 Block 418 86 25 35 36 \n", "11938 Block 419 207 50 114 123 \n", "11939 Block 420 101 19 53 56 \n", "\n", "[11940 rows x 30 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "base_source_df = base_source_df[base_source_df[\"GISJOIN\"].isin(blk1990.unique())]\n", "base_source_df.reset_index(drop=True, inplace=True)\n", "base_source_df" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:39.778199Z", "start_time": "2020-06-13T16:12:39.681692Z" } }, "outputs": [], "source": [ "base_source_df.to_csv(\"../testing_data_subsets/%s_block.csv.zip\" % source_year)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Record, subset, and write out the 1990 BGPs (sf1) needed to create this subset" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:39.802515Z", "start_time": "2020-06-13T16:12:39.779713Z" } }, "outputs": [ { "data": { "text/plain": [ "80076 G100001090444072500423009999999999921\n", "80077 G100001090444444300422009999999999926\n", "80078 G100001090444612650422009999999219011\n", "80079 G100001090444612650422009999999219011\n", "80080 G100001090444612650422009999999219012\n", " ... \n", "81133 G100005093552999990515009999999999922\n", "81134 G100005093552999990515009999999999923\n", "81135 G100005093552999990515009999999999924\n", "81136 G100005093552999990516009999999999921\n", "284765 G340033010610106000204029999999916014\n", "Name: bgp1990gj, Length: 1062, dtype: object" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bgp1990 = state_bgp1990trt2010.xwalk[\n", " ~state_bgp1990trt2010.xwalk[\"bgp1990gj\"].isna()\n", "][\"bgp1990gj\"]\n", "bgp1990" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:41.115535Z", "start_time": "2020-06-13T16:12:39.808175Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GISJOINYEARANRCAAIANHHAIANHHARES_ONLYATRUSTARES_TRSTABLOCKABLCK_GRPA...STATEAURBRURALAURB_AREAURB_AREAACD103AANPSADPIET1001EUD001EUO001ESA001
0G0100010901710322002119999999999922199099NaN9999999999999NaN2...012NaN9999NaNBG 2 (pt.)402105144167
1G0100010901710322002119999999999923199099NaN9999999999999NaN3...012NaN9999NaNBG 3 (pt.)279638896
2G0100010901719999902119999999999921199099NaN9999999999999NaN1...012NaN9999NaNBG 1611150179213
3G0100010901719999902119999999999922199099NaN9999999999999NaN2...012NaN9999NaNBG 2 (pt.)1244318410444
4G0100010901719999902119999999999923199099NaN9999999999999NaN3...012NaN9999NaNBG 3 (pt.)447126166197
..................................................................
367248G5600450935209999995129999999999923199099NaN9999999999999NaN3...562NaN9999NaNBG 344121321
367249G5600450935209999995129999999999924199099NaN9999999999999NaN4...562NaN9999NaNBG 442121822
367250G5600450935209999995129999999999925199099NaN9999999999999NaN5...562NaN9999NaNBG 5296915
367251G5600450935209999995129999999999926199099NaN9999999999999NaN6...562NaN9999NaNBG 6 (pt.)3081015
367252G5600450935209999995129999999999927199099NaN9999999999999NaN7...562NaN9999NaNBG 7 (pt.)308911
\n", "

367253 rows × 34 columns

\n", "
" ], "text/plain": [ " GISJOIN YEAR ANRCA AIANHH AIANHHA \\\n", "0 G0100010901710322002119999999999922 1990 99 NaN 9999 \n", "1 G0100010901710322002119999999999923 1990 99 NaN 9999 \n", "2 G0100010901719999902119999999999921 1990 99 NaN 9999 \n", "3 G0100010901719999902119999999999922 1990 99 NaN 9999 \n", "4 G0100010901719999902119999999999923 1990 99 NaN 9999 \n", "... ... ... ... ... ... \n", "367248 G5600450935209999995129999999999923 1990 99 NaN 9999 \n", "367249 G5600450935209999995129999999999924 1990 99 NaN 9999 \n", "367250 G5600450935209999995129999999999925 1990 99 NaN 9999 \n", "367251 G5600450935209999995129999999999926 1990 99 NaN 9999 \n", "367252 G5600450935209999995129999999999927 1990 99 NaN 9999 \n", "\n", " RES_ONLYA TRUSTA RES_TRSTA BLOCKA BLCK_GRPA ... STATEA URBRURALA \\\n", "0 9999 9999 9 NaN 2 ... 01 2 \n", "1 9999 9999 9 NaN 3 ... 01 2 \n", "2 9999 9999 9 NaN 1 ... 01 2 \n", "3 9999 9999 9 NaN 2 ... 01 2 \n", "4 9999 9999 9 NaN 3 ... 01 2 \n", "... ... ... ... ... ... ... ... ... \n", "367248 9999 9999 9 NaN 3 ... 56 2 \n", "367249 9999 9999 9 NaN 4 ... 56 2 \n", "367250 9999 9999 9 NaN 5 ... 56 2 \n", "367251 9999 9999 9 NaN 6 ... 56 2 \n", "367252 9999 9999 9 NaN 7 ... 56 2 \n", "\n", " URB_AREA URB_AREAA CD103A ANPSADPI ET1001 EUD001 EUO001 ESA001 \n", "0 NaN 9999 NaN BG 2 (pt.) 402 105 144 167 \n", "1 NaN 9999 NaN BG 3 (pt.) 279 63 88 96 \n", "2 NaN 9999 NaN BG 1 611 150 179 213 \n", "3 NaN 9999 NaN BG 2 (pt.) 1244 318 410 444 \n", "4 NaN 9999 NaN BG 3 (pt.) 447 126 166 197 \n", "... ... ... ... ... ... ... ... ... \n", "367248 NaN 9999 NaN BG 3 44 12 13 21 \n", "367249 NaN 9999 NaN BG 4 42 12 18 22 \n", "367250 NaN 9999 NaN BG 5 29 6 9 15 \n", "367251 NaN 9999 NaN BG 6 (pt.) 30 8 10 15 \n", "367252 NaN 9999 NaN BG 7 (pt.) 30 8 9 11 \n", "\n", "[367253 rows x 34 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read in supplement source file\n", "supp_source_df = pandas.read_csv(supp_source_file, dtype=str)\n", "supp_source_df" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:43.372607Z", "start_time": "2020-06-13T16:12:41.117905Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GISJOINYEARANRCAAIANHHAIANHHARES_ONLYATRUSTARES_TRSTABLOCKABLCK_GRPA...URBRURALAURB_AREAURB_AREAACD103AANPSADPIET1001EUD001EUO001ESA001__GISJOIN
0G0100010901710322002119999999999922199099NaN9999999999999NaN2...2NaN9999NaNBG 2 (pt.)402105144167G010001090171032200211039999999999922
1G0100010901710322002119999999999923199099NaN9999999999999NaN3...2NaN9999NaNBG 3 (pt.)279638896G010001090171032200211039999999999923
2G0100010901719999902119999999999921199099NaN9999999999999NaN1...2NaN9999NaNBG 1611150179213G010001090171999990211039999999999921
3G0100010901719999902119999999999922199099NaN9999999999999NaN2...2NaN9999NaNBG 2 (pt.)1244318410444G010001090171999990211039999999999922
4G0100010901719999902119999999999923199099NaN9999999999999NaN3...2NaN9999NaNBG 3 (pt.)447126166197G010001090171999990211039999999999923
..................................................................
367248G5600450935209999995129999999999923199099NaN9999999999999NaN3...2NaN9999NaNBG 344121321G560045093520999999512009999999999923
367249G5600450935209999995129999999999924199099NaN9999999999999NaN4...2NaN9999NaNBG 442121822G560045093520999999512009999999999924
367250G5600450935209999995129999999999925199099NaN9999999999999NaN5...2NaN9999NaNBG 5296915G560045093520999999512009999999999925
367251G5600450935209999995129999999999926199099NaN9999999999999NaN6...2NaN9999NaNBG 6 (pt.)3081015G560045093520999999512009999999999926
367252G5600450935209999995129999999999927199099NaN9999999999999NaN7...2NaN9999NaNBG 7 (pt.)308911G560045093520999999512009999999999927
\n", "

367253 rows × 35 columns

\n", "
" ], "text/plain": [ " GISJOIN YEAR ANRCA AIANHH AIANHHA \\\n", "0 G0100010901710322002119999999999922 1990 99 NaN 9999 \n", "1 G0100010901710322002119999999999923 1990 99 NaN 9999 \n", "2 G0100010901719999902119999999999921 1990 99 NaN 9999 \n", "3 G0100010901719999902119999999999922 1990 99 NaN 9999 \n", "4 G0100010901719999902119999999999923 1990 99 NaN 9999 \n", "... ... ... ... ... ... \n", "367248 G5600450935209999995129999999999923 1990 99 NaN 9999 \n", "367249 G5600450935209999995129999999999924 1990 99 NaN 9999 \n", "367250 G5600450935209999995129999999999925 1990 99 NaN 9999 \n", "367251 G5600450935209999995129999999999926 1990 99 NaN 9999 \n", "367252 G5600450935209999995129999999999927 1990 99 NaN 9999 \n", "\n", " RES_ONLYA TRUSTA RES_TRSTA BLOCKA BLCK_GRPA ... URBRURALA URB_AREA \\\n", "0 9999 9999 9 NaN 2 ... 2 NaN \n", "1 9999 9999 9 NaN 3 ... 2 NaN \n", "2 9999 9999 9 NaN 1 ... 2 NaN \n", "3 9999 9999 9 NaN 2 ... 2 NaN \n", "4 9999 9999 9 NaN 3 ... 2 NaN \n", "... ... ... ... ... ... ... ... ... \n", "367248 9999 9999 9 NaN 3 ... 2 NaN \n", "367249 9999 9999 9 NaN 4 ... 2 NaN \n", "367250 9999 9999 9 NaN 5 ... 2 NaN \n", "367251 9999 9999 9 NaN 6 ... 2 NaN \n", "367252 9999 9999 9 NaN 7 ... 2 NaN \n", "\n", " URB_AREAA CD103A ANPSADPI ET1001 EUD001 EUO001 ESA001 \\\n", "0 9999 NaN BG 2 (pt.) 402 105 144 167 \n", "1 9999 NaN BG 3 (pt.) 279 63 88 96 \n", "2 9999 NaN BG 1 611 150 179 213 \n", "3 9999 NaN BG 2 (pt.) 1244 318 410 444 \n", "4 9999 NaN BG 3 (pt.) 447 126 166 197 \n", "... ... ... ... ... ... ... ... \n", "367248 9999 NaN BG 3 44 12 13 21 \n", "367249 9999 NaN BG 4 42 12 18 22 \n", "367250 9999 NaN BG 5 29 6 9 15 \n", "367251 9999 NaN BG 6 (pt.) 30 8 10 15 \n", "367252 9999 NaN BG 7 (pt.) 30 8 9 11 \n", "\n", " __GISJOIN \n", "0 G010001090171032200211039999999999922 \n", "1 G010001090171032200211039999999999923 \n", "2 G010001090171999990211039999999999921 \n", "3 G010001090171999990211039999999999922 \n", "4 G010001090171999990211039999999999923 \n", "... ... \n", "367248 G560045093520999999512009999999999923 \n", "367249 G560045093520999999512009999999999924 \n", "367250 G560045093520999999512009999999999925 \n", "367251 G560045093520999999512009999999999926 \n", "367252 G560045093520999999512009999999999927 \n", "\n", "[367253 rows x 35 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# GISJOIN ID correction\n", "bgpidcols = nhgisxwalk.id_codes.code_cols(\"bgp\", \"1990\")\n", "supp_source_df = nhgisxwalk.id_codes.bgp_gj(supp_source_df, bgpidcols, cname=\"__GISJOIN\")\n", "supp_source_df" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:43.587147Z", "start_time": "2020-06-13T16:12:43.374638Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GISJOINYEARANRCAAIANHHAIANHHARES_ONLYATRUSTARES_TRSTABLOCKABLCK_GRPA...URBRURALAURB_AREAURB_AREAACD103AANPSADPIET1001EUD001EUO001ESA001__GISJOIN
57044G1000010904440725004239999999999921199099NaN9999999999999NaN1...2NaN9999NaNBG 1 (pt.)1794883168G100001090444072500423009999999999921
57045G1000010904444443004229999999999926199099NaN9999999999999NaN6...2NaN9999NaNBG 6 (pt.)211628186G100001090444444300422009999999999926
57046G1000010904446126504229999999219011199099NaN9999999999999NaN1...1Dover, DE2190NaNBG 1 (pt.)1848490615694G100001090444612650422009999999219011
57047G1000010904446126504229999999219012199099NaN9999999999999NaN2...1Dover, DE2190NaNBG 2 (pt.)67172122G100001090444612650422009999999219012
57048G1000010904446148004229999999999924199099NaN9999999999999NaN4...2NaN9999NaNBG 4 (pt.)924261307315G100001090444614800422009999999999924
..................................................................
57816G1000050935529999905159999999999922199099NaN9999999999999NaN2...2NaN9999NaNBG 2 (pt.)1127316399446G100005093552999990515009999999999922
57817G1000050935529999905159999999999923199099NaN9999999999999NaN3...2NaN9999NaNBG 3 (pt.)2287499108G100005093552999990515009999999999923
57818G1000050935529999905159999999999924199099NaN9999999999999NaN4...2NaN9999NaNBG 4808214269310G100005093552999990515009999999999924
57819G1000050935529999905169999999999921199099NaN9999999999999NaN1...2NaN9999NaNBG 11100323401445G100005093552999990516009999999999921
213548G3400330106101060002049999999916014199099NaN9999999999999NaN4...1Wilmington, DE--NJ--MD--PA9160NaNBG 4 (pt.)1049215380400G340033010610106000204029999999916014
\n", "

777 rows × 35 columns

\n", "
" ], "text/plain": [ " GISJOIN YEAR ANRCA AIANHH AIANHHA \\\n", "57044 G1000010904440725004239999999999921 1990 99 NaN 9999 \n", "57045 G1000010904444443004229999999999926 1990 99 NaN 9999 \n", "57046 G1000010904446126504229999999219011 1990 99 NaN 9999 \n", "57047 G1000010904446126504229999999219012 1990 99 NaN 9999 \n", "57048 G1000010904446148004229999999999924 1990 99 NaN 9999 \n", "... ... ... ... ... ... \n", "57816 G1000050935529999905159999999999922 1990 99 NaN 9999 \n", "57817 G1000050935529999905159999999999923 1990 99 NaN 9999 \n", "57818 G1000050935529999905159999999999924 1990 99 NaN 9999 \n", "57819 G1000050935529999905169999999999921 1990 99 NaN 9999 \n", "213548 G3400330106101060002049999999916014 1990 99 NaN 9999 \n", "\n", " RES_ONLYA TRUSTA RES_TRSTA BLOCKA BLCK_GRPA ... URBRURALA \\\n", "57044 9999 9999 9 NaN 1 ... 2 \n", "57045 9999 9999 9 NaN 6 ... 2 \n", "57046 9999 9999 9 NaN 1 ... 1 \n", "57047 9999 9999 9 NaN 2 ... 1 \n", "57048 9999 9999 9 NaN 4 ... 2 \n", "... ... ... ... ... ... ... ... \n", "57816 9999 9999 9 NaN 2 ... 2 \n", "57817 9999 9999 9 NaN 3 ... 2 \n", "57818 9999 9999 9 NaN 4 ... 2 \n", "57819 9999 9999 9 NaN 1 ... 2 \n", "213548 9999 9999 9 NaN 4 ... 1 \n", "\n", " URB_AREA URB_AREAA CD103A ANPSADPI ET1001 EUD001 \\\n", "57044 NaN 9999 NaN BG 1 (pt.) 179 48 \n", "57045 NaN 9999 NaN BG 6 (pt.) 211 62 \n", "57046 Dover, DE 2190 NaN BG 1 (pt.) 1848 490 \n", "57047 Dover, DE 2190 NaN BG 2 (pt.) 67 17 \n", "57048 NaN 9999 NaN BG 4 (pt.) 924 261 \n", "... ... ... ... ... ... ... \n", "57816 NaN 9999 NaN BG 2 (pt.) 1127 316 \n", "57817 NaN 9999 NaN BG 3 (pt.) 228 74 \n", "57818 NaN 9999 NaN BG 4 808 214 \n", "57819 NaN 9999 NaN BG 1 1100 323 \n", "213548 Wilmington, DE--NJ--MD--PA 9160 NaN BG 4 (pt.) 1049 215 \n", "\n", " EUO001 ESA001 __GISJOIN \n", "57044 83 168 G100001090444072500423009999999999921 \n", "57045 81 86 G100001090444444300422009999999999926 \n", "57046 615 694 G100001090444612650422009999999219011 \n", "57047 21 22 G100001090444612650422009999999219012 \n", "57048 307 315 G100001090444614800422009999999999924 \n", "... ... ... ... \n", "57816 399 446 G100005093552999990515009999999999922 \n", "57817 99 108 G100005093552999990515009999999999923 \n", "57818 269 310 G100005093552999990515009999999999924 \n", "57819 401 445 G100005093552999990516009999999999921 \n", "213548 380 400 G340033010610106000204029999999916014 \n", "\n", "[777 rows x 35 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "supp_source_df = supp_source_df[supp_source_df[\"__GISJOIN\"].isin(bgp1990.unique())].copy()\n", "supp_source_df" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:43.626003Z", "start_time": "2020-06-13T16:12:43.589231Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GISJOINYEARANRCAAIANHHAIANHHARES_ONLYATRUSTARES_TRSTABLOCKABLCK_GRPA...STATEAURBRURALAURB_AREAURB_AREAACD103AANPSADPIET1001EUD001EUO001ESA001
0G1000010904440725004239999999999921199099NaN9999999999999NaN1...102NaN9999NaNBG 1 (pt.)1794883168
1G1000010904444443004229999999999926199099NaN9999999999999NaN6...102NaN9999NaNBG 6 (pt.)211628186
2G1000010904446126504229999999219011199099NaN9999999999999NaN1...101Dover, DE2190NaNBG 1 (pt.)1848490615694
3G1000010904446126504229999999219012199099NaN9999999999999NaN2...101Dover, DE2190NaNBG 2 (pt.)67172122
4G1000010904446148004229999999999924199099NaN9999999999999NaN4...102NaN9999NaNBG 4 (pt.)924261307315
..................................................................
772G1000050935529999905159999999999922199099NaN9999999999999NaN2...102NaN9999NaNBG 2 (pt.)1127316399446
773G1000050935529999905159999999999923199099NaN9999999999999NaN3...102NaN9999NaNBG 3 (pt.)2287499108
774G1000050935529999905159999999999924199099NaN9999999999999NaN4...102NaN9999NaNBG 4808214269310
775G1000050935529999905169999999999921199099NaN9999999999999NaN1...102NaN9999NaNBG 11100323401445
776G3400330106101060002049999999916014199099NaN9999999999999NaN4...341Wilmington, DE--NJ--MD--PA9160NaNBG 4 (pt.)1049215380400
\n", "

777 rows × 34 columns

\n", "
" ], "text/plain": [ " GISJOIN YEAR ANRCA AIANHH AIANHHA RES_ONLYA \\\n", "0 G1000010904440725004239999999999921 1990 99 NaN 9999 9999 \n", "1 G1000010904444443004229999999999926 1990 99 NaN 9999 9999 \n", "2 G1000010904446126504229999999219011 1990 99 NaN 9999 9999 \n", "3 G1000010904446126504229999999219012 1990 99 NaN 9999 9999 \n", "4 G1000010904446148004229999999999924 1990 99 NaN 9999 9999 \n", ".. ... ... ... ... ... ... \n", "772 G1000050935529999905159999999999922 1990 99 NaN 9999 9999 \n", "773 G1000050935529999905159999999999923 1990 99 NaN 9999 9999 \n", "774 G1000050935529999905159999999999924 1990 99 NaN 9999 9999 \n", "775 G1000050935529999905169999999999921 1990 99 NaN 9999 9999 \n", "776 G3400330106101060002049999999916014 1990 99 NaN 9999 9999 \n", "\n", " TRUSTA RES_TRSTA BLOCKA BLCK_GRPA ... STATEA URBRURALA \\\n", "0 9999 9 NaN 1 ... 10 2 \n", "1 9999 9 NaN 6 ... 10 2 \n", "2 9999 9 NaN 1 ... 10 1 \n", "3 9999 9 NaN 2 ... 10 1 \n", "4 9999 9 NaN 4 ... 10 2 \n", ".. ... ... ... ... ... ... ... \n", "772 9999 9 NaN 2 ... 10 2 \n", "773 9999 9 NaN 3 ... 10 2 \n", "774 9999 9 NaN 4 ... 10 2 \n", "775 9999 9 NaN 1 ... 10 2 \n", "776 9999 9 NaN 4 ... 34 1 \n", "\n", " URB_AREA URB_AREAA CD103A ANPSADPI ET1001 EUD001 \\\n", "0 NaN 9999 NaN BG 1 (pt.) 179 48 \n", "1 NaN 9999 NaN BG 6 (pt.) 211 62 \n", "2 Dover, DE 2190 NaN BG 1 (pt.) 1848 490 \n", "3 Dover, DE 2190 NaN BG 2 (pt.) 67 17 \n", "4 NaN 9999 NaN BG 4 (pt.) 924 261 \n", ".. ... ... ... ... ... ... \n", "772 NaN 9999 NaN BG 2 (pt.) 1127 316 \n", "773 NaN 9999 NaN BG 3 (pt.) 228 74 \n", "774 NaN 9999 NaN BG 4 808 214 \n", "775 NaN 9999 NaN BG 1 1100 323 \n", "776 Wilmington, DE--NJ--MD--PA 9160 NaN BG 4 (pt.) 1049 215 \n", "\n", " EUO001 ESA001 \n", "0 83 168 \n", "1 81 86 \n", "2 615 694 \n", "3 21 22 \n", "4 307 315 \n", ".. ... ... \n", "772 399 446 \n", "773 99 108 \n", "774 269 310 \n", "775 401 445 \n", "776 380 400 \n", "\n", "[777 rows x 34 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "supp_source_df.drop(columns=[\"__GISJOIN\"], inplace=True)\n", "supp_source_df.reset_index(drop=True, inplace=True)\n", "supp_source_df" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2020-06-13T16:12:43.653364Z", "start_time": "2020-06-13T16:12:43.627559Z" } }, "outputs": [], "source": [ "supp_source_df.to_csv(\n", " \"../testing_data_subsets/%s_blck_grp_598_103.csv.zip\" % source_year\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2" }, "gist": { "data": { "description": "sample-workflow.ipynb", "public": true }, "id": "9f47e4ec2cc37bce83acf20abfca69d2" }, "kernelspec": { "display_name": "Python [conda env:nhgis]", "language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }