{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This file is part of the Minnesota Population Center's NHGISXWALK.\n", "# For copyright and licensing information, see the NOTICE and LICENSE files\n", "# in this project's top-level directory, and also on-line at:\n", "# https://github.com/ipums/nhgisxwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generating Subsets for testing: 2000\n", "## blocks, block group parts, and tracts\n", "\n", "\n", "1. From a national crosswalk: \n", " 1. Create target state-level subsets for NHGIS base crosswalks\n", " 1. Create target state-level subsets for NHGIS base tabular data\n", " 1. Record unit test values for posterity\n", "\n", "\n", "\n", "**This is currently only intended for use with block-level data as base units.**\n", "\n", "\n", "**James Gaboardi** **(), 2020-05**" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:36:58.307226Z", "start_time": "2020-10-01T21:36:58.159730Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-10-01T17:36:58-04:00\n", "\n", "CPython 3.8.5\n", "IPython 7.18.1\n", "\n", "compiler : Clang 10.0.1 \n", "system : Darwin\n", "release : 19.6.0\n", "machine : x86_64\n", "processor : i386\n", "CPU cores : 8\n", "interpreter: 64bit\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:36:58.860931Z", "start_time": "2020-10-01T21:36:58.309097Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "watermark 2.0.2\n", "numpy 1.19.1\n", "pandas 1.1.1\n", "nhgisxwalk 0.0.9post1\n", "\n" ] } ], "source": [ "import inspect\n", "import nhgisxwalk\n", "import numpy\n", "import pandas\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "### Set the state (for subsetting), source & target, and year & geography" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:36:58.880251Z", "start_time": "2020-10-01T21:36:58.866001Z" } }, "outputs": [], "source": [ "subset_state = \"10\" # Delaware\n", "source_year, target_year = \"2000\", \"2010\"\n", "gj_src, gj_trg = \"GJOIN%s\"%source_year, \"GJOIN%s\"%target_year" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:36:58.896037Z", "start_time": "2020-10-01T21:36:58.882148Z" } }, "outputs": [], "source": [ "# Set these to a local directory\n", "data_in = \"path/to/data/\"\n", "data_tab = \"path/to/data/\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:36:58.910713Z", "start_time": "2020-10-01T21:36:58.897998Z" } }, "outputs": [], "source": [ "data_out = \"../testing_data_subsets/\"\n", "block_file = \"%s_block\" % source_year" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the base-level crosswalk file name" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:36:58.929990Z", "start_time": "2020-10-01T21:36:58.912106Z" } }, "outputs": [ { "data": { "text/plain": [ "'nhgis_blk2000_blk2010_gj'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "base_xwalk_name = \"nhgis_blk%s_blk%s_gj\" % (source_year, target_year)\n", "base_xwalk_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the base (source) summary file name" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:36:58.945777Z", "start_time": "2020-10-01T21:36:58.933133Z" } }, "outputs": [], "source": [ "base_source_name = \"%s/%s.csv\" % (block_file, block_file)\n", "base_source_file = \"%s%s\" % (data_tab, 
base_source_name)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read in the national base-level crosswalk" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:37:11.797785Z", "start_time": "2020-10-01T21:36:58.947891Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN2000GJOIN2010WEIGHTPAREA
0G01000100201001000G010001002010020000.0358970.008988
1G01000100201001000G010001002010020010.2533300.263725
2G01000100201001000G010001002010020020.0000000.000385
3G01000100201001000G010001002010020030.0762970.055430
4G01000100201001000G010001002010020040.0324410.007543
\n", "
" ], "text/plain": [ " GJOIN2000 GJOIN2010 WEIGHT PAREA\n", "0 G01000100201001000 G01000100201002000 0.035897 0.008988\n", "1 G01000100201001000 G01000100201002001 0.253330 0.263725\n", "2 G01000100201001000 G01000100201002002 0.000000 0.000385\n", "3 G01000100201001000 G01000100201002003 0.076297 0.055430\n", "4 G01000100201001000 G01000100201002004 0.032441 0.007543" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_types = nhgisxwalk.str_types([gj_src, gj_trg])\n", "from_csv_kws = {\"path\": data_in, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "base_xwalk = nhgisxwalk.xwalk_df_from_csv(\n", " base_xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "base_xwalk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create the state subset of the base-level crosswalk (for use in GH testing)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:37:14.263994Z", "start_time": "2020-10-01T21:37:11.799933Z" } }, "outputs": [], "source": [ "# Select the crosswalk rows whose target block is in `subset_state`: characters 1-2\n", "# of the target ID (right after the leading \"G\") are compared with the state code.\n", "# `.str[1:3]` is vectorized and gives NaN (-> False) for a missing ID instead of raising.\n", "in_subset_state = base_xwalk[gj_trg].str[1:3] == subset_state\n", "ss_base = base_xwalk[in_subset_state].reset_index(drop=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Declare input variable\n", "**not needed for creating a subset per se, but should be done regardless**" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:37:14.279115Z", "start_time": "2020-10-01T21:37:14.265809Z" } }, "outputs": [], "source": [ "input_vars = [\n", " nhgisxwalk.desc_code_2000_SF1b[\"Persons\"][\"Total\"],\n", " nhgisxwalk.desc_code_2000_SF1b[\"Families\"][\"Total\"],\n", " nhgisxwalk.desc_code_2000_SF1b[\"Households\"][\"Total\"],\n", " nhgisxwalk.desc_code_2000_SF1b[\"Housing Units\"][\"Total\"]\n", "]\n", "input_var_tags = [\"pop\", \"fam\", \"hh\", 
\"hu\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Generate the desired crosswalk and subset down to the target state" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:12.551781Z", "start_time": "2020-10-01T21:37:14.281698Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp2000gjtr2010gjtr2010gewt_popwt_famwt_hhwt_hu
0G10000109044444430042202U1G1000010042202100010422021.01.01.01.0
1G10000109044461265042201R1G1000010042201100010422011.01.01.01.0
2G10000109044461265042201U1G1000010042201100010422011.01.01.01.0
3G10000109044461265042201U2G1000010042201100010422011.01.01.01.0
4G10000109044461480042202R2G1000010042202100010422021.01.01.01.0
........................
1038G10000509355299999051500R4G1000050051500100050515001.01.01.01.0
1039G10000509355299999051500U1G1000050051500100050515001.01.01.01.0
1040G10000509355299999051500U3G1000050051500100050515001.01.01.01.0
1041G10000509355299999051500U4G1000050051500100050515001.01.01.01.0
1042G34003301061010600020400U2G1000030990100100039901000.00.00.00.0
\n", "

1043 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp2000gj tr2010gj tr2010ge wt_pop wt_fam \\\n", "0 G10000109044444430042202U1 G1000010042202 10001042202 1.0 1.0 \n", "1 G10000109044461265042201R1 G1000010042201 10001042201 1.0 1.0 \n", "2 G10000109044461265042201U1 G1000010042201 10001042201 1.0 1.0 \n", "3 G10000109044461265042201U2 G1000010042201 10001042201 1.0 1.0 \n", "4 G10000109044461480042202R2 G1000010042202 10001042202 1.0 1.0 \n", "... ... ... ... ... ... \n", "1038 G10000509355299999051500R4 G1000050051500 10005051500 1.0 1.0 \n", "1039 G10000509355299999051500U1 G1000050051500 10005051500 1.0 1.0 \n", "1040 G10000509355299999051500U3 G1000050051500 10005051500 1.0 1.0 \n", "1041 G10000509355299999051500U4 G1000050051500 10005051500 1.0 1.0 \n", "1042 G34003301061010600020400U2 G1000030990100 10003990100 0.0 0.0 \n", "\n", " wt_hh wt_hu \n", "0 1.0 1.0 \n", "1 1.0 1.0 \n", "2 1.0 1.0 \n", "3 1.0 1.0 \n", "4 1.0 1.0 \n", "... ... ... \n", "1038 1.0 1.0 \n", "1039 1.0 1.0 \n", "1040 1.0 1.0 \n", "1041 1.0 1.0 \n", "1042 0.0 0.0 \n", "\n", "[1043 rows x 7 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_bgp2000tr2010 = nhgisxwalk.GeoCrossWalk(\n", " base_xwalk,\n", " source_year=source_year,\n", " target_year=target_year,\n", " source_geo=\"bgp\",\n", " target_geo=\"tr\",\n", " base_source_table=base_source_file,\n", " input_var=input_vars,\n", " weight_var=input_var_tags,\n", " keep_base=True,\n", " add_geoid=True,\n", " stfips=subset_state\n", ")\n", "del base_xwalk\n", "state_bgp2000tr2010.xwalk" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:12.575925Z", "start_time": "2020-10-01T21:39:12.553264Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp2000gjtr2010gjtr2010gewt_popwt_famwt_hhwt_hu
0G10000109044444430042202U1G1000010042202100010422021.01.01.01.0
1G10000109044461265042201R1G1000010042201100010422011.01.01.01.0
2G10000109044461265042201U1G1000010042201100010422011.01.01.01.0
3G10000109044461265042201U2G1000010042201100010422011.01.01.01.0
4G10000109044461480042202R2G1000010042202100010422021.01.01.01.0
........................
1038G10000509355299999051500R4G1000050051500100050515001.01.01.01.0
1039G10000509355299999051500U1G1000050051500100050515001.01.01.01.0
1040G10000509355299999051500U3G1000050051500100050515001.01.01.01.0
1041G10000509355299999051500U4G1000050051500100050515001.01.01.01.0
1042G34003301061010600020400U2G1000030990100100039901000.00.00.00.0
\n", "

1043 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp2000gj tr2010gj tr2010ge wt_pop wt_fam \\\n", "0 G10000109044444430042202U1 G1000010042202 10001042202 1.0 1.0 \n", "1 G10000109044461265042201R1 G1000010042201 10001042201 1.0 1.0 \n", "2 G10000109044461265042201U1 G1000010042201 10001042201 1.0 1.0 \n", "3 G10000109044461265042201U2 G1000010042201 10001042201 1.0 1.0 \n", "4 G10000109044461480042202R2 G1000010042202 10001042202 1.0 1.0 \n", "... ... ... ... ... ... \n", "1038 G10000509355299999051500R4 G1000050051500 10005051500 1.0 1.0 \n", "1039 G10000509355299999051500U1 G1000050051500 10005051500 1.0 1.0 \n", "1040 G10000509355299999051500U3 G1000050051500 10005051500 1.0 1.0 \n", "1041 G10000509355299999051500U4 G1000050051500 10005051500 1.0 1.0 \n", "1042 G34003301061010600020400U2 G1000030990100 10003990100 0.0 0.0 \n", "\n", " wt_hh wt_hu \n", "0 1.0 1.0 \n", "1 1.0 1.0 \n", "2 1.0 1.0 \n", "3 1.0 1.0 \n", "4 1.0 1.0 \n", "... ... ... \n", "1038 1.0 1.0 \n", "1039 1.0 1.0 \n", "1040 1.0 1.0 \n", "1041 1.0 1.0 \n", "1042 0.0 0.0 \n", "\n", "[1043 rows x 7 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_bgp2000tr2010.xwalk.drop_duplicates(subset=[\"bgp2000gj\", \"tr2010gj\"])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:12.590575Z", "start_time": "2020-10-01T21:39:12.577200Z" } }, "outputs": [ { "data": { "text/plain": [ "908" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_bgp2000tr2010.xwalk[\"bgp2000gj\"].nunique()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:12.605949Z", "start_time": "2020-10-01T21:39:12.591996Z" } }, "outputs": [ { "data": { "text/plain": [ "218" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_bgp2000tr2010.xwalk[\"tr2010gj\"].nunique()" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "### Write out the state subset of the base-level crosswalk (for use in GH testing)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:12.783504Z", "start_time": "2020-10-01T21:39:12.607442Z" } }, "outputs": [], "source": [ "out_path = \"%s%s\" % (data_out, base_xwalk_name)\n", "nhgisxwalk.prepare_data_product(ss_base, base_xwalk_name, out_path, remove=True)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:12.804673Z", "start_time": "2020-10-01T21:39:12.785519Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN2000GJOIN2010WEIGHTPAREA
0G10000100401001000G100001004010010001.0000001.000000
1G10000100401001001G100001004010010010.9999810.999988
2G10000100401001001G100001004010010030.0000190.000012
3G10000100401001002G100001004010010021.0000001.000000
4G10000100401001003G100001004010010031.0000001.000000
...............
28471G10000500519002095G100005005190020751.0000001.000000
28472G10000500519002096G100005005190021311.0000001.000000
28473G10000500519002097G100005005190021301.0000001.000000
28474G10000500519002098G100005005190020791.0000001.000000
28475G34003300204002001G100003099010000070.0000000.000000
\n", "

28476 rows × 4 columns

\n", "
" ], "text/plain": [ " GJOIN2000 GJOIN2010 WEIGHT PAREA\n", "0 G10000100401001000 G10000100401001000 1.000000 1.000000\n", "1 G10000100401001001 G10000100401001001 0.999981 0.999988\n", "2 G10000100401001001 G10000100401001003 0.000019 0.000012\n", "3 G10000100401001002 G10000100401001002 1.000000 1.000000\n", "4 G10000100401001003 G10000100401001003 1.000000 1.000000\n", "... ... ... ... ...\n", "28471 G10000500519002095 G10000500519002075 1.000000 1.000000\n", "28472 G10000500519002096 G10000500519002131 1.000000 1.000000\n", "28473 G10000500519002097 G10000500519002130 1.000000 1.000000\n", "28474 G10000500519002098 G10000500519002079 1.000000 1.000000\n", "28475 G34003300204002001 G10000309901000007 0.000000 0.000000\n", "\n", "[28476 rows x 4 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ss_base" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Record, subset, and write out the 2000 BLKs (sf1) needed to create this subset" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:12.823829Z", "start_time": "2020-10-01T21:39:12.806354Z" } }, "outputs": [ { "data": { "text/plain": [ "0 G10000100401001000\n", "1 G10000100401001001\n", "2 G10000100401001001\n", "3 G10000100401001002\n", "4 G10000100401001003\n", " ... 
\n", "28471 G10000500519002095\n", "28472 G10000500519002096\n", "28473 G10000500519002097\n", "28474 G10000500519002098\n", "28475 G34003300204002001\n", "Name: GJOIN2000, Length: 28476, dtype: object" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Source-year block IDs referenced by the state subset; rows with a missing ID are dropped.\n", "blk2000 = ss_base[gj_src].dropna()\n", "blk2000" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:12.837596Z", "start_time": "2020-10-01T21:39:12.825069Z" } }, "outputs": [ { "data": { "text/plain": [ "(28476,)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "blk2000.shape" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:12.854418Z", "start_time": "2020-10-01T21:39:12.838867Z" } }, "outputs": [ { "data": { "text/plain": [ "17484" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "blk2000.nunique()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:25.340911Z", "start_time": "2020-10-01T21:39:12.855865Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GISJOINYEARSTATESTATEACOUNTYCOUNTYACTY_SUBAPLACEATRACTABLCK_GRPABLOCKAAIANHHAURBRURALANAMEFXS001FY4001F2V001FV5001
0G010001002010010002000Alabama01Autauga0019262899999020100110009999RBlock 100016747
1G010001002010010012000Alabama01Autauga0019262862328020100110019999RBlock 100140131314
2G010001002010010022000Alabama01Autauga0019262862328020100110029999UBlock 100228410188104
3G010001002010010032000Alabama01Autauga0019262862328020100110039999UBlock 100347161317
4G010001002010010042000Alabama01Autauga0019262862328020100110049999UBlock 100418749
.........................................................
8205577G560045095130030802000Wyoming56Weston0459225599999951300330809999RBlock 30800000
8205578G560045095130030812000Wyoming56Weston0459225599999951300330819999RBlock 30810000
8205579G560045095130030822000Wyoming56Weston0459225599999951300330829999RBlock 30820000
8205580G560045095130030832000Wyoming56Weston0459225599999951300330839999RBlock 30831101
8205581G560045095130030842000Wyoming56Weston0459225599999951300330849999RBlock 308434141116
\n", "

8205582 rows × 18 columns

\n", "
" ], "text/plain": [ " GISJOIN YEAR STATE STATEA COUNTY COUNTYA CTY_SUBA \\\n", "0 G01000100201001000 2000 Alabama 01 Autauga 001 92628 \n", "1 G01000100201001001 2000 Alabama 01 Autauga 001 92628 \n", "2 G01000100201001002 2000 Alabama 01 Autauga 001 92628 \n", "3 G01000100201001003 2000 Alabama 01 Autauga 001 92628 \n", "4 G01000100201001004 2000 Alabama 01 Autauga 001 92628 \n", "... ... ... ... ... ... ... ... \n", "8205577 G56004509513003080 2000 Wyoming 56 Weston 045 92255 \n", "8205578 G56004509513003081 2000 Wyoming 56 Weston 045 92255 \n", "8205579 G56004509513003082 2000 Wyoming 56 Weston 045 92255 \n", "8205580 G56004509513003083 2000 Wyoming 56 Weston 045 92255 \n", "8205581 G56004509513003084 2000 Wyoming 56 Weston 045 92255 \n", "\n", " PLACEA TRACTA BLCK_GRPA BLOCKA AIANHHA URBRURALA NAME FXS001 \\\n", "0 99999 020100 1 1000 9999 R Block 1000 16 \n", "1 62328 020100 1 1001 9999 R Block 1001 40 \n", "2 62328 020100 1 1002 9999 U Block 1002 284 \n", "3 62328 020100 1 1003 9999 U Block 1003 47 \n", "4 62328 020100 1 1004 9999 U Block 1004 18 \n", "... ... ... ... ... ... ... ... ... \n", "8205577 99999 951300 3 3080 9999 R Block 3080 0 \n", "8205578 99999 951300 3 3081 9999 R Block 3081 0 \n", "8205579 99999 951300 3 3082 9999 R Block 3082 0 \n", "8205580 99999 951300 3 3083 9999 R Block 3083 1 \n", "8205581 99999 951300 3 3084 9999 R Block 3084 34 \n", "\n", " FY4001 F2V001 FV5001 \n", "0 7 4 7 \n", "1 13 13 14 \n", "2 101 88 104 \n", "3 16 13 17 \n", "4 7 4 9 \n", "... ... ... ... 
\n", "8205577 0 0 0 \n", "8205578 0 0 0 \n", "8205579 0 0 0 \n", "8205580 1 0 1 \n", "8205581 14 11 16 \n", "\n", "[8205582 rows x 18 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read in base source file\n", "base_source_df = pandas.read_csv(base_source_file, dtype=str)\n", "base_source_df" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:25.903249Z", "start_time": "2020-10-01T21:39:25.342878Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GISJOINYEARSTATESTATEACOUNTYCOUNTYACTY_SUBAPLACEATRACTABLCK_GRPABLOCKAAIANHHAURBRURALANAMEFXS001FY4001F2V001FV5001
0G100001004010010002000Delaware10Kent0019148099999040100110009999RBlock 100015969
1G100001004010010012000Delaware10Kent0019148099999040100110019999RBlock 100136410494104
2G100001004010010022000Delaware10Kent0019148099999040100110029999RBlock 100216666
3G100001004010010032000Delaware10Kent0019148099999040100110039999RBlock 100380272427
4G100001004010010042000Delaware10Kent0019148099999040100110049999RBlock 100455201521
.........................................................
17479G100005005190020952000Delaware10Sussex0059162820380051900220959999UBlock 2095189412
17480G100005005190020962000Delaware10Sussex0059162820380051900220969999RBlock 20963212
17481G100005005190020972000Delaware10Sussex0059162899999051900220979999UBlock 20970000
17482G100005005190020982000Delaware10Sussex0059162899999051900220989999UBlock 209818535
17483G340033002040020012000New Jersey34Salem0331061010600020400220019999UBlock 200141823492257
\n", "

17484 rows × 18 columns

\n", "
" ], "text/plain": [ " GISJOIN YEAR STATE STATEA COUNTY COUNTYA CTY_SUBA \\\n", "0 G10000100401001000 2000 Delaware 10 Kent 001 91480 \n", "1 G10000100401001001 2000 Delaware 10 Kent 001 91480 \n", "2 G10000100401001002 2000 Delaware 10 Kent 001 91480 \n", "3 G10000100401001003 2000 Delaware 10 Kent 001 91480 \n", "4 G10000100401001004 2000 Delaware 10 Kent 001 91480 \n", "... ... ... ... ... ... ... ... \n", "17479 G10000500519002095 2000 Delaware 10 Sussex 005 91628 \n", "17480 G10000500519002096 2000 Delaware 10 Sussex 005 91628 \n", "17481 G10000500519002097 2000 Delaware 10 Sussex 005 91628 \n", "17482 G10000500519002098 2000 Delaware 10 Sussex 005 91628 \n", "17483 G34003300204002001 2000 New Jersey 34 Salem 033 10610 \n", "\n", " PLACEA TRACTA BLCK_GRPA BLOCKA AIANHHA URBRURALA NAME FXS001 \\\n", "0 99999 040100 1 1000 9999 R Block 1000 15 \n", "1 99999 040100 1 1001 9999 R Block 1001 364 \n", "2 99999 040100 1 1002 9999 R Block 1002 16 \n", "3 99999 040100 1 1003 9999 R Block 1003 80 \n", "4 99999 040100 1 1004 9999 R Block 1004 55 \n", "... ... ... ... ... ... ... ... ... \n", "17479 20380 051900 2 2095 9999 U Block 2095 18 \n", "17480 20380 051900 2 2096 9999 R Block 2096 3 \n", "17481 99999 051900 2 2097 9999 U Block 2097 0 \n", "17482 99999 051900 2 2098 9999 U Block 2098 18 \n", "17483 10600 020400 2 2001 9999 U Block 2001 418 \n", "\n", " FY4001 F2V001 FV5001 \n", "0 9 6 9 \n", "1 104 94 104 \n", "2 6 6 6 \n", "3 27 24 27 \n", "4 20 15 21 \n", "... ... ... ... 
\n", "17479 9 4 12 \n", "17480 2 1 2 \n", "17481 0 0 0 \n", "17482 5 3 5 \n", "17483 234 92 257 \n", "\n", "[17484 rows x 18 columns]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "uniques = base_source_df[\"GISJOIN\"].isin(blk2000.unique())\n", "base_source_df = base_source_df[uniques]\n", "base_source_df.reset_index(drop=True, inplace=True)\n", "base_source_df" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2020-10-01T21:39:26.023155Z", "start_time": "2020-10-01T21:39:25.908292Z" } }, "outputs": [], "source": [ "base_source_df.to_csv(\"%s%s.csv.zip\" % (data_out, block_file))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2" }, "gist": { "data": { "description": "sample-workflow.ipynb", "public": true }, "id": "9f47e4ec2cc37bce83acf20abfca69d2" }, "kernelspec": { "display_name": "Python [conda env:nhgis] *", "language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }