{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:42.739638Z", "start_time": "2021-07-07T23:49:42.736613Z" } }, "outputs": [], "source": [ "# This file is part of the Minnesota Population Center's NHGISXWALK.\n", "# For copyright and licensing information, see the NOTICE and LICENSE files\n", "# in this project's top-level directory, and also on-line at:\n", "# https://github.com/ipums/nhgisxwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test Subsets for testing: 1990\n", "## blocks, block group parts, and tracts\n", "\n", "\n", "1. From a national crosswalk: \n", " 1. Create target state-level subsets for NHGIS base crosswalks\n", " 1. Create target state-level subsets for NHGIS base tabular data\n", " 1. Record unit test values for posterity\n", "\n", "\n", "\n", "**This is currently only intended for use with block-level data as base units.**\n", "\n", "\n", "**James Gaboardi** **(), 2020-05**" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:42.775311Z", "start_time": "2021-07-07T23:49:42.742265Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Last updated: 2021-07-07T19:49:42.757472-04:00\n", "\n", "Python implementation: CPython\n", "Python version : 3.9.6\n", "IPython version : 7.25.0\n", "\n", "Compiler : Clang 11.1.0 \n", "OS : Darwin\n", "Release : 20.5.0\n", "Machine : x86_64\n", "Processor : i386\n", "CPU cores : 8\n", "Architecture: 64bit\n", "\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.102934Z", "start_time": "2021-07-07T23:49:42.779295Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Watermark: 2.2.0\n", "\n", "json : 2.0.9\n", "numpy : 1.21.0\n", "pandas : 1.3.0\n", "nhgisxwalk: 0.1.1\n", "\n" ] } ], "source": [ 
"import inspect\n", "import nhgisxwalk\n", "import numpy\n", "import pandas\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the state (for subsetting), source & target, and year & geography" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.128747Z", "start_time": "2021-07-07T23:49:43.105050Z" } }, "outputs": [], "source": [ "source_year, target_year = \"1990\", \"2010\"\n", "gj_src, gj_trg = \"GJOIN%s\"%source_year, \"GJOIN%s\"%target_year" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.153433Z", "start_time": "2021-07-07T23:49:43.131013Z" } }, "outputs": [], "source": [ "data_path = \"../testing_data_subsets/\"\n", "block_file = \"%s_block\" % source_year\n", "supp_file = \"%s_blck_grp_598\" % source_year" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the base-level crosswalk file name" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.179773Z", "start_time": "2021-07-07T23:49:43.155331Z" } }, "outputs": [ { "data": { "text/plain": [ "'nhgis_blk1990_blk2010_gj'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "base_xwalk_name = \"nhgis_blk%s_blk%s_gj\" % (source_year, target_year)\n", "base_xwalk_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the base (source) summary file name" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.202464Z", "start_time": "2021-07-07T23:49:43.181315Z" } }, "outputs": [ { "data": { "text/plain": [ "'../testing_data_subsets/1990_block.csv.zip'" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "base_source_name = \"%s.csv.zip\" % 
block_file\n", "base_source_file = \"%s%s\" % (data_path, base_source_name)\n", "base_source_file" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set the supplementary summary file name" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.233569Z", "start_time": "2021-07-07T23:49:43.207738Z" } }, "outputs": [ { "data": { "text/plain": [ "'../testing_data_subsets/1990_blck_grp_598.csv.zip'" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "supp_source_name = \"%s.csv.zip\" % supp_file\n", "supp_source_file = \"%s%s\" % (data_path, supp_source_name)\n", "supp_source_file" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read in the national base-level crosswalk" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.320083Z", "start_time": "2021-07-07T23:49:43.235448Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GJOIN1990GJOIN2010WEIGHTPAREA_VIA_BLK00
0G10000100401101G100001004010010001.0000001.000000
1G10000100401102G100001004010010010.9217500.976774
2G10000100401102G100001004010010020.0782190.023215
3G10000100401102G100001004010010030.0000310.000012
4G10000100401103G100001004010010031.0000001.000000
\n", "
" ], "text/plain": [ " GJOIN1990 GJOIN2010 WEIGHT PAREA_VIA_BLK00\n", "0 G10000100401101 G10000100401001000 1.000000 1.000000\n", "1 G10000100401102 G10000100401001001 0.921750 0.976774\n", "2 G10000100401102 G10000100401001002 0.078219 0.023215\n", "3 G10000100401102 G10000100401001003 0.000031 0.000012\n", "4 G10000100401103 G10000100401001003 1.000000 1.000000" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_types = nhgisxwalk.str_types([gj_src, gj_trg])\n", "from_csv_kws = {\"path\": data_path, \"archived\": True, \"remove_unpacked\": True}\n", "read_csv_kws = {\"dtype\": data_types}\n", "base_xwalk = nhgisxwalk.xwalk_df_from_csv(\n", " base_xwalk_name, **from_csv_kws, **read_csv_kws\n", ")\n", "base_xwalk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Declare input variables\n", "**not needed for creating a subset per se, but should be done regardless**" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.341375Z", "start_time": "2021-07-07T23:49:43.321761Z" } }, "outputs": [], "source": [ "input_vars = [\n", " nhgisxwalk.desc_code_1990[\"Persons\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Families\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Households\"][\"Total\"],\n", " nhgisxwalk.desc_code_1990[\"Housing Units\"][\"Total\"]\n", "]\n", "input_var_tags = [\"pop\", \"fam\", \"hh\", \"hu\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Generate the desired crosswalk and subset down to the target state" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.813431Z", "start_time": "2021-07-07T23:49:43.343282Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjtr2010gjtr2010gewt_popwt_famwt_hhwt_hu
0G100001090444072500423009999999999921G1000010043202100010432021.01.01.01.0
1G100001090444444300422009999999999926G1000010042202100010422021.01.01.01.0
2G100001090444612650422009999999219011G1000010041200100010412000.00.00.00.0
3G100001090444612650422009999999219011G1000010042201100010422011.01.01.01.0
4G100001090444612650422009999999219012G1000010042201100010422011.01.01.01.0
........................
1058G100005093552999990515009999999999923G1000050051500100050515001.01.01.01.0
1059G100005093552999990515009999999999924G1000050051500100050515001.01.01.01.0
1060G100005093552999990516009999999999921G1000050051702100050517021.01.01.01.0
1061G340033010610106000204029999999916014G1000030990100100039901000.00.00.00.0
1062NaNG1000050990000100059900000.00.00.00.0
\n", "

1063 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj tr2010gj tr2010ge \\\n", "0 G100001090444072500423009999999999921 G1000010043202 10001043202 \n", "1 G100001090444444300422009999999999926 G1000010042202 10001042202 \n", "2 G100001090444612650422009999999219011 G1000010041200 10001041200 \n", "3 G100001090444612650422009999999219011 G1000010042201 10001042201 \n", "4 G100001090444612650422009999999219012 G1000010042201 10001042201 \n", "... ... ... ... \n", "1058 G100005093552999990515009999999999923 G1000050051500 10005051500 \n", "1059 G100005093552999990515009999999999924 G1000050051500 10005051500 \n", "1060 G100005093552999990516009999999999921 G1000050051702 10005051702 \n", "1061 G340033010610106000204029999999916014 G1000030990100 10003990100 \n", "1062 NaN G1000050990000 10005990000 \n", "\n", " wt_pop wt_fam wt_hh wt_hu \n", "0 1.0 1.0 1.0 1.0 \n", "1 1.0 1.0 1.0 1.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 1.0 1.0 1.0 1.0 \n", "4 1.0 1.0 1.0 1.0 \n", "... ... ... ... ... \n", "1058 1.0 1.0 1.0 1.0 \n", "1059 1.0 1.0 1.0 1.0 \n", "1060 1.0 1.0 1.0 1.0 \n", "1061 0.0 0.0 0.0 0.0 \n", "1062 0.0 0.0 0.0 0.0 \n", "\n", "[1063 rows x 7 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_bgp1990tr2010 = nhgisxwalk.GeoCrossWalk(\n", " base_xwalk,\n", " source_year=source_year,\n", " target_year=target_year,\n", " source_geo=\"bgp\",\n", " target_geo=\"tr\",\n", " base_source_table=base_source_file,\n", " supp_source_table=supp_source_file,\n", " input_var=input_vars,\n", " weight_var=input_var_tags,\n", " keep_base=True,\n", " add_geoid=True\n", ")\n", "state_bgp1990tr2010.xwalk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### unittests" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.848026Z", "start_time": "2021-07-07T23:49:43.814899Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjtr2010gjtr2010gewt_popwt_famwt_hhwt_hu
0G100001090444072500423009999999999921G1000010043202100010432021.01.01.01.0
1G100001090444444300422009999999999926G1000010042202100010422021.01.01.01.0
2G100001090444612650422009999999219011G1000010041200100010412000.00.00.00.0
3G100001090444612650422009999999219011G1000010042201100010422011.01.01.01.0
4G100001090444612650422009999999219012G1000010042201100010422011.01.01.01.0
........................
1058G100005093552999990515009999999999923G1000050051500100050515001.01.01.01.0
1059G100005093552999990515009999999999924G1000050051500100050515001.01.01.01.0
1060G100005093552999990516009999999999921G1000050051702100050517021.01.01.01.0
1061G340033010610106000204029999999916014G1000030990100100039901000.00.00.00.0
1062NaNG1000050990000100059900000.00.00.00.0
\n", "

1063 rows × 7 columns

\n", "
" ], "text/plain": [ " bgp1990gj tr2010gj tr2010ge \\\n", "0 G100001090444072500423009999999999921 G1000010043202 10001043202 \n", "1 G100001090444444300422009999999999926 G1000010042202 10001042202 \n", "2 G100001090444612650422009999999219011 G1000010041200 10001041200 \n", "3 G100001090444612650422009999999219011 G1000010042201 10001042201 \n", "4 G100001090444612650422009999999219012 G1000010042201 10001042201 \n", "... ... ... ... \n", "1058 G100005093552999990515009999999999923 G1000050051500 10005051500 \n", "1059 G100005093552999990515009999999999924 G1000050051500 10005051500 \n", "1060 G100005093552999990516009999999999921 G1000050051702 10005051702 \n", "1061 G340033010610106000204029999999916014 G1000030990100 10003990100 \n", "1062 NaN G1000050990000 10005990000 \n", "\n", " wt_pop wt_fam wt_hh wt_hu \n", "0 1.0 1.0 1.0 1.0 \n", "1 1.0 1.0 1.0 1.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 1.0 1.0 1.0 1.0 \n", "4 1.0 1.0 1.0 1.0 \n", "... ... ... ... ... \n", "1058 1.0 1.0 1.0 1.0 \n", "1059 1.0 1.0 1.0 1.0 \n", "1060 1.0 1.0 1.0 1.0 \n", "1061 0.0 0.0 0.0 0.0 \n", "1062 0.0 0.0 0.0 0.0 \n", "\n", "[1063 rows x 7 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_bgp1990tr2010.xwalk.drop_duplicates(subset=[\"bgp1990gj\", \"tr2010gj\"])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.880522Z", "start_time": "2021-07-07T23:49:43.849275Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990gjtr2010gjtr2010gewt_popwt_famwt_hhwt_hu
13G100001090444999990421009999999219012G1000010042100100010421001.0000001.0000001.0000001.000000
14G100001090444999990421009999999999921G1000010042100100010421000.9976640.9971660.9971480.997278
15G100001090444999990421009999999999921G1000010042201100010422010.0023360.0028340.0028520.002722
16G100001090444999990421009999999999922G1000010042100100010421001.0000001.0000001.0000001.000000
\n", "
" ], "text/plain": [ " bgp1990gj tr2010gj tr2010ge \\\n", "13 G100001090444999990421009999999219012 G1000010042100 10001042100 \n", "14 G100001090444999990421009999999999921 G1000010042100 10001042100 \n", "15 G100001090444999990421009999999999921 G1000010042201 10001042201 \n", "16 G100001090444999990421009999999999922 G1000010042100 10001042100 \n", "\n", " wt_pop wt_fam wt_hh wt_hu \n", "13 1.000000 1.000000 1.000000 1.000000 \n", "14 0.997664 0.997166 0.997148 0.997278 \n", "15 0.002336 0.002834 0.002852 0.002722 \n", "16 1.000000 1.000000 1.000000 1.000000 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ix1, ix2 = 13, 17\n", "state_bgp1990tr2010.xwalk.loc[ix1:ix2-1]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.905399Z", "start_time": "2021-07-07T23:49:43.882102Z" } }, "outputs": [ { "data": { "text/plain": [ "array([['G100001090444999990421009999999219012', 'G1000010042100',\n", " '10001042100'],\n", " ['G100001090444999990421009999999999921', 'G1000010042100',\n", " '10001042100'],\n", " ['G100001090444999990421009999999999921', 'G1000010042201',\n", " '10001042201'],\n", " ['G100001090444999990421009999999999922', 'G1000010042100',\n", " '10001042100']], dtype=object)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "id_cols = [\"bgp1990gj\", \"tr2010gj\", \"tr2010ge\"]\n", "obs_str_vals = state_bgp1990tr2010.xwalk[id_cols][ix1:ix2].values\n", "obs_str_vals" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.932269Z", "start_time": "2021-07-07T23:49:43.907185Z" } }, "outputs": [ { "data": { "text/plain": [ "array([[1. , 1. , 1. , 1. ],\n", " [0.99766436, 0.99716625, 0.99714829, 0.99727768],\n", " [0.00233564, 0.00283375, 0.00285171, 0.00272232],\n", " [1. , 1. , 1. , 1. 
]])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wgt_cols = [\"wt_pop\", \"wt_fam\", \"wt_hh\", \"wt_hu\"]\n", "obs_num_vals = state_bgp1990tr2010.xwalk[wgt_cols][ix1:ix2].values\n", "obs_num_vals" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2021-07-07T23:49:43.959830Z", "start_time": "2021-07-07T23:49:43.933753Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wt_popwt_famwt_hhwt_hu
131.0000001.0000001.0000001.000000
140.9976640.9971660.9971480.997278
150.0023360.0028340.0028520.002722
161.0000001.0000001.0000001.000000
\n", "
" ], "text/plain": [ " wt_pop wt_fam wt_hh wt_hu\n", "13 1.000000 1.000000 1.000000 1.000000\n", "14 0.997664 0.997166 0.997148 0.997278\n", "15 0.002336 0.002834 0.002852 0.002722\n", "16 1.000000 1.000000 1.000000 1.000000" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_bgp1990tr2010.xwalk[wgt_cols][ix1:ix2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2" }, "gist": { "data": { "description": "sample-workflow.ipynb", "public": true }, "id": "9f47e4ec2cc37bce83acf20abfca69d2" }, "kernelspec": { "display_name": "Python [conda env:nhgis]", "language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 4 }