In [None]:
# This file is part of the Minnesota Population Center's NHGISXWALK.
# For copyright and licensing information, see the NOTICE and LICENSE files
# in this project's top-level directory, and also on-line at:
#   https://github.com/ipums/nhgisxwalk

# Sample workflow: 1990 block group parts to 2010 tracts

## Starting from a subset of 2010 Delaware blocks

For further background information see:

* **Schroeder, J. P.** 2007. *Target-density weighting interpolation and uncertainty evaluation for temporal analysis of census data*. Geographical Analysis 39 (3): 311–335.

#### NHGIS [block crosswalks](https://www.nhgis.org/user-resources/geographic-crosswalks)

In [1]:
%load_ext watermark
%watermark

2020-08-19T18:07:22-04:00

CPython 3.8.5
IPython 7.16.1

compiler   : Clang 10.0.1 
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 8
interpreter: 64bit


In [2]:
import nhgisxwalk
import inspect
import numpy
import pandas

%load_ext autoreload
%autoreload 2
%watermark -w
%watermark -iv

watermark 2.0.2
numpy      1.19.1
pandas     1.1.0
nhgisxwalk 0.0.9



### Source and target years for the crosswalk

In [3]:
# Census years bracketing the crosswalk. They stay strings because they are
# spliced into column names and file names further down.
source_year = "1990"
target_year = "2010"

# Names of the per-year ``GJOIN<year>`` ID columns of the base crosswalk.
gj_src = "GJOIN%s" % source_year
gj_trg = "GJOIN%s" % target_year

### Load the base block-to-block crosswalk (source → target)

In [4]:
# Directory holding the test subsets, and the file stem of the archived
# source-block -> target-block crosswalk.
subset_data_dir = "../testing_data_subsets/"
base_xwalk_name = "nhgis_blk%s_blk%s_gj" % (source_year, target_year)

# Dtype mapping for the two ID columns, as built by ``nhgisxwalk.str_types``
# (presumably forces them to be read as text -- confirm against the library).
data_types = nhgisxwalk.str_types([gj_src, gj_trg])

# Keyword arguments are kept in two groups: those describing where/how the
# archive is read, and those describing how the CSV content is parsed.
from_csv_kws = dict(path=subset_data_dir, archived=True, remove_unpacked=True)
read_csv_kws = dict(dtype=data_types)

base_xwalk = nhgisxwalk.xwalk_df_from_csv(
    base_xwalk_name, **from_csv_kws, **read_csv_kws
)
base_xwalk.head()

Unnamed: 0,GJOIN1990,GJOIN2010,WEIGHT,PAREA_VIA_BLK00
0,G10000100401101,G10000100401001000,1.0,1.0
1,G10000100401102,G10000100401001001,0.92175,0.976774
2,G10000100401102,G10000100401001002,0.078219,0.023215
3,G10000100401102,G10000100401001003,3.1e-05,1.2e-05
4,G10000100401103,G10000100401001003,1.0,1.0


### Set the base (source) summary file name

In [5]:
# Archive name of the source-year block summary file, then its full path
# inside the subset directory (which already ends with a "/").
base_source_name = "%s_block.csv.zip" % source_year
base_source_file = subset_data_dir + base_source_name

### Source supplementary summary data (special case for 1990)

In [6]:
# Archive name of the supplementary source-year summary file (needed only for
# the 1990 special case), then its full path inside the subset directory.
supp_source_name = "%s_blck_grp_598.csv.zip" % source_year
supp_source_file = subset_data_dir + supp_source_name

### Convenience code shorthand/lookup

In [7]:
# Print the library source of the geography shorthand lookup so that the
# codes passed to ``GeoCrossWalk`` later on ("bgp", "tr") are self-explanatory.
print(inspect.getsource(nhgisxwalk.valid_geo_shorthand))

def valid_geo_shorthand(shorthand_name=True):
    """Shorthand lookups for census geographies."""
    lookup = {
        "blk": "block",
        "bgp": "block group part",
        "bg": "block group",
        "tr": "tract",
        "co": "county",
    }
    if not shorthand_name:
        lookup = {v: k for k, v in lookup.items()}
    return lookup



In [8]:
# With ``shorthand_name=False`` the lookup is inverted:
# full geography name -> shorthand code.
nhgisxwalk.valid_geo_shorthand(shorthand_name=False)

{'block': 'blk',
 'block group part': 'bgp',
 'block group': 'bg',
 'tract': 'tr',
 'county': 'co'}

### Instantiate an `nhgisxwalk.GeoCrossWalk` object
##### see [nhgisxwalk.GeoCrossWalk](https://github.com/ipums/nhgisxwalk/blob/92b4fe55de0a9c53d0315dcda8ec121faaf20aef/nhgisxwalk/geocrosswalk.py#L19) for full details

In [9]:
# Lookup of 1990 codes keyed by universe; the "Total" entry of each universe
# holds the variable code that is selected as crosswalk input in the next cell.
nhgisxwalk.desc_code_1990

{'Persons': {'Persons': 'Universe',
  'NP1': 'Source code',
  'ET1': 'NHGIS code',
  'Total': 'ET1001'},
 'Families': {'Families': 'Universe',
  'NP2': 'Source code',
  'EUD': 'NHGIS code',
  'Total': 'EUD001'},
 'Households': {'Households': 'Universe',
  'NP3': 'Source code',
  'EUO': 'NHGIS code',
  'Total': 'EUO001'},
 'Housing Units': {'Housing Units': 'Universe',
  'NH1': 'Source code',
  'ESA': 'NHGIS code',
  'Total': 'ESA001'}}

In [10]:
# Collect the "Total" variable code of each 1990 universe of interest,
# in the order: persons, families, households, housing units.
input_vars = [
    nhgisxwalk.desc_code_1990[universe]["Total"]
    for universe in ("Persons", "Families", "Households", "Housing Units")
]
input_vars

['ET1001', 'EUD001', 'EUO001', 'ESA001']

In [11]:
# Short tags passed as ``weight_var`` alongside ``input_vars``; they are listed
# in the same order as the universes in ``input_vars``.
input_var_tags = [
    "pop",  # persons
    "fam",  # families
    "hh",  # households
    "hu",  # housing units
]

In [12]:
# State FIPS code of the subset (the notebook works on Delaware data).
subset_state = "10"

# Build the 1990 block group part -> 2010 tract crosswalk on top of the
# block-to-block base crosswalk loaded earlier.
bgp1990_to_tr2010 = nhgisxwalk.GeoCrossWalk(
    base_xwalk,
    # -- source side ------------------------------------------------------
    source_year=source_year,
    source_geo="bgp",
    base_source_table=base_source_file,
    supp_source_table=supp_source_file,
    # -- target side ------------------------------------------------------
    target_year=target_year,
    target_geo="tr",
    # -- variables and their weight-column tags ------------------------------
    input_var=input_vars,
    weight_var=input_var_tags,
    # -- subsetting / output options (see the GeoCrossWalk docs for details) --
    stfips=subset_state,
    keep_base=True,
    add_geoid=True,
)
bgp1990_to_tr2010.xwalk

Unnamed: 0,bgp1990gj,tr2010gj,tr2010ge,wt_pop,wt_fam,wt_hh,wt_hu
0,G100001090444072500423009999999999921,G1000010043202,10001043202,1.0,1.0,1.0,1.0
1,G100001090444444300422009999999999926,G1000010042202,10001042202,1.0,1.0,1.0,1.0
2,G100001090444612650422009999999219011,G1000010041200,10001041200,0.0,0.0,0.0,0.0
3,G100001090444612650422009999999219011,G1000010042201,10001042201,1.0,1.0,1.0,1.0
4,G100001090444612650422009999999219012,G1000010042201,10001042201,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
1058,G100005093552999990515009999999999923,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1059,G100005093552999990515009999999999924,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1060,G100005093552999990516009999999999921,G1000050051702,10005051702,1.0,1.0,1.0,1.0
1061,G340033010610106000204029999999916014,G1000030990100,10003990100,0.0,0.0,0.0,0.0


### Prepare a single data product with a `README.txt`

In [13]:
# Pull the finished crosswalk table and its generated name off the object.
xwalk = bgp1990_to_tr2010.xwalk
xwalk_name = bgp1990_to_tr2010.xwalk_name

# Everything before the last "_" of the name (empty string if there is none);
# used below to name the output directory.
xwalk_name_base = xwalk_name.rpartition("_")[0]

In [14]:
# Output location: <out_data_dir><xwalk_name_base>_state/<xwalk_name>
out_data_dir = "../../crosswalks/"
out_path = "%s%s_state/%s" % (out_data_dir, xwalk_name_base, xwalk_name)

# Write the data product for this crosswalk to ``out_path``.
nhgisxwalk.prepare_data_product(xwalk, xwalk_name, out_path, remove=True)

### Read crosswalk from a `.zip` archive

In [15]:
# Directory the data product was written to in the previous step.
in_path = "%s%s_state/" % (out_data_dir, xwalk_name_base)

# Every column that is not a weight ("wt*") column is an identifier; request
# the ``nhgisxwalk.str_types`` dtype mapping for those columns.
id_cols = [column for column in xwalk.columns if not column.startswith("wt")]
data_types = nhgisxwalk.str_types(id_cols)

# Same two keyword groups as for the base crosswalk: archive handling and
# CSV parsing.
from_csv_kws = dict(path=in_path, archived=True, remove_unpacked=True)
read_csv_kws = dict(dtype=data_types)

bgp1990_to_tr2010_df = nhgisxwalk.xwalk_df_from_csv(
    xwalk_name, **from_csv_kws, **read_csv_kws
)
bgp1990_to_tr2010_df

Unnamed: 0,bgp1990gj,tr2010gj,tr2010ge,wt_pop,wt_fam,wt_hh,wt_hu
0,G100001090444072500423009999999999921,G1000010043202,10001043202,1.0,1.0,1.0,1.0
1,G100001090444444300422009999999999926,G1000010042202,10001042202,1.0,1.0,1.0,1.0
2,G100001090444612650422009999999219011,G1000010041200,10001041200,0.0,0.0,0.0,0.0
3,G100001090444612650422009999999219011,G1000010042201,10001042201,1.0,1.0,1.0,1.0
4,G100001090444612650422009999999219012,G1000010042201,10001042201,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
1058,G100005093552999990515009999999999923,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1059,G100005093552999990515009999999999924,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1060,G100005093552999990516009999999999921,G1000050051702,10005051702,1.0,1.0,1.0,1.0
1061,G340033010610106000204029999999916014,G1000030990100,10003990100,0.0,0.0,0.0,0.0


-----------------------------------------------