{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# weighted-portion-synthetic-atoms\n",
"\n",
" ### toy example"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-06-21T01:05:26.334914Z",
"start_time": "2020-06-21T01:05:26.314784Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2020-06-20T21:05:26-04:00\n",
"\n",
"CPython 3.7.6\n",
"IPython 7.15.0\n",
"\n",
"compiler : Clang 9.0.1 \n",
"system : Darwin\n",
"release : 19.5.0\n",
"machine : x86_64\n",
"processor : i386\n",
"CPU cores : 8\n",
"interpreter: 64bit\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-06-21T01:05:26.634846Z",
"start_time": "2020-06-21T01:05:26.337573Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"watermark 2.0.2\n",
"nhgisxwalk 0.0.4\n",
"\n"
]
}
],
"source": [
"import nhgisxwalk\n",
"import inspect\n",
"\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"%watermark -w\n",
"%watermark -iv"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-06-21T01:05:26.663128Z",
"start_time": "2020-06-21T01:05:26.639214Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"def calculate_atoms(\n",
" df,\n",
" weight=None,\n",
" input_var=None,\n",
" weight_var=None,\n",
" weight_prefix=None,\n",
" source_id=None,\n",
" groupby_cols=None,\n",
" overwrite_attrs=None,\n",
"):\n",
" \"\"\"Calculate the atoms (intersecting parts) of census geographies\n",
" and interpolate a proportional weight of the source attribute that\n",
" lies within the target geography.\n",
" \n",
" Parameters\n",
" ----------\n",
" \n",
" df : pandas.DataFrame\n",
" The input data. See ``GeoCrossWalk.base``.\n",
" \n",
" weight : str\n",
" The weight colum name(s).\n",
" \n",
" input_var : str or iterable\n",
" The input variable column name(s).\n",
" \n",
" weight_var : str or iterable\n",
" The groupby and summed variable column name(s).\n",
" \n",
" weight_prefix : str\n",
" Prepend this prefix to the the ``weight_var`` column name.\n",
" \n",
" source_id : str\n",
" The source ID column name.\n",
" \n",
" groupby_cols : list\n",
" The dataframe columns on which to perform groupby.\n",
" \n",
" overwrite_attrs : None or GeoCrossWalk\n",
" Setting this parameter to a ``GeoCrossWalk`` object overwrites the\n",
" ``input_var`` and ``weight_var`` attributes. Default is ``None``.\n",
" \n",
" Returns\n",
" -------\n",
" \n",
" atoms : pandas.DataFrame\n",
" All intersections between ``source`` and ``target`` geographies, and \n",
" the interpolated weight calculations for the propotion of\n",
" source area attributes that are in the target area.\n",
" \n",
" Notes\n",
" -----\n",
" \n",
" See example 1 in the ``GeoCrossWalk`` Examples section.\n",
" \n",
" \"\"\"\n",
"\n",
" # confirm variable data types\n",
" input_var, weight_var = _check_vars(input_var), _check_vars(weight_var)\n",
"\n",
" # determine length of variable lists\n",
" n_input_var, n_weight_var = len(input_var), len(weight_var)\n",
"\n",
" # check variable lists are equal length\n",
" if n_input_var != n_weight_var:\n",
" msg = \"The 'input_var' and 'weight_var' should be the same length. \"\n",
" msg += \"%s != %s\" % (n_input_var, n_weight_var)\n",
" raise RuntimeError(msg)\n",
"\n",
" # add prefix (if desired)\n",
" weight_col = _weight_columns(weight_prefix if weight_prefix else \"\", weight_var)\n",
"\n",
" if str(overwrite_attrs) != \"None\":\n",
" overwrite_attrs.input_var = input_var\n",
" overwrite_attrs.weight_col = weight_col\n",
"\n",
" # iterate over each pair of input/interpolation variables\n",
" for ix, (ivar, wvar) in enumerate(zip(input_var, weight_col)):\n",
"\n",
" # calculate numerators\n",
" df[wvar] = df[weight] * df[ivar]\n",
" if ix == 0:\n",
" # on the first iteration create an atom dataframe\n",
" atoms = df.groupby(groupby_cols)[wvar].sum().to_frame()\n",
" atoms.reset_index(inplace=True)\n",
" else:\n",
" # on tsubsequent iterations add weights as a column\n",
" atoms[wvar] = df.groupby(groupby_cols)[wvar].sum().values\n",
"\n",
" # calculate denominators\n",
" denominators = atoms.groupby(source_id)[wvar].sum()\n",
"\n",
" # interpolate weights\n",
" atoms[wvar] = atoms[wvar] / atoms[source_id].map(denominators)\n",
"\n",
" # if any weights are NaN, replace with 0.\n",
" atoms[wvar].fillna(0.0, inplace=True)\n",
"\n",
" return atoms\n",
"\n"
]
}
],
"source": [
"print(inspect.getsource(nhgisxwalk.calculate_atoms))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### toy data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-06-21T01:05:26.703147Z",
"start_time": "2020-06-21T01:05:26.665058Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bgp1990 | \n",
" blk1990 | \n",
" blk2010 | \n",
" trt2010 | \n",
" wt | \n",
" pop_1990 | \n",
" hh_1990 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A | \n",
" A.1 | \n",
" X.1 | \n",
" X | \n",
" 1.0 | \n",
" 60.0 | \n",
" 25.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" A | \n",
" A.2 | \n",
" X.2 | \n",
" X | \n",
" 0.3 | \n",
" 100.0 | \n",
" 40.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" A | \n",
" A.2 | \n",
" Y.1 | \n",
" Y | \n",
" 0.7 | \n",
" 100.0 | \n",
" 40.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" B | \n",
" B.1 | \n",
" X.3 | \n",
" X | \n",
" 1.0 | \n",
" 50.0 | \n",
" 20.0 | \n",
"
\n",
" \n",
" | 4 | \n",
" B | \n",
" B.2 | \n",
" Y.2 | \n",
" Y | \n",
" 1.0 | \n",
" 80.0 | \n",
" 30.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bgp1990 blk1990 blk2010 trt2010 wt pop_1990 hh_1990\n",
"0 A A.1 X.1 X 1.0 60.0 25.0\n",
"1 A A.2 X.2 X 0.3 100.0 40.0\n",
"2 A A.2 Y.1 Y 0.7 100.0 40.0\n",
"3 B B.1 X.3 X 1.0 50.0 20.0\n",
"4 B B.2 Y.2 Y 1.0 80.0 30.0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"toy_df = nhgisxwalk.example_crosswalk_data()\n",
"toy_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### demo atom crosswalk calculation"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-06-21T01:05:26.745505Z",
"start_time": "2020-06-21T01:05:26.705467Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bgp1990 | \n",
" trt2010 | \n",
" wt_pop | \n",
" wt_hh | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A | \n",
" X | \n",
" 0.562500 | \n",
" 0.569231 | \n",
"
\n",
" \n",
" | 1 | \n",
" A | \n",
" Y | \n",
" 0.437500 | \n",
" 0.430769 | \n",
"
\n",
" \n",
" | 2 | \n",
" B | \n",
" X | \n",
" 0.384615 | \n",
" 0.400000 | \n",
"
\n",
" \n",
" | 3 | \n",
" B | \n",
" Y | \n",
" 0.615385 | \n",
" 0.600000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bgp1990 trt2010 wt_pop wt_hh\n",
"0 A X 0.562500 0.569231\n",
"1 A Y 0.437500 0.430769\n",
"2 B X 0.384615 0.400000\n",
"3 B Y 0.615385 0.600000"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"toy_atoms = nhgisxwalk.calculate_atoms(\n",
" toy_df,\n",
" weight=\"wt\",\n",
" input_var=[\"pop_1990\", \"hh_1990\"],\n",
" weight_var=[\"pop\", \"hh\"],\n",
" weight_prefix=\"wt_\",\n",
" source_id=\"bgp1990\",\n",
" groupby_cols=[\"bgp1990\", \"trt2010\"]\n",
")\n",
"toy_atoms"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"-------------------"
]
}
],
"metadata": {
"_draft": {
"nbviewer_url": "https://gist.github.com/ee43d7cfc266d2bfcba379cb572107f4"
},
"gist": {
"data": {
"description": "weighted-portion-synthetic-atoms.ipynb",
"public": true
},
"id": "ee43d7cfc266d2bfcba379cb572107f4"
},
"kernelspec": {
"display_name": "Python [conda env:nhgis]",
"language": "python",
"name": "conda-env-nhgis-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}