{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# weighted-portion-synthetic-atoms\n", "\n", " ### toy example" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-06-21T01:05:26.334914Z", "start_time": "2020-06-21T01:05:26.314784Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-06-20T21:05:26-04:00\n", "\n", "CPython 3.7.6\n", "IPython 7.15.0\n", "\n", "compiler : Clang 9.0.1 \n", "system : Darwin\n", "release : 19.5.0\n", "machine : x86_64\n", "processor : i386\n", "CPU cores : 8\n", "interpreter: 64bit\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-06-21T01:05:26.634846Z", "start_time": "2020-06-21T01:05:26.337573Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "watermark 2.0.2\n", "nhgisxwalk 0.0.4\n", "\n" ] } ], "source": [ "import nhgisxwalk\n", "import inspect\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%watermark -w\n", "%watermark -iv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-06-21T01:05:26.663128Z", "start_time": "2020-06-21T01:05:26.639214Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "def calculate_atoms(\n", " df,\n", " weight=None,\n", " input_var=None,\n", " weight_var=None,\n", " weight_prefix=None,\n", " source_id=None,\n", " groupby_cols=None,\n", " overwrite_attrs=None,\n", "):\n", " \"\"\"Calculate the atoms (intersecting parts) of census geographies\n", " and interpolate a proportional weight of the source attribute that\n", " lies within the target geography.\n", " \n", " Parameters\n", " ----------\n", " \n", " df : pandas.DataFrame\n", " The input data. 
See ``GeoCrossWalk.base``.\n", " \n", " weight : str\n", " The weight colum name(s).\n", " \n", " input_var : str or iterable\n", " The input variable column name(s).\n", " \n", " weight_var : str or iterable\n", " The groupby and summed variable column name(s).\n", " \n", " weight_prefix : str\n", " Prepend this prefix to the the ``weight_var`` column name.\n", " \n", " source_id : str\n", " The source ID column name.\n", " \n", " groupby_cols : list\n", " The dataframe columns on which to perform groupby.\n", " \n", " overwrite_attrs : None or GeoCrossWalk\n", " Setting this parameter to a ``GeoCrossWalk`` object overwrites the\n", " ``input_var`` and ``weight_var`` attributes. Default is ``None``.\n", " \n", " Returns\n", " -------\n", " \n", " atoms : pandas.DataFrame\n", " All intersections between ``source`` and ``target`` geographies, and \n", " the interpolated weight calculations for the propotion of\n", " source area attributes that are in the target area.\n", " \n", " Notes\n", " -----\n", " \n", " See example 1 in the ``GeoCrossWalk`` Examples section.\n", " \n", " \"\"\"\n", "\n", " # confirm variable data types\n", " input_var, weight_var = _check_vars(input_var), _check_vars(weight_var)\n", "\n", " # determine length of variable lists\n", " n_input_var, n_weight_var = len(input_var), len(weight_var)\n", "\n", " # check variable lists are equal length\n", " if n_input_var != n_weight_var:\n", " msg = \"The 'input_var' and 'weight_var' should be the same length. 
\"\n", " msg += \"%s != %s\" % (n_input_var, n_weight_var)\n", " raise RuntimeError(msg)\n", "\n", " # add prefix (if desired)\n", " weight_col = _weight_columns(weight_prefix if weight_prefix else \"\", weight_var)\n", "\n", " if str(overwrite_attrs) != \"None\":\n", " overwrite_attrs.input_var = input_var\n", " overwrite_attrs.weight_col = weight_col\n", "\n", " # iterate over each pair of input/interpolation variables\n", " for ix, (ivar, wvar) in enumerate(zip(input_var, weight_col)):\n", "\n", " # calculate numerators\n", " df[wvar] = df[weight] * df[ivar]\n", " if ix == 0:\n", " # on the first iteration create an atom dataframe\n", " atoms = df.groupby(groupby_cols)[wvar].sum().to_frame()\n", " atoms.reset_index(inplace=True)\n", " else:\n", " # on tsubsequent iterations add weights as a column\n", " atoms[wvar] = df.groupby(groupby_cols)[wvar].sum().values\n", "\n", " # calculate denominators\n", " denominators = atoms.groupby(source_id)[wvar].sum()\n", "\n", " # interpolate weights\n", " atoms[wvar] = atoms[wvar] / atoms[source_id].map(denominators)\n", "\n", " # if any weights are NaN, replace with 0.\n", " atoms[wvar].fillna(0.0, inplace=True)\n", "\n", " return atoms\n", "\n" ] } ], "source": [ "print(inspect.getsource(nhgisxwalk.calculate_atoms))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### toy data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-06-21T01:05:26.703147Z", "start_time": "2020-06-21T01:05:26.665058Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990blk1990blk2010trt2010wtpop_1990hh_1990
0AA.1X.1X1.060.025.0
1AA.2X.2X0.3100.040.0
2AA.2Y.1Y0.7100.040.0
3BB.1X.3X1.050.020.0
4BB.2Y.2Y1.080.030.0
\n", "
" ], "text/plain": [ " bgp1990 blk1990 blk2010 trt2010 wt pop_1990 hh_1990\n", "0 A A.1 X.1 X 1.0 60.0 25.0\n", "1 A A.2 X.2 X 0.3 100.0 40.0\n", "2 A A.2 Y.1 Y 0.7 100.0 40.0\n", "3 B B.1 X.3 X 1.0 50.0 20.0\n", "4 B B.2 Y.2 Y 1.0 80.0 30.0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "toy_df = nhgisxwalk.example_crosswalk_data()\n", "toy_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### demo atom crosswalk calculation" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-06-21T01:05:26.745505Z", "start_time": "2020-06-21T01:05:26.705467Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bgp1990trt2010wt_popwt_hh
0AX0.5625000.569231
1AY0.4375000.430769
2BX0.3846150.400000
3BY0.6153850.600000
\n", "
" ], "text/plain": [ " bgp1990 trt2010 wt_pop wt_hh\n", "0 A X 0.562500 0.569231\n", "1 A Y 0.437500 0.430769\n", "2 B X 0.384615 0.400000\n", "3 B Y 0.615385 0.600000" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# calculate_atoms() assigns the weighted columns onto the frame it receives,\n", "# so hand it a copy to keep the toy_df displayed above unchanged.\n", "toy_atoms = nhgisxwalk.calculate_atoms(\n", " toy_df.copy(),\n", " weight=\"wt\",\n", " input_var=[\"pop_1990\", \"hh_1990\"],\n", " weight_var=[\"pop\", \"hh\"],\n", " weight_prefix=\"wt_\",\n", " source_id=\"bgp1990\",\n", " groupby_cols=[\"bgp1990\", \"trt2010\"]\n", ")\n", "toy_atoms" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-------------------" ] } ], "metadata": { "_draft": { "nbviewer_url": "https://gist.github.com/ee43d7cfc266d2bfcba379cb572107f4" }, "gist": { "data": { "description": "weighted-portion-synthetic-atoms.ipynb", "public": true }, "id": "ee43d7cfc266d2bfcba379cb572107f4" }, "kernelspec": { "display_name": "Python [conda env:nhgis]", "language": "python", "name": "conda-env-nhgis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }