{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# weighted-portion-synthetic-atoms\n",
    "\n",
    " ###  toy example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T01:05:26.334914Z",
     "start_time": "2020-06-21T01:05:26.314784Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2020-06-20T21:05:26-04:00\n",
      "\n",
      "CPython 3.7.6\n",
      "IPython 7.15.0\n",
      "\n",
      "compiler   : Clang 9.0.1 \n",
      "system     : Darwin\n",
      "release    : 19.5.0\n",
      "machine    : x86_64\n",
      "processor  : i386\n",
      "CPU cores  : 8\n",
      "interpreter: 64bit\n"
     ]
    }
   ],
   "source": [
    "%load_ext watermark\n",
    "%watermark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T01:05:26.634846Z",
     "start_time": "2020-06-21T01:05:26.337573Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "watermark 2.0.2\n",
      "nhgisxwalk 0.0.4\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import nhgisxwalk\n",
    "import inspect\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%watermark -w\n",
    "%watermark -iv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T01:05:26.663128Z",
     "start_time": "2020-06-21T01:05:26.639214Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "def calculate_atoms(\n",
      "    df,\n",
      "    weight=None,\n",
      "    input_var=None,\n",
      "    weight_var=None,\n",
      "    weight_prefix=None,\n",
      "    source_id=None,\n",
      "    groupby_cols=None,\n",
      "    overwrite_attrs=None,\n",
      "):\n",
      "    \"\"\"Calculate the atoms (intersecting parts) of census geographies\n",
      "    and interpolate a proportional weight of the source attribute that\n",
      "    lies within the target geography.\n",
      "    \n",
      "    Parameters\n",
      "    ----------\n",
      "    \n",
      "    df : pandas.DataFrame\n",
      "        The input data. See ``GeoCrossWalk.base``.\n",
      "    \n",
      "    weight : str\n",
      "        The weight colum name(s).\n",
      "    \n",
      "    input_var : str or iterable\n",
      "        The input variable column name(s).\n",
      "    \n",
      "    weight_var : str or iterable\n",
      "        The groupby and summed variable column name(s).\n",
      "    \n",
      "    weight_prefix : str\n",
      "        Prepend this prefix to the the ``weight_var`` column name.\n",
      "    \n",
      "    source_id : str\n",
      "        The source ID column name.\n",
      "    \n",
      "    groupby_cols : list\n",
      "        The dataframe columns on which to perform groupby.\n",
      "    \n",
      "    overwrite_attrs : None or GeoCrossWalk\n",
      "        Setting this parameter to a ``GeoCrossWalk`` object overwrites the\n",
      "        ``input_var`` and ``weight_var`` attributes. Default is ``None``.\n",
      "    \n",
      "    Returns\n",
      "    -------\n",
      "    \n",
      "    atoms : pandas.DataFrame\n",
      "        All intersections between ``source`` and ``target`` geographies, and \n",
      "        the interpolated weight calculations for the propotion of\n",
      "        source area attributes that are in the target area.\n",
      "    \n",
      "    Notes\n",
      "    -----\n",
      "    \n",
      "    See example 1 in the ``GeoCrossWalk`` Examples section.\n",
      "    \n",
      "    \"\"\"\n",
      "\n",
      "    # confirm variable data types\n",
      "    input_var, weight_var = _check_vars(input_var), _check_vars(weight_var)\n",
      "\n",
      "    # determine length of variable lists\n",
      "    n_input_var, n_weight_var = len(input_var), len(weight_var)\n",
      "\n",
      "    # check variable lists are equal length\n",
      "    if n_input_var != n_weight_var:\n",
      "        msg = \"The 'input_var' and 'weight_var' should be the same length. \"\n",
      "        msg += \"%s != %s\" % (n_input_var, n_weight_var)\n",
      "        raise RuntimeError(msg)\n",
      "\n",
      "    # add prefix (if desired)\n",
      "    weight_col = _weight_columns(weight_prefix if weight_prefix else \"\", weight_var)\n",
      "\n",
      "    if str(overwrite_attrs) != \"None\":\n",
      "        overwrite_attrs.input_var = input_var\n",
      "        overwrite_attrs.weight_col = weight_col\n",
      "\n",
      "    # iterate over each pair of input/interpolation variables\n",
      "    for ix, (ivar, wvar) in enumerate(zip(input_var, weight_col)):\n",
      "\n",
      "        # calculate numerators\n",
      "        df[wvar] = df[weight] * df[ivar]\n",
      "        if ix == 0:\n",
      "            # on the first iteration create an atom dataframe\n",
      "            atoms = df.groupby(groupby_cols)[wvar].sum().to_frame()\n",
      "            atoms.reset_index(inplace=True)\n",
      "        else:\n",
      "            # on tsubsequent iterations add weights as a column\n",
      "            atoms[wvar] = df.groupby(groupby_cols)[wvar].sum().values\n",
      "\n",
      "        # calculate denominators\n",
      "        denominators = atoms.groupby(source_id)[wvar].sum()\n",
      "\n",
      "        # interpolate weights\n",
      "        atoms[wvar] = atoms[wvar] / atoms[source_id].map(denominators)\n",
      "\n",
      "        # if any weights are NaN, replace with 0.\n",
      "        atoms[wvar].fillna(0.0, inplace=True)\n",
      "\n",
      "    return atoms\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(inspect.getsource(nhgisxwalk.calculate_atoms))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### toy data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T01:05:26.703147Z",
     "start_time": "2020-06-21T01:05:26.665058Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bgp1990</th>\n",
       "      <th>blk1990</th>\n",
       "      <th>blk2010</th>\n",
       "      <th>trt2010</th>\n",
       "      <th>wt</th>\n",
       "      <th>pop_1990</th>\n",
       "      <th>hh_1990</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A</td>\n",
       "      <td>A.1</td>\n",
       "      <td>X.1</td>\n",
       "      <td>X</td>\n",
       "      <td>1.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>25.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A</td>\n",
       "      <td>A.2</td>\n",
       "      <td>X.2</td>\n",
       "      <td>X</td>\n",
       "      <td>0.3</td>\n",
       "      <td>100.0</td>\n",
       "      <td>40.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A</td>\n",
       "      <td>A.2</td>\n",
       "      <td>Y.1</td>\n",
       "      <td>Y</td>\n",
       "      <td>0.7</td>\n",
       "      <td>100.0</td>\n",
       "      <td>40.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>B</td>\n",
       "      <td>B.1</td>\n",
       "      <td>X.3</td>\n",
       "      <td>X</td>\n",
       "      <td>1.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>B</td>\n",
       "      <td>B.2</td>\n",
       "      <td>Y.2</td>\n",
       "      <td>Y</td>\n",
       "      <td>1.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>30.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  bgp1990 blk1990 blk2010 trt2010   wt  pop_1990  hh_1990\n",
       "0       A     A.1     X.1       X  1.0      60.0     25.0\n",
       "1       A     A.2     X.2       X  0.3     100.0     40.0\n",
       "2       A     A.2     Y.1       Y  0.7     100.0     40.0\n",
       "3       B     B.1     X.3       X  1.0      50.0     20.0\n",
       "4       B     B.2     Y.2       Y  1.0      80.0     30.0"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "toy_df = nhgisxwalk.example_crosswalk_data()\n",
    "toy_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### demo atom crosswalk calculation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T01:05:26.745505Z",
     "start_time": "2020-06-21T01:05:26.705467Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bgp1990</th>\n",
       "      <th>trt2010</th>\n",
       "      <th>wt_pop</th>\n",
       "      <th>wt_hh</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A</td>\n",
       "      <td>X</td>\n",
       "      <td>0.562500</td>\n",
       "      <td>0.569231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A</td>\n",
       "      <td>Y</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>0.430769</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>B</td>\n",
       "      <td>X</td>\n",
       "      <td>0.384615</td>\n",
       "      <td>0.400000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>B</td>\n",
       "      <td>Y</td>\n",
       "      <td>0.615385</td>\n",
       "      <td>0.600000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  bgp1990 trt2010    wt_pop     wt_hh\n",
       "0       A       X  0.562500  0.569231\n",
       "1       A       Y  0.437500  0.430769\n",
       "2       B       X  0.384615  0.400000\n",
       "3       B       Y  0.615385  0.600000"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "toy_atoms = nhgisxwalk.calculate_atoms(\n",
    "    toy_df,\n",
    "    weight=\"wt\",\n",
    "    input_var=[\"pop_1990\", \"hh_1990\"],\n",
    "    weight_var=[\"pop\", \"hh\"],\n",
    "    weight_prefix=\"wt_\",\n",
    "    source_id=\"bgp1990\",\n",
    "    groupby_cols=[\"bgp1990\", \"trt2010\"]\n",
    ")\n",
    "toy_atoms"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "-------------------"
   ]
  }
 ],
 "metadata": {
  "_draft": {
   "nbviewer_url": "https://gist.github.com/ee43d7cfc266d2bfcba379cb572107f4"
  },
  "gist": {
   "data": {
    "description": "weighted-portion-synthetic-atoms.ipynb",
    "public": true
   },
   "id": "ee43d7cfc266d2bfcba379cb572107f4"
  },
  "kernelspec": {
   "display_name": "Python [conda env:nhgis]",
   "language": "python",
   "name": "conda-env-nhgis-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}