{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This file is part of the Minnesota Population Center's NHGISXWALK.\n",
    "# For copyright and licensing information, see the NOTICE and LICENSE files\n",
    "# in this project's top-level directory, and also on-line at:\n",
    "#   https://github.com/ipums/nhgisxwalk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate national and state-level crosswalks\n",
    "## 2000 block group parts to 2010 county\n",
    "\n",
    "### NHGIS [block crosswalks](https://www.nhgis.org/user-resources/geographic-crosswalks)\n",
    "\n",
    "**James D. Gaboardi, 06/2020**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:19.285459Z",
     "start_time": "2020-10-01T21:53:19.140633Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2020-10-01T17:53:19-04:00\n",
      "\n",
      "CPython 3.8.5\n",
      "IPython 7.18.1\n",
      "\n",
      "compiler   : Clang 10.0.1 \n",
      "system     : Darwin\n",
      "release    : 19.6.0\n",
      "machine    : x86_64\n",
      "processor  : i386\n",
      "CPU cores  : 8\n",
      "interpreter: 64bit\n"
     ]
    }
   ],
   "source": [
    "%load_ext watermark\n",
    "%watermark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:19.555408Z",
     "start_time": "2020-10-01T21:53:19.287875Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "watermark 2.0.2\n",
      "pandas     1.1.1\n",
      "numpy      1.19.1\n",
      "nhgisxwalk 0.0.9post1\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import nhgisxwalk\n",
    "import inspect\n",
    "import numpy\n",
    "import pandas\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%watermark -w\n",
    "%watermark -iv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Source and target years for the crosswalk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:19.574264Z",
     "start_time": "2020-10-01T21:53:19.558483Z"
    }
   },
   "outputs": [],
   "source": [
    "source_year, target_year = \"2000\", \"2010\"\n",
    "gj_src, gj_trg = \"GJOIN%s\"%source_year, \"GJOIN%s\"%target_year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:19.592374Z",
     "start_time": "2020-10-01T21:53:19.576945Z"
    }
   },
   "outputs": [],
   "source": [
    "# Set these to a local directory\n",
    "data_in = \"path/to/data/\"\n",
    "data_tab = \"path/to/data/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:19.608098Z",
     "start_time": "2020-10-01T21:53:19.594045Z"
    }
   },
   "outputs": [],
   "source": [
    "block_file = \"%s_block\" % source_year"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Source-target building base"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:33.317361Z",
     "start_time": "2020-10-01T21:53:19.609436Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GJOIN2000</th>\n",
       "      <th>GJOIN2010</th>\n",
       "      <th>WEIGHT</th>\n",
       "      <th>PAREA</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>G01000100201001000</td>\n",
       "      <td>G01000100201002000</td>\n",
       "      <td>0.035897</td>\n",
       "      <td>0.008988</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>G01000100201001000</td>\n",
       "      <td>G01000100201002001</td>\n",
       "      <td>0.253330</td>\n",
       "      <td>0.263725</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>G01000100201001000</td>\n",
       "      <td>G01000100201002002</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000385</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>G01000100201001000</td>\n",
       "      <td>G01000100201002003</td>\n",
       "      <td>0.076297</td>\n",
       "      <td>0.055430</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>G01000100201001000</td>\n",
       "      <td>G01000100201002004</td>\n",
       "      <td>0.032441</td>\n",
       "      <td>0.007543</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            GJOIN2000           GJOIN2010    WEIGHT     PAREA\n",
       "0  G01000100201001000  G01000100201002000  0.035897  0.008988\n",
       "1  G01000100201001000  G01000100201002001  0.253330  0.263725\n",
       "2  G01000100201001000  G01000100201002002  0.000000  0.000385\n",
       "3  G01000100201001000  G01000100201002003  0.076297  0.055430\n",
       "4  G01000100201001000  G01000100201002004  0.032441  0.007543"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_xwalk_name = \"nhgis_blk%s_blk%s_gj\" % (source_year, target_year)\n",
    "data_types = nhgisxwalk.str_types([gj_src, gj_trg])\n",
    "from_csv_kws = {\"path\": data_in, \"archived\": True, \"remove_unpacked\": True}\n",
    "read_csv_kws = {\"dtype\": data_types}\n",
    "base_xwalk = nhgisxwalk.xwalk_df_from_csv(\n",
    "    base_xwalk_name, **from_csv_kws, **read_csv_kws\n",
    ")\n",
    "base_xwalk.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Source summary data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:33.333064Z",
     "start_time": "2020-10-01T21:53:33.319487Z"
    }
   },
   "outputs": [],
   "source": [
    "base_source_name = \"%s/%s.csv\" % (block_file, block_file)\n",
    "base_source_file = \"%s%s\" % (data_tab, base_source_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convenience code shorthand/lookup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:33.350663Z",
     "start_time": "2020-10-01T21:53:33.336507Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'block': 'blk',\n",
       " 'block group part': 'bgp',\n",
       " 'block group': 'bg',\n",
       " 'tract': 'tr',\n",
       " 'county': 'co'}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nhgisxwalk.valid_geo_shorthand(shorthand_name=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set the `nhgisxwalk.GeoCrossWalk` parameters\n",
    "##### see [nhgisxwalk.GeoCrossWalk](https://github.com/ipums/nhgisxwalk/blob/92b4fe55de0a9c53d0315dcda8ec121faaf20aef/nhgisxwalk/geocrosswalk.py#L19) for full details"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:33.367949Z",
     "start_time": "2020-10-01T21:53:33.352761Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Persons': {'Persons': 'Universe',\n",
       "  'NP001A': 'Source code',\n",
       "  'FXS': 'NHGIS code',\n",
       "  'Total': 'FXS001'},\n",
       " 'Families': {'Families': 'Universe',\n",
       "  'NP031A': 'Source code',\n",
       "  'F2V': 'NHGIS code',\n",
       "  'Total': 'F2V001'},\n",
       " 'Households': {'Households': 'Universe',\n",
       "  'NP010A': 'Source code',\n",
       "  'FY4': 'NHGIS code',\n",
       "  'Total': 'FY4001'},\n",
       " 'Housing Units': {'Housing Units': 'Universe',\n",
       "  'NH001A': 'Source code',\n",
       "  'FV5': 'NHGIS code',\n",
       "  'Total': 'FV5001'}}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nhgisxwalk.desc_code_2000_SF1b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:33.384148Z",
     "start_time": "2020-10-01T21:53:33.369508Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['FXS001', 'F2V001', 'FY4001', 'FV5001']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "input_vars = [\n",
    "    nhgisxwalk.desc_code_2000_SF1b[\"Persons\"][\"Total\"],\n",
    "    nhgisxwalk.desc_code_2000_SF1b[\"Families\"][\"Total\"],\n",
    "    nhgisxwalk.desc_code_2000_SF1b[\"Households\"][\"Total\"],\n",
    "    nhgisxwalk.desc_code_2000_SF1b[\"Housing Units\"][\"Total\"]\n",
    "]\n",
    "input_vars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:33.399417Z",
     "start_time": "2020-10-01T21:53:33.385512Z"
    }
   },
   "outputs": [],
   "source": [
    "input_var_tags = [\"pop\", \"fam\", \"hh\", \"hu\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:53:33.414885Z",
     "start_time": "2020-10-01T21:53:33.400734Z"
    }
   },
   "outputs": [],
   "source": [
    "xwalk_args = {\n",
    "    \"source_year\": source_year,\n",
    "    \"target_year\": target_year,\n",
    "    \"source_geo\": \"bgp\",\n",
    "    \"target_geo\": \"co\",\n",
    "    \"base_source_table\": base_source_file,\n",
    "    \"input_var\": input_vars,\n",
    "    \"weight_var\": input_var_tags,\n",
    "    \"keep_base\": False,\n",
    "    \"add_geoid\": True\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generate data product\n",
    "1. Create a national crosswalk then split by state \n",
    "2. Write out all products with `README.txt` files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-10-01T21:55:39.351727Z",
     "start_time": "2020-10-01T21:53:33.416175Z"
    }
   },
   "outputs": [],
   "source": [
    "nhgisxwalk.generate_data_product(base_xwalk, xwalk_args, data_in)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "-----------------------------------------------"
   ]
  }
 ],
 "metadata": {
  "_draft": {
   "nbviewer_url": "https://gist.github.com/9f47e4ec2cc37bce83acf20abfca69d2"
  },
  "gist": {
   "data": {
    "description": "sample-workflow.ipynb",
    "public": true
   },
   "id": "9f47e4ec2cc37bce83acf20abfca69d2"
  },
  "kernelspec": {
   "display_name": "Python [conda env:nhgis] *",
   "language": "python",
   "name": "conda-env-nhgis-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}