{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import orca\n",
    "from urbansim.models import RegressionModel\n",
    "from urbansim.utils import misc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set the location of the HDFStore as an injectable called \"store\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "orca.add_injectable(\"store\", pd.HDFStore(os.path.join(misc.data_dir(), \"sanfran_public.h5\"), mode=\"r\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Specify table sources and broadcasts that will be used later"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "@orca.table('buildings')\n",
    "def buildings(store):\n",
    "    df = store['buildings']\n",
    "    return df\n",
    "\n",
    "@orca.table('zones')\n",
    "def zones(store):\n",
    "    df = store['zones']\n",
    "    return df\n",
    "\n",
    "@orca.table('households')\n",
    "def households(store):\n",
    "    df = store['households']\n",
    "    return df\n",
    "\n",
    "@orca.table('parcels')\n",
    "def parcels(store):\n",
    "    df = store['parcels']\n",
    "    return df\n",
    "\n",
    "orca.broadcast('zones', 'buildings', cast_index=True, onto_on='zone_id')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Specify the computed columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "@orca.column('households', 'income_quartile', cache=True)\n",
    "def income_quartile(households):\n",
    "    return pd.Series(pd.qcut(households.income, 4).labels,\n",
    "                     index=households.index)\n",
    "\n",
    "@orca.column('households', 'zone_id', cache=True)\n",
    "def zone_id(households, buildings):\n",
    "    return misc.reindex(buildings.zone_id, households.building_id)\n",
    "\n",
    "@orca.column('zones', 'ave_unit_sqft')\n",
    "def ave_unit_sqft(buildings, zones):\n",
    "    s = buildings.unit_sqft[buildings.general_type == \"Residential\"]\\\n",
    "        .groupby(buildings.zone_id).quantile().apply(np.log1p)\n",
    "    return s.reindex(zones.index).fillna(s.quantile())\n",
    "\n",
    "@orca.column('zones', 'ave_lot_sqft')\n",
    "def ave_lot_sqft(buildings, zones):\n",
    "    s = buildings.unit_lot_size.groupby(buildings.zone_id).quantile().apply(np.log1p)\n",
    "    return s.reindex(zones.index).fillna(s.quantile())\n",
    "\n",
    "@orca.column('zones', 'sum_residential_units')\n",
    "def sum_residential_units(buildings):\n",
    "    return buildings.residential_units.groupby(buildings.zone_id).sum().apply(np.log1p)\n",
    "\n",
    "@orca.column('zones', 'ave_income')\n",
    "def ave_income(households, zones):\n",
    "    s = households.income.groupby(households.zone_id).quantile().apply(np.log1p)\n",
    "    return s.reindex(zones.index).fillna(s.quantile())\n",
    "\n",
    "orca.add_injectable(\"building_type_map\", {\n",
    "    1: \"Residential\",\n",
    "    2: \"Residential\",\n",
    "    3: \"Residential\",\n",
    "    4: \"Office\",\n",
    "    5: \"Hotel\",\n",
    "    6: \"School\",\n",
    "    7: \"Industrial\",\n",
    "    8: \"Industrial\",\n",
    "    9: \"Industrial\",\n",
    "    10: \"Retail\",\n",
    "    11: \"Retail\",\n",
    "    12: \"Residential\",\n",
    "    13: \"Retail\",\n",
    "    14: \"Office\"\n",
    "})\n",
    "\n",
    "@orca.column('buildings', 'zone_id', cache=True)\n",
    "def zone_id(buildings, parcels):\n",
    "    return misc.reindex(parcels.zone_id, buildings.parcel_id)\n",
    "\n",
    "@orca.column('buildings', 'general_type', cache=True)\n",
    "def general_type(buildings, building_type_map):\n",
    "    return buildings.building_type_id.map(building_type_map)\n",
    "\n",
    "@orca.column('buildings', 'unit_sqft', cache=True)\n",
    "def unit_sqft(buildings):\n",
    "    return buildings.building_sqft / buildings.residential_units.replace(0, 1)\n",
    "\n",
    "@orca.column('buildings', 'unit_lot_size', cache=True)\n",
    "def unit_lot_size(buildings, parcels):\n",
    "    return misc.reindex(parcels.parcel_size, buildings.parcel_id) / \\\n",
    "        buildings.residential_units.replace(0, 1)\n",
    "    \n",
    "@orca.column('parcels', 'parcel_size', cache=True)\n",
    "def parcel_size(parcels):\n",
    "    return parcels.shape_area * 10.764"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configure the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "rm = RegressionModel(\n",
    "    fit_filters=[\n",
    "        'unit_lot_size > 0',\n",
    "        'year_built > 1000',\n",
    "        'year_built < 2020',\n",
    "        'unit_sqft > 100',\n",
    "        'unit_sqft < 20000'\n",
    "    ],\n",
    "    predict_filters=[\n",
    "        \"general_type == 'Residential'\"\n",
    "    ],\n",
    "    model_expression='np.log1p(residential_sales_price) ~ I(year_built < 1940)'\n",
    "        '+ I(year_built > 2005) + np.log1p(unit_sqft) + np.log1p(unit_lot_size)'\n",
    "        '+ sum_residential_units + ave_lot_sqft + ave_unit_sqft + ave_income',\n",
    "    ytransform = np.exp\n",
    ")     "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get the data - merge buildings and zones  (notice UrbanSum does the merge based on the broadcast)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "merged_df = orca.merge_tables(target=\"buildings\", tables=[\"buildings\", \"zones\"], columns=rm.columns_used()) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fill nans - UrbanSim wants you to take care of nans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import utils\n",
    "merged_df[\"year_built\"] = merged_df.year_built.fillna(merged_df.year_built.quantile())\n",
    "merged_df[\"residential_sales_price\"] = merged_df.residential_sales_price.fillna(0)\n",
    "merged_df[\"general_type\"] = merged_df.general_type.fillna(merged_df.general_type.value_counts().idxmax())\n",
    "_ = utils.deal_with_nas(merged_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fit and report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>    <td>np.log1p(residential_sales_price)</td> <th>  R-squared:         </th>  <td>   0.399</td>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                           <td>OLS</td>                <th>  Adj. R-squared:    </th>  <td>   0.399</td>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>                     <td>Least Squares</td>           <th>  F-statistic:       </th>  <td>1.240e+04</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>                     <td>Tue, 28 Apr 2015</td>          <th>  Prob (F-statistic):</th>   <td>  0.00</td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                         <td>10:38:22</td>              <th>  Log-Likelihood:    </th> <td>-2.5244e+05</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>              <td>149409</td>               <th>  AIC:               </th>  <td>5.049e+05</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>                  <td>149400</td>               <th>  BIC:               </th>  <td>5.050e+05</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>                      <td>     8</td>               <th>                     </th>      <td> </td>     \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>              <td>nonrobust</td>             <th>                     </th>      <td> </td>     \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "                <td></td>                  <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th> <th>[95.0% Conf. Int.]</th> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Intercept</th>                    <td>   -3.8664</td> <td>    0.170</td> <td>  -22.761</td> <td> 0.000</td> <td>   -4.199    -3.533</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>I(year_built < 1940)[T.True]</th> <td>    0.0011</td> <td>    0.007</td> <td>    0.144</td> <td> 0.885</td> <td>   -0.013     0.016</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>I(year_built > 2005)[T.True]</th> <td>   -0.0632</td> <td>    0.049</td> <td>   -1.298</td> <td> 0.194</td> <td>   -0.159     0.032</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>np.log1p(unit_sqft)</th>          <td>   -1.4830</td> <td>    0.007</td> <td> -210.907</td> <td> 0.000</td> <td>   -1.497    -1.469</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>np.log1p(unit_lot_size)</th>      <td>   -0.1476</td> <td>    0.006</td> <td>  -23.219</td> <td> 0.000</td> <td>   -0.160    -0.135</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>sum_residential_units</th>        <td>    0.0986</td> <td>    0.008</td> <td>   11.972</td> <td> 0.000</td> <td>    0.082     0.115</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>ave_lot_sqft</th>                 <td>   -0.2179</td> <td>    0.010</td> <td>  -21.293</td> <td> 0.000</td> <td>   -0.238    -0.198</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>ave_unit_sqft</th>                <td>    0.9376</td> <td>    0.020</td> <td>   45.828</td> <td> 0.000</td> <td>    0.898     0.978</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>ave_income</th>                   <td>    1.4192</td> <td>    0.015</td> <td>   92.812</td> <td> 0.000</td> <td>    1.389     1.449</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>89673.667</td> <th>  Durbin-Watson:     </th>  <td>   1.798</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th>  <td> 0.000</td>   <th>  Jarque-Bera (JB):  </th> <td>819859.564</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>           <td>-2.841</td>   <th>  Prob(JB):          </th>  <td>    0.00</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>       <td>12.970</td>   <th>  Cond. No.          </th>  <td>1.01e+03</td> \n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                                    OLS Regression Results                                   \n",
       "=============================================================================================\n",
       "Dep. Variable:     np.log1p(residential_sales_price)   R-squared:                       0.399\n",
       "Model:                                           OLS   Adj. R-squared:                  0.399\n",
       "Method:                                Least Squares   F-statistic:                 1.240e+04\n",
       "Date:                               Tue, 28 Apr 2015   Prob (F-statistic):               0.00\n",
       "Time:                                       10:38:22   Log-Likelihood:            -2.5244e+05\n",
       "No. Observations:                             149409   AIC:                         5.049e+05\n",
       "Df Residuals:                                 149400   BIC:                         5.050e+05\n",
       "Df Model:                                          8                                         \n",
       "Covariance Type:                           nonrobust                                         \n",
       "================================================================================================\n",
       "                                   coef    std err          t      P>|t|      [95.0% Conf. Int.]\n",
       "------------------------------------------------------------------------------------------------\n",
       "Intercept                       -3.8664      0.170    -22.761      0.000        -4.199    -3.533\n",
       "I(year_built < 1940)[T.True]     0.0011      0.007      0.144      0.885        -0.013     0.016\n",
       "I(year_built > 2005)[T.True]    -0.0632      0.049     -1.298      0.194        -0.159     0.032\n",
       "np.log1p(unit_sqft)             -1.4830      0.007   -210.907      0.000        -1.497    -1.469\n",
       "np.log1p(unit_lot_size)         -0.1476      0.006    -23.219      0.000        -0.160    -0.135\n",
       "sum_residential_units            0.0986      0.008     11.972      0.000         0.082     0.115\n",
       "ave_lot_sqft                    -0.2179      0.010    -21.293      0.000        -0.238    -0.198\n",
       "ave_unit_sqft                    0.9376      0.020     45.828      0.000         0.898     0.978\n",
       "ave_income                       1.4192      0.015     92.812      0.000         1.389     1.449\n",
       "==============================================================================\n",
       "Omnibus:                    89673.667   Durbin-Watson:                   1.798\n",
       "Prob(Omnibus):                  0.000   Jarque-Bera (JB):           819859.564\n",
       "Skew:                          -2.841   Prob(JB):                         0.00\n",
       "Kurtosis:                      12.970   Cond. No.                     1.01e+03\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "[2] The condition number is large, 1.01e+03. This might indicate that there are\n",
       "strong multicollinearity or other numerical problems.\n",
       "\"\"\""
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rm.fit(merged_df).summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict and report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count      140782.000000\n",
       "mean         1157.915780\n",
       "std         41536.026751\n",
       "min             0.001362\n",
       "25%           252.965573\n",
       "50%           380.341584\n",
       "75%           554.749517\n",
       "max      12517261.710714\n",
       "dtype: float64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rm.predict(merged_df).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}