{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import numpy as np\n", "import orca\n", "from urbansim.models import RegressionModel\n", "from urbansim.utils import misc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set the location of the HDFStore as an injectable called \"store\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "orca.add_injectable(\"store\", pd.HDFStore(os.path.join(misc.data_dir(), \"sanfran_public.h5\"), mode=\"r\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Specify table sources and broadcasts that will be used later" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "@orca.table('buildings')\n", "def buildings(store):\n", " df = store['buildings']\n", " return df\n", "\n", "@orca.table('zones')\n", "def zones(store):\n", " df = store['zones']\n", " return df\n", "\n", "@orca.table('households')\n", "def households(store):\n", " df = store['households']\n", " return df\n", "\n", "@orca.table('parcels')\n", "def parcels(store):\n", " df = store['parcels']\n", " return df\n", "\n", "orca.broadcast('zones', 'buildings', cast_index=True, onto_on='zone_id')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Specify the computed columns" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "@orca.column('households', 'income_quartile', cache=True)\n", "def income_quartile(households):\n", " return pd.Series(pd.qcut(households.income, 4).labels,\n", " index=households.index)\n", "\n", "@orca.column('households', 'zone_id', cache=True)\n", "def zone_id(households, buildings):\n", " return misc.reindex(buildings.zone_id, households.building_id)\n", "\n", "@orca.column('zones', 'ave_unit_sqft')\n", "def ave_unit_sqft(buildings, zones):\n", " s = buildings.unit_sqft[buildings.general_type == \"Residential\"]\\\n", " .groupby(buildings.zone_id).quantile().apply(np.log1p)\n", " return s.reindex(zones.index).fillna(s.quantile())\n", "\n", "@orca.column('zones', 'ave_lot_sqft')\n", "def ave_lot_sqft(buildings, zones):\n", " s = buildings.unit_lot_size.groupby(buildings.zone_id).quantile().apply(np.log1p)\n", " return s.reindex(zones.index).fillna(s.quantile())\n", "\n", "@orca.column('zones', 'sum_residential_units')\n", "def sum_residential_units(buildings):\n", " return buildings.residential_units.groupby(buildings.zone_id).sum().apply(np.log1p)\n", "\n", "@orca.column('zones', 'ave_income')\n", "def ave_income(households, zones):\n", " s = households.income.groupby(households.zone_id).quantile().apply(np.log1p)\n", " return s.reindex(zones.index).fillna(s.quantile())\n", "\n", "orca.add_injectable(\"building_type_map\", {\n", " 1: \"Residential\",\n", " 2: \"Residential\",\n", " 3: \"Residential\",\n", " 4: \"Office\",\n", " 5: \"Hotel\",\n", " 6: \"School\",\n", " 7: \"Industrial\",\n", " 8: \"Industrial\",\n", " 9: \"Industrial\",\n", " 10: \"Retail\",\n", " 11: \"Retail\",\n", " 12: \"Residential\",\n", " 13: \"Retail\",\n", " 14: \"Office\"\n", "})\n", "\n", "@orca.column('buildings', 'zone_id', cache=True)\n", "def zone_id(buildings, parcels):\n", " return misc.reindex(parcels.zone_id, buildings.parcel_id)\n", "\n", "@orca.column('buildings', 'general_type', cache=True)\n", "def general_type(buildings, building_type_map):\n", " return buildings.building_type_id.map(building_type_map)\n", "\n", "@orca.column('buildings', 'unit_sqft', cache=True)\n", "def unit_sqft(buildings):\n", " return buildings.building_sqft / buildings.residential_units.replace(0, 1)\n", "\n", "@orca.column('buildings', 'unit_lot_size', cache=True)\n", "def unit_lot_size(buildings, parcels):\n", " return misc.reindex(parcels.parcel_size, buildings.parcel_id) / \\\n", " buildings.residential_units.replace(0, 1)\n", " \n", "@orca.column('parcels', 'parcel_size', cache=True)\n", "def parcel_size(parcels):\n", " return parcels.shape_area * 10.764" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Configure the model" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "rm = RegressionModel(\n", " fit_filters=[\n", " 'unit_lot_size > 0',\n", " 'year_built > 1000',\n", " 'year_built < 2020',\n", " 'unit_sqft > 100',\n", " 'unit_sqft < 20000'\n", " ],\n", " predict_filters=[\n", " \"general_type == 'Residential'\"\n", " ],\n", " model_expression='np.log1p(residential_sales_price) ~ I(year_built < 1940)'\n", " '+ I(year_built > 2005) + np.log1p(unit_sqft) + np.log1p(unit_lot_size)'\n", " '+ sum_residential_units + ave_lot_sqft + ave_unit_sqft + ave_income',\n", " ytransform = np.exp\n", ") " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get the data - merge buildings and zones (notice UrbanSum does the merge based on the broadcast)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "merged_df = orca.merge_tables(target=\"buildings\", tables=[\"buildings\", \"zones\"], columns=rm.columns_used()) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fill nans - UrbanSim wants you to take care of nans" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import utils\n", "merged_df[\"year_built\"] = merged_df.year_built.fillna(merged_df.year_built.quantile())\n", "merged_df[\"residential_sales_price\"] = merged_df.residential_sales_price.fillna(0)\n", "merged_df[\"general_type\"] = merged_df.general_type.fillna(merged_df.general_type.value_counts().idxmax())\n", "_ = utils.deal_with_nas(merged_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fit and report" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
OLS Regression Results
Dep. Variable: np.log1p(residential_sales_price) R-squared: 0.399
Model: OLS Adj. R-squared: 0.399
Method: Least Squares F-statistic: 1.240e+04
Date: Tue, 28 Apr 2015 Prob (F-statistic): 0.00
Time: 10:38:22 Log-Likelihood: -2.5244e+05
No. Observations: 149409 AIC: 5.049e+05
Df Residuals: 149400 BIC: 5.050e+05
Df Model: 8
Covariance Type: nonrobust
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err t P>|t| [95.0% Conf. Int.]
Intercept -3.8664 0.170 -22.761 0.000 -4.199 -3.533
I(year_built < 1940)[T.True] 0.0011 0.007 0.144 0.885 -0.013 0.016
I(year_built > 2005)[T.True] -0.0632 0.049 -1.298 0.194 -0.159 0.032
np.log1p(unit_sqft) -1.4830 0.007 -210.907 0.000 -1.497 -1.469
np.log1p(unit_lot_size) -0.1476 0.006 -23.219 0.000 -0.160 -0.135
sum_residential_units 0.0986 0.008 11.972 0.000 0.082 0.115
ave_lot_sqft -0.2179 0.010 -21.293 0.000 -0.238 -0.198
ave_unit_sqft 0.9376 0.020 45.828 0.000 0.898 0.978
ave_income 1.4192 0.015 92.812 0.000 1.389 1.449
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Omnibus: 89673.667 Durbin-Watson: 1.798
Prob(Omnibus): 0.000 Jarque-Bera (JB): 819859.564
Skew: -2.841 Prob(JB): 0.00
Kurtosis: 12.970 Cond. No. 1.01e+03
" ], "text/plain": [ "\n", "\"\"\"\n", " OLS Regression Results \n", "=============================================================================================\n", "Dep. Variable: np.log1p(residential_sales_price) R-squared: 0.399\n", "Model: OLS Adj. R-squared: 0.399\n", "Method: Least Squares F-statistic: 1.240e+04\n", "Date: Tue, 28 Apr 2015 Prob (F-statistic): 0.00\n", "Time: 10:38:22 Log-Likelihood: -2.5244e+05\n", "No. Observations: 149409 AIC: 5.049e+05\n", "Df Residuals: 149400 BIC: 5.050e+05\n", "Df Model: 8 \n", "Covariance Type: nonrobust \n", "================================================================================================\n", " coef std err t P>|t| [95.0% Conf. Int.]\n", "------------------------------------------------------------------------------------------------\n", "Intercept -3.8664 0.170 -22.761 0.000 -4.199 -3.533\n", "I(year_built < 1940)[T.True] 0.0011 0.007 0.144 0.885 -0.013 0.016\n", "I(year_built > 2005)[T.True] -0.0632 0.049 -1.298 0.194 -0.159 0.032\n", "np.log1p(unit_sqft) -1.4830 0.007 -210.907 0.000 -1.497 -1.469\n", "np.log1p(unit_lot_size) -0.1476 0.006 -23.219 0.000 -0.160 -0.135\n", "sum_residential_units 0.0986 0.008 11.972 0.000 0.082 0.115\n", "ave_lot_sqft -0.2179 0.010 -21.293 0.000 -0.238 -0.198\n", "ave_unit_sqft 0.9376 0.020 45.828 0.000 0.898 0.978\n", "ave_income 1.4192 0.015 92.812 0.000 1.389 1.449\n", "==============================================================================\n", "Omnibus: 89673.667 Durbin-Watson: 1.798\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 819859.564\n", "Skew: -2.841 Prob(JB): 0.00\n", "Kurtosis: 12.970 Cond. No. 1.01e+03\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 1.01e+03. This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n", "\"\"\"" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rm.fit(merged_df).summary()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict and report" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "count 140782.000000\n", "mean 1157.915780\n", "std 41536.026751\n", "min 0.001362\n", "25% 252.965573\n", "50% 380.341584\n", "75% 554.749517\n", "max 12517261.710714\n", "dtype: float64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rm.predict(merged_df).describe()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }