{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import numpy as np\n",
"import orca\n",
"from urbansim.models import RegressionModel\n",
"from urbansim.utils import misc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set the location of the HDFStore as an injectable called \"store\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"orca.add_injectable(\"store\", pd.HDFStore(os.path.join(misc.data_dir(), \"sanfran_public.h5\"), mode=\"r\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specify table sources and broadcasts that will be used later"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"@orca.table('buildings')\n",
"def buildings(store):\n",
" df = store['buildings']\n",
" return df\n",
"\n",
"@orca.table('zones')\n",
"def zones(store):\n",
" df = store['zones']\n",
" return df\n",
"\n",
"@orca.table('households')\n",
"def households(store):\n",
" df = store['households']\n",
" return df\n",
"\n",
"@orca.table('parcels')\n",
"def parcels(store):\n",
" df = store['parcels']\n",
" return df\n",
"\n",
"orca.broadcast('zones', 'buildings', cast_index=True, onto_on='zone_id')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specify the computed columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"@orca.column('households', 'income_quartile', cache=True)\n",
"def income_quartile(households):\n",
" return pd.Series(pd.qcut(households.income, 4).labels,\n",
" index=households.index)\n",
"\n",
"@orca.column('households', 'zone_id', cache=True)\n",
"def zone_id(households, buildings):\n",
" return misc.reindex(buildings.zone_id, households.building_id)\n",
"\n",
"@orca.column('zones', 'ave_unit_sqft')\n",
"def ave_unit_sqft(buildings, zones):\n",
" s = buildings.unit_sqft[buildings.general_type == \"Residential\"]\\\n",
" .groupby(buildings.zone_id).quantile().apply(np.log1p)\n",
" return s.reindex(zones.index).fillna(s.quantile())\n",
"\n",
"@orca.column('zones', 'ave_lot_sqft')\n",
"def ave_lot_sqft(buildings, zones):\n",
" s = buildings.unit_lot_size.groupby(buildings.zone_id).quantile().apply(np.log1p)\n",
" return s.reindex(zones.index).fillna(s.quantile())\n",
"\n",
"@orca.column('zones', 'sum_residential_units')\n",
"def sum_residential_units(buildings):\n",
" return buildings.residential_units.groupby(buildings.zone_id).sum().apply(np.log1p)\n",
"\n",
"@orca.column('zones', 'ave_income')\n",
"def ave_income(households, zones):\n",
" s = households.income.groupby(households.zone_id).quantile().apply(np.log1p)\n",
" return s.reindex(zones.index).fillna(s.quantile())\n",
"\n",
"orca.add_injectable(\"building_type_map\", {\n",
" 1: \"Residential\",\n",
" 2: \"Residential\",\n",
" 3: \"Residential\",\n",
" 4: \"Office\",\n",
" 5: \"Hotel\",\n",
" 6: \"School\",\n",
" 7: \"Industrial\",\n",
" 8: \"Industrial\",\n",
" 9: \"Industrial\",\n",
" 10: \"Retail\",\n",
" 11: \"Retail\",\n",
" 12: \"Residential\",\n",
" 13: \"Retail\",\n",
" 14: \"Office\"\n",
"})\n",
"\n",
"@orca.column('buildings', 'zone_id', cache=True)\n",
"def zone_id(buildings, parcels):\n",
" return misc.reindex(parcels.zone_id, buildings.parcel_id)\n",
"\n",
"@orca.column('buildings', 'general_type', cache=True)\n",
"def general_type(buildings, building_type_map):\n",
" return buildings.building_type_id.map(building_type_map)\n",
"\n",
"@orca.column('buildings', 'unit_sqft', cache=True)\n",
"def unit_sqft(buildings):\n",
" return buildings.building_sqft / buildings.residential_units.replace(0, 1)\n",
"\n",
"@orca.column('buildings', 'unit_lot_size', cache=True)\n",
"def unit_lot_size(buildings, parcels):\n",
" return misc.reindex(parcels.parcel_size, buildings.parcel_id) / \\\n",
" buildings.residential_units.replace(0, 1)\n",
" \n",
"@orca.column('parcels', 'parcel_size', cache=True)\n",
"def parcel_size(parcels):\n",
" return parcels.shape_area * 10.764"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure the model"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"rm = RegressionModel(\n",
" fit_filters=[\n",
" 'unit_lot_size > 0',\n",
" 'year_built > 1000',\n",
" 'year_built < 2020',\n",
" 'unit_sqft > 100',\n",
" 'unit_sqft < 20000'\n",
" ],\n",
" predict_filters=[\n",
" \"general_type == 'Residential'\"\n",
" ],\n",
" model_expression='np.log1p(residential_sales_price) ~ I(year_built < 1940)'\n",
" '+ I(year_built > 2005) + np.log1p(unit_sqft) + np.log1p(unit_lot_size)'\n",
" '+ sum_residential_units + ave_lot_sqft + ave_unit_sqft + ave_income',\n",
" ytransform = np.exp\n",
") "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get the data - merge buildings and zones (notice UrbanSum does the merge based on the broadcast)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"merged_df = orca.merge_tables(target=\"buildings\", tables=[\"buildings\", \"zones\"], columns=rm.columns_used()) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fill nans - UrbanSim wants you to take care of nans"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import utils\n",
"merged_df[\"year_built\"] = merged_df.year_built.fillna(merged_df.year_built.quantile())\n",
"merged_df[\"residential_sales_price\"] = merged_df.residential_sales_price.fillna(0)\n",
"merged_df[\"general_type\"] = merged_df.general_type.fillna(merged_df.general_type.value_counts().idxmax())\n",
"_ = utils.deal_with_nas(merged_df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fit and report"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"OLS Regression Results\n",
"\n",
" Dep. Variable: | np.log1p(residential_sales_price) | R-squared: | 0.399 | \n",
"
\n",
"\n",
" Model: | OLS | Adj. R-squared: | 0.399 | \n",
"
\n",
"\n",
" Method: | Least Squares | F-statistic: | 1.240e+04 | \n",
"
\n",
"\n",
" Date: | Tue, 28 Apr 2015 | Prob (F-statistic): | 0.00 | \n",
"
\n",
"\n",
" Time: | 10:38:22 | Log-Likelihood: | -2.5244e+05 | \n",
"
\n",
"\n",
" No. Observations: | 149409 | AIC: | 5.049e+05 | \n",
"
\n",
"\n",
" Df Residuals: | 149400 | BIC: | 5.050e+05 | \n",
"
\n",
"\n",
" Df Model: | 8 | | | \n",
"
\n",
"\n",
" Covariance Type: | nonrobust | | | \n",
"
\n",
"
\n",
"\n",
"\n",
" | coef | std err | t | P>|t| | [95.0% Conf. Int.] | \n",
"
\n",
"\n",
" Intercept | -3.8664 | 0.170 | -22.761 | 0.000 | -4.199 -3.533 | \n",
"
\n",
"\n",
" I(year_built < 1940)[T.True] | 0.0011 | 0.007 | 0.144 | 0.885 | -0.013 0.016 | \n",
"
\n",
"\n",
" I(year_built > 2005)[T.True] | -0.0632 | 0.049 | -1.298 | 0.194 | -0.159 0.032 | \n",
"
\n",
"\n",
" np.log1p(unit_sqft) | -1.4830 | 0.007 | -210.907 | 0.000 | -1.497 -1.469 | \n",
"
\n",
"\n",
" np.log1p(unit_lot_size) | -0.1476 | 0.006 | -23.219 | 0.000 | -0.160 -0.135 | \n",
"
\n",
"\n",
" sum_residential_units | 0.0986 | 0.008 | 11.972 | 0.000 | 0.082 0.115 | \n",
"
\n",
"\n",
" ave_lot_sqft | -0.2179 | 0.010 | -21.293 | 0.000 | -0.238 -0.198 | \n",
"
\n",
"\n",
" ave_unit_sqft | 0.9376 | 0.020 | 45.828 | 0.000 | 0.898 0.978 | \n",
"
\n",
"\n",
" ave_income | 1.4192 | 0.015 | 92.812 | 0.000 | 1.389 1.449 | \n",
"
\n",
"
\n",
"\n",
"\n",
" Omnibus: | 89673.667 | Durbin-Watson: | 1.798 | \n",
"
\n",
"\n",
" Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 819859.564 | \n",
"
\n",
"\n",
" Skew: | -2.841 | Prob(JB): | 0.00 | \n",
"
\n",
"\n",
" Kurtosis: | 12.970 | Cond. No. | 1.01e+03 | \n",
"
\n",
"
"
],
"text/plain": [
"\n",
"\"\"\"\n",
" OLS Regression Results \n",
"=============================================================================================\n",
"Dep. Variable: np.log1p(residential_sales_price) R-squared: 0.399\n",
"Model: OLS Adj. R-squared: 0.399\n",
"Method: Least Squares F-statistic: 1.240e+04\n",
"Date: Tue, 28 Apr 2015 Prob (F-statistic): 0.00\n",
"Time: 10:38:22 Log-Likelihood: -2.5244e+05\n",
"No. Observations: 149409 AIC: 5.049e+05\n",
"Df Residuals: 149400 BIC: 5.050e+05\n",
"Df Model: 8 \n",
"Covariance Type: nonrobust \n",
"================================================================================================\n",
" coef std err t P>|t| [95.0% Conf. Int.]\n",
"------------------------------------------------------------------------------------------------\n",
"Intercept -3.8664 0.170 -22.761 0.000 -4.199 -3.533\n",
"I(year_built < 1940)[T.True] 0.0011 0.007 0.144 0.885 -0.013 0.016\n",
"I(year_built > 2005)[T.True] -0.0632 0.049 -1.298 0.194 -0.159 0.032\n",
"np.log1p(unit_sqft) -1.4830 0.007 -210.907 0.000 -1.497 -1.469\n",
"np.log1p(unit_lot_size) -0.1476 0.006 -23.219 0.000 -0.160 -0.135\n",
"sum_residential_units 0.0986 0.008 11.972 0.000 0.082 0.115\n",
"ave_lot_sqft -0.2179 0.010 -21.293 0.000 -0.238 -0.198\n",
"ave_unit_sqft 0.9376 0.020 45.828 0.000 0.898 0.978\n",
"ave_income 1.4192 0.015 92.812 0.000 1.389 1.449\n",
"==============================================================================\n",
"Omnibus: 89673.667 Durbin-Watson: 1.798\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 819859.564\n",
"Skew: -2.841 Prob(JB): 0.00\n",
"Kurtosis: 12.970 Cond. No. 1.01e+03\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
"[2] The condition number is large, 1.01e+03. This might indicate that there are\n",
"strong multicollinearity or other numerical problems.\n",
"\"\"\""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rm.fit(merged_df).summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Predict and report"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"count 140782.000000\n",
"mean 1157.915780\n",
"std 41536.026751\n",
"min 0.001362\n",
"25% 252.965573\n",
"50% 380.341584\n",
"75% 554.749517\n",
"max 12517261.710714\n",
"dtype: float64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rm.predict(merged_df).describe()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}