{ "metadata": { "name": "discrete_choice" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Discrete Choice Models - Fair's Affair data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A survey of women only was conducted in 1974 by *Redbook* asking about extramarital affairs." ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "from scipy import stats\n", "import matplotlib.pyplot as plt\n", "import statsmodels.api as sm\n", "from statsmodels.formula.api import logit, probit, poisson, ols" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print sm.datasets.fair.SOURCE" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print sm.datasets.fair.NOTE" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "dta = sm.datasets.fair.load_pandas().data" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "dta['affair'] = (dta['affairs'] > 0).astype(float)\n", "print dta.head(10)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print dta.describe()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "affair_mod = logit(\"affair ~ occupation + educ + occupation_husb\" \n", " \"+ rate_marriage + age + yrs_married + children\"\n", " \" + religious\", dta).fit()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print affair_mod.summary()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "raw", "metadata": {}, "source": [ "How well are we predicting?" ] }, { "cell_type": "code", "collapsed": false, "input": [ "affair_mod.pred_table()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "raw", "metadata": {}, "source": [ "The coefficients of the discrete choice model do not tell us much. What we're after is marginal effects." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "mfx = affair_mod.get_margeff()\n", "print mfx.summary()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "respondent1000 = dta.ix[1000]\n", "print respondent1000" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "resp = dict(zip(range(1,9), respondent1000[[\"occupation\", \"educ\", \n", " \"occupation_husb\", \"rate_marriage\", \n", " \"age\", \"yrs_married\", \"children\", \n", " \"religious\"]].tolist()))\n", "resp.update({0 : 1})\n", "print resp" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "mfx = affair_mod.get_margeff(atexog=resp)\n", "print mfx.summary()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "affair_mod.predict(respondent1000)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "affair_mod.fittedvalues[1000]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "affair_mod.model.cdf(affair_mod.fittedvalues[1000])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "raw", "metadata": {}, "source": [ "The \"correct\" model here is likely the Tobit model. We have an work in progress branch \"tobit-model\" on github, if anyone is interested in censored regression models." ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Exercise: Logit vs Probit" ] }, { "cell_type": "code", "collapsed": false, "input": [ "fig = plt.figure(figsize=(12,8))\n", "ax = fig.add_subplot(111)\n", "support = np.linspace(-6, 6, 1000)\n", "ax.plot(support, stats.logistic.cdf(support), 'r-', label='Logistic')\n", "ax.plot(support, stats.norm.cdf(support), label='Probit')\n", "ax.legend();" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "fig = plt.figure(figsize=(12,8))\n", "ax = fig.add_subplot(111)\n", "support = np.linspace(-6, 6, 1000)\n", "ax.plot(support, stats.logistic.pdf(support), 'r-', label='Logistic')\n", "ax.plot(support, stats.norm.pdf(support), label='Probit')\n", "ax.legend();" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "raw", "metadata": {}, "source": [ "Compare the estimates of the Logit Fair model above to a Probit model. Does the prediction table look better? Much difference in marginal effects?" 
] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Genarlized Linear Model Example" ] }, { "cell_type": "code", "collapsed": false, "input": [ "print sm.datasets.star98.SOURCE" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print sm.datasets.star98.DESCRLONG" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print sm.datasets.star98.NOTE" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "dta = sm.datasets.star98.load_pandas().data\n", "print dta.columns" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print dta[['NABOVE', 'NBELOW', 'LOWINC', 'PERASIAN', 'PERBLACK', 'PERHISP', 'PERMINTE']].head(10)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print dta[['AVYRSEXP', 'AVSALK', 'PERSPENK', 'PTRATIO', 'PCTAF', 'PCTCHRT', 'PCTYRRND']].head(10)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "formula = 'NABOVE + NBELOW ~ LOWINC + PERASIAN + PERBLACK + PERHISP + PCTCHRT '\n", "formula += '+ PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF'" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Aside: Binomial distribution" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "Toss a six-sided die 5 times, what's the probability of exactly 2 fours?" ] }, { "cell_type": "code", "collapsed": false, "input": [ "stats.binom(5, 1./6).pmf(2)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from scipy.misc import comb\n", "comb(5,2) * (1/6.)**2 * (5/6.)**3" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from statsmodels.formula.api import glm\n", "glm_mod = glm(formula, dta, family=sm.families.Binomial()).fit()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print glm_mod.summary()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "raw", "metadata": {}, "source": [ "The number of trials " ] }, { "cell_type": "code", "collapsed": false, "input": [ "glm_mod.model.data.orig_endog.sum(1)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "glm_mod.fittedvalues * glm_mod.model.data.orig_endog.sum(1)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "raw", "metadata": {}, "source": [ "First differences: We hold all explanatory variables constant at their means and manipulate the percentage of low income households to assess its impact\n", "on the response variables:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "exog = glm_mod.model.data.orig_exog # get the dataframe" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "means25 = exog.mean()\n", "print means25" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "means25['LOWINC'] = exog['LOWINC'].quantile(.25)\n", "print means25" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ 
"means75 = exog.mean()\n", "means75['LOWINC'] = exog['LOWINC'].quantile(.75)\n", "print means75" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "resp25 = glm_mod.predict(means25)\n", "resp75 = glm_mod.predict(means75)\n", "diff = resp75 - resp25" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "raw", "metadata": {}, "source": [ "The interquartile first difference for the percentage of low income households in a school district is:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "print \"%2.4f%%\" % (diff[0]*100)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "nobs = glm_mod.nobs\n", "y = glm_mod.model.endog\n", "yhat = glm_mod.mu" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from statsmodels.graphics.api import abline_plot\n", "fig = plt.figure(figsize=(12,8))\n", "ax = fig.add_subplot(111, ylabel='Observed Values', xlabel='Fitted Values')\n", "ax.scatter(yhat, y)\n", "y_vs_yhat = sm.OLS(y, sm.add_constant(yhat, prepend=True)).fit()\n", "fig = abline_plot(model_results=y_vs_yhat, ax=ax)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Plot fitted values vs Pearson residuals" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pearson residuals are defined to be \n", "\n", "$$\\frac{(y - \\mu)}{\\sqrt{(var(\\mu))}}$$\n", "\n", "where var is typically determined by the family. E.g., binomial variance is $np(1 - p)$" ] }, { "cell_type": "code", "collapsed": false, "input": [ "fig = plt.figure(figsize=(12,8))\n", "ax = fig.add_subplot(111, title='Residual Dependence Plot', xlabel='Fitted Values',\n", " ylabel='Pearson Residuals')\n", "ax.scatter(yhat, stats.zscore(glm_mod.resid_pearson))\n", "ax.axis('tight')\n", "ax.plot([0.0, 1.0],[0.0, 0.0], 'k-');" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Histogram of standardized deviance residuals with Kernel Density Estimate overlayed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The definition of the deviance residuals depends on the family. For the Binomial distribution this is \n", "\n", "$$r_{dev} = sign\\(Y-\\mu\\)*\\sqrt{2n(Y\\log\\frac{Y}{\\mu}+(1-Y)\\log\\frac{(1-Y)}{(1-\\mu)}}$$\n", "\n", "They can be used to detect ill-fitting covariates" ] }, { "cell_type": "code", "collapsed": false, "input": [ "resid = glm_mod.resid_deviance\n", "resid_std = stats.zscore(resid) \n", "kde_resid = sm.nonparametric.KDEUnivariate(resid_std)\n", "kde_resid.fit()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "fig = plt.figure(figsize=(12,8))\n", "ax = fig.add_subplot(111, title=\"Standardized Deviance Residuals\")\n", "ax.hist(resid_std, bins=25, normed=True);\n", "ax.plot(kde_resid.support, kde_resid.density, 'r');" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "QQ-plot of deviance residuals" ] }, { "cell_type": "code", "collapsed": false, "input": [ "fig = plt.figure(figsize=(12,8))\n", "ax = fig.add_subplot(111)\n", "fig = sm.graphics.qqplot(resid, line='r', ax=ax)" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }