{
 "metadata": {
  "name": "",
  "signature": "sha256:4d0a48ce24362e292246573d1784aae3c74f74e611ae1ea9570c003e60083096"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Generalized Linear Models (Formula)"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "This notebook illustrates how you can use R-style formulas to fit Generalized Linear Models.\n",
      "\n",
      "To begin, we load the ``Star98`` dataset, construct a formula, and pre-process the data:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from __future__ import print_function\n",
      "import statsmodels.api as sm\n",
      "import statsmodels.formula.api as smf\n",
      "# Load the Star98 data as a DataFrame and write down the model formula.\n",
      "star98 = sm.datasets.star98.load_pandas().data\n",
      "formula = 'SUCCESS ~ LOWINC + PERASIAN + PERBLACK + PERHISP + PCTCHRT + \\\n",
      "           PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF'\n",
      "\n",
      "# Take an explicit copy of the predictor columns so that adding SUCCESS\n",
      "# below modifies an independent frame instead of a selection of ``star98``\n",
      "# (this avoids pandas' SettingWithCopyWarning).\n",
      "predictors = ['LOWINC', 'PERASIAN', 'PERBLACK', 'PERHISP', 'PCTCHRT',\n",
      "              'PCTYRRND', 'PERMINTE', 'AVYRSEXP', 'AVSALK', 'PERSPENK',\n",
      "              'PTRATIO', 'PCTAF']\n",
      "dta = star98[predictors].copy()\n",
      "\n",
      "# Response used by the formula: the fraction NABOVE / (NABOVE + NBELOW).\n",
      "endog = star98['NABOVE'] / (star98['NABOVE'] + star98['NBELOW'])\n",
      "dta['SUCCESS'] = endog"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Then, we fit the GLM model:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Build the binomial GLM from the formula first, then estimate it.\n",
      "glm_binomial = smf.glm(formula=formula, data=dta,\n",
      "                       family=sm.families.Binomial())\n",
      "mod1 = glm_binomial.fit()\n",
      "mod1.summary()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Finally, we define a custom function and apply it as a data transformation directly inside the formula:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def double_it(x):\n",
      "    return 2 * x\n",
      "# Same specification as the first model, except that LOWINC is passed\n",
      "# through double_it before entering the design matrix.\n",
      "formula = 'SUCCESS ~ double_it(LOWINC) + PERASIAN + PERBLACK + PERHISP + PCTCHRT + \\\n",
      "           PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF'\n",
      "glm_doubled = smf.glm(formula=formula, data=dta,\n",
      "                      family=sm.families.Binomial())\n",
      "mod2 = glm_doubled.fit()\n",
      "mod2.summary()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "As expected, the coefficient for ``double_it(LOWINC)`` in the second model is half the size of the ``LOWINC`` coefficient from the first model, so multiplying it by 2 recovers the first model's value:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# ``params`` is indexed by term name, so select by position explicitly with\n",
      "# ``.iloc``; a bare integer key on a label-indexed Series is deprecated.\n",
      "# Position 1 is the LOWINC term (the first term on the formula's right-hand\n",
      "# side; presumably position 0 is the intercept).\n",
      "print(mod1.params.iloc[1])\n",
      "print(mod2.params.iloc[1] * 2)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}