{
 "metadata": {
  "name": "",
  "signature": "sha256:daf845121b68fedc6c20fe71f5f45290441e727ef67b53809f4a1ba57a794f9d"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Created 7/9/2014 by KO\n",
      "</br>\n",
      "Implements propensity-score matching and eventually will implement balance diagnostics"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%matplotlib inline\n",
      "import math\n",
      "import numpy as np\n",
      "import scipy\n",
      "from scipy.stats import binom, hypergeom\n",
      "import pandas as pd\n",
      "import matplotlib.pyplot as plt\n",
      "from sklearn.linear_model import LogisticRegression"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<b>Goal:</b> find the average treatment effect in the treatment group (ATT) on RE78.\n",
      "\n",
      "<b>Import the data:</b> controls and treated from Lalonde/Dehejia papers.  Here's what the site says about the data:\n",
      "\n",
      "The variables from left to right are: treatment indicator (1 if treated, 0 if not treated), age, education, Black (1 if black, 0 otherwise), Hispanic (1 if Hispanic, 0 otherwise), married (1 if married, 0 otherwise), nodegree (1 if no degree, 0 otherwise), RE74 (earnings in 1974), RE75 (earnings in 1975), and RE78 (earnings in 1978).\n",
      "\n",
      "http://users.nber.org/%7Erdehejia/nswdata2.html\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "names = ['Treated', 'Age', 'Education', 'Black', 'Hispanic', 'Married',\n",
      "         'Nodegree', 'RE74', 'RE75', 'RE78']\n",
      "treated = pd.read_table('nswre74_treated.txt', sep = '\\s+',\n",
      "                        header = None, names = names)\n",
      "control = pd.read_table('nswre74_control.txt', sep='\\s+', \n",
      "                        header = None, names = names)\n",
      "data = pd.concat([treated, control])\n",
      "data.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>Treated</th>\n",
        "      <th>Age</th>\n",
        "      <th>Education</th>\n",
        "      <th>Black</th>\n",
        "      <th>Hispanic</th>\n",
        "      <th>Married</th>\n",
        "      <th>Nodegree</th>\n",
        "      <th>RE74</th>\n",
        "      <th>RE75</th>\n",
        "      <th>RE78</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 1</td>\n",
        "      <td> 37</td>\n",
        "      <td> 11</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>  9930.0460</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 1</td>\n",
        "      <td> 22</td>\n",
        "      <td>  9</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>  3595.8940</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 1</td>\n",
        "      <td> 30</td>\n",
        "      <td> 12</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 24909.4500</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 1</td>\n",
        "      <td> 27</td>\n",
        "      <td> 11</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>  7506.1460</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> 1</td>\n",
        "      <td> 33</td>\n",
        "      <td>  8</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>   289.7899</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 2,
       "text": [
        "   Treated  Age  Education  Black  Hispanic  Married  Nodegree  RE74  RE75  \\\n",
        "0        1   37         11      1         0        1         1     0     0   \n",
        "1        1   22          9      0         1        0         1     0     0   \n",
        "2        1   30         12      1         0        0         0     0     0   \n",
        "3        1   27         11      1         0        0         1     0     0   \n",
        "4        1   33          8      1         0        0         1     0     0   \n",
        "\n",
        "         RE78  \n",
        "0   9930.0460  \n",
        "1   3595.8940  \n",
        "2  24909.4500  \n",
        "3   7506.1460  \n",
        "4    289.7899  "
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<b>Compute propensity scores</b> to start.  Then we need to separate the treated and controls again (preserve original indexing) in order to match them.\n",
      "\n",
      "\n",
      "Note, this section might need some fine-tuning to make it match Dehejia and Wahba (see their appendix for how they computed propensity scores)\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "propensity = LogisticRegression()\n",
      "propensity = propensity.fit(data[names[1:-1]], data.Treated)\n",
      "pscore = propensity.predict_proba(data[names[1:-1]])[:,1] # The predicted propensities by the model\n",
      "print pscore[:5]\n",
      "\n",
      "data['Propensity'] = pscore\n",
      "#pscore = pd.Series(data = pscore, index = data.index)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[ 0.42716293  0.25617646  0.54874013  0.37386481  0.40217168]\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Implement one-to-one matching, caliper without replacement.  Variants of the method are examined in the following paper.  This is something to explore further.\n",
      "</br>\n",
      "Austin, P. C. (2014), A comparison of 12 algorithms for matching on the propensity score. Statist. Med., 33: 1057\u20131069. doi: 10.1002/sim.6004"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def Match(groups, propensity, caliper = 0.05):\n",
      "    ''' \n",
      "    Inputs:\n",
      "    groups = Treatment assignments.  Must be 2 groups\n",
      "    propensity = Propensity scores for each observation. Propensity and groups should be in the same order (matching indices)\n",
      "    caliper = Maximum difference in matched propensity scores. For now, this is a caliper on the raw\n",
      "            propensity; Austin reccommends using a caliper on the logit propensity.\n",
      "    \n",
      "    Output:\n",
      "    A series containing the individuals in the control group matched to the treatment group.\n",
      "    Note that with caliper matching, not every treated individual may have a match.\n",
      "    '''\n",
      "\n",
      "    # Check inputs\n",
      "    if any(propensity <=0) or any(propensity >=1):\n",
      "        raise ValueError('Propensity scores must be between 0 and 1')\n",
      "    elif not(0<caliper<1):\n",
      "        raise ValueError('Caliper must be between 0 and 1')\n",
      "    elif len(groups)!= len(propensity):\n",
      "        raise ValueError('groups and propensity scores must be same dimension')\n",
      "    elif len(groups.unique()) != 2:\n",
      "        raise ValueError('wrong number of groups')\n",
      "        \n",
      "        \n",
      "    # Code groups as 0 and 1\n",
      "    groups = groups == groups.unique()[0]\n",
      "    N = len(groups)\n",
      "    N1 = groups.sum(); N2 = N-N1\n",
      "    g1, g2 = propensity[groups == 1], (propensity[groups == 0])\n",
      "    # Check if treatment groups got flipped - treatment (coded 1) should be the smaller\n",
      "    if N1 > N2:\n",
      "       N1, N2, g1, g2 = N2, N1, g2, g1 \n",
      "        \n",
      "        \n",
      "    # Randomly permute the smaller group to get order for matching\n",
      "    morder = np.random.permutation(N1)\n",
      "    matches = pd.Series(np.empty(N1))\n",
      "    matches[:] = np.NAN\n",
      "    \n",
      "    for m in morder:\n",
      "        dist = abs(g1[m] - g2)\n",
      "        if dist.min() <= caliper:\n",
      "            matches[m] = dist.argmin()\n",
      "            g2 = g2.drop(matches[m])\n",
      "    return (matches)\n",
      "\n",
      "    \n",
      "    \n",
      "    \n",
      "    \n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "stuff = Match(data.Treated, data.Propensity)\n",
      "g1, g2 = data.Propensity[data.Treated==1], data.Propensity[data.Treated==0]\n",
      "# test ValueError\n",
      "#badtreat = data.Treated + data.Hispanic\n",
      "#Match(badtreat, pscore)\n",
      "stuff[:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 5,
       "text": [
        "0     25\n",
        "1    213\n",
        "2     75\n",
        "3     14\n",
        "4    210\n",
        "dtype: float64"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<b>Here's the result:</b> if we put the propensity scores of the treatment and matched controls side-by-side, we see that they're matched pretty well."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "zip(g1, g2[stuff])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 6,
       "text": [
        "[(0.42716292681386048, 0.42581880946572531),\n",
        " (0.25617646165856756, 0.25484822320097977),\n",
        " (0.54874012613984235, 0.54874012613984235),\n",
        " (0.37386481070884442, 0.37424224379754362),\n",
        " (0.40217167818759586, 0.4025488132226952),\n",
        " (0.37341040912703743, 0.37341040912703743),\n",
        " (0.53301175942990109, 0.53492670949658028),\n",
        " (0.38451503327005687, 0.38489660886065941),\n",
        " (0.50914521709464833, 0.5038713907925424),\n",
        " (0.61794972732004938, 0.58527783340947481),\n",
        " (0.36708091086829236, 0.36693596150764657),\n",
        " (0.52310609372279693, 0.52399253077902697),\n",
        " (0.37001375859203811, 0.37001375859203811),\n",
        " (0.41040938689881862, 0.40263858902000565),\n",
        " (0.37295623086366536, 0.37330571977466881),\n",
        " (0.36206521830641381, 0.36206521830641381),\n",
        " (0.53663014121627661, 0.53887945917796798),\n",
        " (0.37046647062835597, 0.37046647062835597),\n",
        " (0.57103382954029125, 0.5709591033853374),\n",
        " (0.53976237257756132, 0.53976237257756132),\n",
        " (0.36543122767419073, 0.36543122767419073),\n",
        " (0.59343787140217641, 0.58088215756113837),\n",
        " (0.4386969080721812, 0.44247414429559007),\n",
        " (0.36753212799132406, 0.36753212799132406),\n",
        " (0.35997778168476563, 0.35997778168476563),\n",
        " (0.40954977660006303, 0.40954977660006303),\n",
        " (0.36963807095849838, 0.36918577756921234),\n",
        " (0.26069791214312099, 0.26100872287425286),\n",
        " (0.35789562808324343, 0.35789562808324343),\n",
        " (0.36753212799132406, 0.36708091086829236),\n",
        " (0.35789562808324343, 0.35789562808324343),\n",
        " (0.45658813935562875, nan),\n",
        " (0.40082653085458858, 0.40082653085458858),\n",
        " (0.52624903693537084, nan),\n",
        " (0.5375136651601029, 0.5375136651601029),\n",
        " (0.56484900646250547, 0.56420807943645801),\n",
        " (0.40038494333600216, 0.39999799811496028),\n",
        " (0.56561608250077011, 0.56652020912997003),\n",
        " (0.46330566204377877, nan),\n",
        " (0.37257932252559373, 0.37257932252559373),\n",
        " (0.52850447140293089, 0.53075874267386913),\n",
        " (0.3969042231346866, 0.39612686819916004),\n",
        " (0.36790692445253587, 0.36753212799132406),\n",
        " (0.27913948507677838, 0.27881522993582841),\n",
        " (0.35915945500472685, 0.35868111759952492),\n",
        " (0.40082653085458858, 0.40168028755982904),\n",
        " (0.36790692445253587, 0.36790692445253587),\n",
        " (0.36288641657881326, 0.36288641657881326),\n",
        " (0.40038494333600216, 0.40079387221848178),\n",
        " (0.53301175942990109, 0.57005720877929322),\n",
        " (0.3913433578168215, 0.39168310001168555),\n",
        " (0.41393321094396696, 0.4127054899751052),\n",
        " (0.35500470600275558, 0.35500470600275558),\n",
        " (0.5375136651601029, 0.57053301873387885),\n",
        " (0.4117397433545737, 0.41137419523919089),\n",
        " (0.35789562808324343, 0.35789562808324343),\n",
        " (0.40567343869825451, 0.40603741546027916),\n",
        " (0.47360442597144675, 0.44944907059288414),\n",
        " (0.59691150969843831, 0.58464545042501748),\n",
        " (0.39304342081844873, 0.39301309024977354),\n",
        " (0.36790692445253587, 0.36835856598771649),\n",
        " (0.39350085125328266, 0.39350085125328266),\n",
        " (0.39612686819916004, 0.39775517139652095),\n",
        " (0.37386481070884442, 0.37386481070884442),\n",
        " (0.41434922364455296, 0.41302230654596012),\n",
        " (0.38619843594707792, 0.38619843594707792),\n",
        " (0.37386481070884442, 0.37424224379754362),\n",
        " (0.41608090759002542, 0.41309626451423542),\n",
        " (0.47723165395085809, 0.47774167074970658),\n",
        " (0.54112719501669537, 0.55768626228483698),\n",
        " (0.38405562703559115, 0.38451503327005687),\n",
        " (0.57901039501721874, 0.57317432336471086),\n",
        " (0.38016333753745757, 0.38041199464841013),\n",
        " (0.5375136651601029, 0.55545300341207382),\n",
        " (0.52810275907963666, 0.55097984688635149),\n",
        " (0.39953799040539978, 0.39938108342140327),\n",
        " (0.53075874267386913, 0.56180397045598252),\n",
        " (0.37129563329740667, 0.37046647062835597),\n",
        " (0.40178416325708044, 0.40255931536012274),\n",
        " (0.36333543761891568, 0.36333543761891568),\n",
        " (0.5375136651601029, 0.5375136651601029),\n",
        " (0.5217350446165081, nan),\n",
        " (0.59732989436222506, 0.58907212092192596),\n",
        " (0.53663014121627661, nan),\n",
        " (0.38919010277019828, 0.38919010277019828),\n",
        " (0.39744213629889608, 0.39736320517754298),\n",
        " (0.26207005517968529, 0.26138340190382492),\n",
        " (0.37174898822506325, 0.37174898822506325),\n",
        " (0.55984202089567947, nan),\n",
        " (0.38489660886065941, 0.38514373934059354),\n",
        " (0.41393321094396696, 0.41126954472315269),\n",
        " (0.35789562808324343, 0.35789562808324343),\n",
        " (0.36963807095849838, 0.36963807095849838),\n",
        " (0.52850447140293089, 0.52822407944383731),\n",
        " (0.41051397115899052, 0.41051397115899052),\n",
        " (0.5687410368098067, 0.56887204494312937),\n",
        " (0.36498110138737444, 0.36498110138737444),\n",
        " (0.57546114010719607, 0.57669759720271807),\n",
        " (0.37469705231202188, 0.37469705231202188),\n",
        " (0.27200031650600354, 0.27659164918684331),\n",
        " (0.37257932252559373, 0.37257932252559373),\n",
        " (0.37681958989925324, 0.37681958989925324),\n",
        " (0.36963807095849838, 0.36963807095849838),\n",
        " (0.35500470600275558, 0.35500470600275558),\n",
        " (0.35707939391258053, 0.35707939391258053),\n",
        " (0.41882782270999969, 0.4199216409379678),\n",
        " (0.53663014121627661, 0.53663014121627661),\n",
        " (0.36288641657881326, 0.36288641657881326),\n",
        " (0.38024100782850595, 0.38062096495388931),\n",
        " (0.39397088933484964, 0.3917332458962281),\n",
        " (0.32878224824302671, 0.32857587970056046),\n",
        " (0.24604313298890371, 0.24531002119584097),\n",
        " (0.52145634998553481, 0.52341300607717067),\n",
        " (0.35215316763290705, 0.35293545509098728),\n",
        " (0.55291558624596349, 0.55453051526410713),\n",
        " (0.53812498686079091, 0.53760415714737053),\n",
        " (0.53165840610880255, nan),\n",
        " (0.53178741742330415, 0.53526343055701198),\n",
        " (0.36732044941345232, 0.36726529273432967),\n",
        " (0.50016320180970986, 0.54233385721831306),\n",
        " (0.54568333247369738, 0.54649842890599332),\n",
        " (0.37088397590008554, 0.37046647062835597),\n",
        " (0.37807518272603041, 0.37811089138712795),\n",
        " (0.54505617352098923, 0.54778717953003553),\n",
        " (0.37187026316150906, 0.37123230389724349),\n",
        " (0.42907939387160898, 0.42621298956406267),\n",
        " (0.503576849031942, nan),\n",
        " (0.41499892319370452, 0.40530744104465677),\n",
        " (0.34867332496059367, 0.34855987148037076),\n",
        " (0.35124881591719082, 0.34951011622716577),\n",
        " (0.368354966402969, 0.36835856598771649),\n",
        " (0.39388148189718369, 0.3939643292973713),\n",
        " (0.42287750361949428, 0.42323937900753988),\n",
        " (0.27508239938161005, 0.28098466748341883),\n",
        " (0.41651421693564455, 0.42037816699807562),\n",
        " (0.4167397609523012, 0.41919527728474437),\n",
        " (0.46806147084466776, 0.45878647462448441),\n",
        " (0.36978946463533807, 0.36963807095849838),\n",
        " (0.49379330696466373, nan),\n",
        " (0.43357631858871326, 0.43257515170041527),\n",
        " (0.34692254178035581, 0.34623057969596582),\n",
        " (0.46757669724338247, 0.46341413004896059),\n",
        " (0.37685357018163268, 0.37681958989925324),\n",
        " (0.4455565269264718, 0.44617327640136217),\n",
        " (0.61886657105467635, 0.60079264899057727),\n",
        " (0.37718188114200613, 0.37705709921672609),\n",
        " (0.40934354192118261, 0.40954977660006303),\n",
        " (0.39573595296507968, 0.39432477073157096),\n",
        " (0.44255500877946502, 0.42053379991459011),\n",
        " (0.36811932082281856, 0.36790692445253587),\n",
        " (0.42116861754671925, 0.42103202468255674),\n",
        " (0.47650112195239025, 0.47887757800201047),\n",
        " (0.39476901876612058, 0.39217707695803344),\n",
        " (0.53190717776280616, 0.54227884676845073),\n",
        " (0.49379443849344062, 0.45360505623822056),\n",
        " (0.4872454286920998, 0.48602144945577064),\n",
        " (0.41089388876758753, 0.41048788604880759),\n",
        " (0.53019919491977685, 0.52624903693537084),\n",
        " (0.42206423034004498, 0.42342902381310754),\n",
        " (0.42156230780201276, 0.42182411349268001),\n",
        " (0.41195948127964338, 0.41102028893247478),\n",
        " (0.41575142347870969, 0.41004419660549557),\n",
        " (0.37949264038566599, 0.37894686448575243),\n",
        " (0.44597592218322879, 0.44343522976592487),\n",
        " (0.54700647957502357, nan),\n",
        " (0.46839575084837853, 0.42584389373973836),\n",
        " (0.30118504354309478, 0.30266228063350109),\n",
        " (0.39535751543292041, 0.39479876377674628),\n",
        " (0.46967422150989402, 0.47016706817874493),\n",
        " (0.40711739797786478, 0.40603741546027916),\n",
        " (0.43016221895202345, 0.40603741546027916),\n",
        " (0.59539063185170427, 0.57227359694234925),\n",
        " (0.25033235153545463, 0.2506659230345058),\n",
        " (0.41813865251776078, 0.40263858902000565),\n",
        " (0.2969316019282166, 0.29322046060925033),\n",
        " (0.46046038221301638, 0.45526853386815697),\n",
        " (0.52687531615177041, 0.54771545246673836),\n",
        " (0.50045740817131545, 0.51770231215857909),\n",
        " (0.63404475620095613, 0.58689845316771205),\n",
        " (0.376835082633315, 0.37604736111069387),\n",
        " (0.59996973878777604, 0.57989993766179626),\n",
        " (0.47324609795733941, 0.47791911265297649),\n",
        " (0.53695349937194692, 0.53707042977012709),\n",
        " (0.60108829050619672, nan),\n",
        " (0.6729751518671101, 0.63068079328671744)]"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}