{
 "metadata": {
  "name": "",
  "signature": "sha256:0f9af32c03c3961c6aa73ac9230dcba29211d2d6932125b0b270ce77b14385d3"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Convert A String Categorical Variable With Patsy\n",
      "\n",
      "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n",
      "- **Date:** -\n",
      "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n",
      "- **Note:**\n",
      "\n",
      "Originally from: Data Origami."
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### import modules"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import patsy"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Create dataframe"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "raw_data = {'patient': [1, 1, 1, 0, 0], \n",
      "        'obs': [1, 2, 3, 1, 2], \n",
      "        'treatment': [0, 1, 0, 1, 0],\n",
      "        'score': ['strong', 'weak', 'normal', 'weak', 'strong']} \n",
      "df = pd.DataFrame(raw_data, columns = ['patient', 'obs', 'treatment', 'score'])\n",
      "df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>patient</th>\n",
        "      <th>obs</th>\n",
        "      <th>treatment</th>\n",
        "      <th>score</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> strong</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 1</td>\n",
        "      <td> 2</td>\n",
        "      <td> 1</td>\n",
        "      <td>   weak</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 1</td>\n",
        "      <td> 3</td>\n",
        "      <td> 0</td>\n",
        "      <td> normal</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td>   weak</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> 0</td>\n",
        "      <td> 2</td>\n",
        "      <td> 0</td>\n",
        "      <td> strong</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 15,
       "text": [
        "   patient  obs  treatment   score\n",
        "0        1    1          0  strong\n",
        "1        1    2          1    weak\n",
        "2        1    3          0  normal\n",
        "3        0    1          1    weak\n",
        "4        0    2          0  strong"
       ]
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Convert df['score'] into a categorical variable ready for regression (i.e. set one category as the baseline)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# On the 'score' variable in the df dataframe, convert to a categorical variable, and spit out a dataframe\n",
      "patsy.dmatrix('score', df, return_type='dataframe')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>Intercept</th>\n",
        "      <th>score[T.strong]</th>\n",
        "      <th>score[T.weak]</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 19,
       "text": [
        "   Intercept  score[T.strong]  score[T.weak]\n",
        "0          1                1              0\n",
        "1          1                0              1\n",
        "2          1                0              0\n",
        "3          1                0              1\n",
        "4          1                1              0"
       ]
      }
     ],
     "prompt_number": 19
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Convert df['score'] into a categorical variable without setting one category as baseline\n",
      "\n",
      "This is likely what you will want to do"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# On the 'score' variable in the df dataframe, convert to a categorical variable, and spit out a dataframe\n",
      "patsy.dmatrix('score - 1', df, return_type='dataframe')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>score[normal]</th>\n",
        "      <th>score[strong]</th>\n",
        "      <th>score[weak]</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 20,
       "text": [
        "   score[normal]  score[strong]  score[weak]\n",
        "0              0              1            0\n",
        "1              0              0            1\n",
        "2              1              0            0\n",
        "3              0              0            1\n",
        "4              0              1            0"
       ]
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Create a variable that is \"1\" if the variables of patient and treatment are both 1"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "patsy.dmatrix('patient + treatment + patient:treatment-1', df, return_type='dataframe')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>patient</th>\n",
        "      <th>treatment</th>\n",
        "      <th>patient:treatment</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 18,
       "text": [
        "   patient  treatment  patient:treatment\n",
        "0        1          0                  0\n",
        "1        1          1                  1\n",
        "2        1          0                  0\n",
        "3        0          1                  0\n",
        "4        0          0                  0"
       ]
      }
     ],
     "prompt_number": 18
    }
   ],
   "metadata": {}
  }
 ]
}