{
 "metadata": {
  "name": "",
  "signature": "sha256:11993ebaa7aa1c5ebb5c3eaff271ec7cd7eb035bd2793e76d9837603a767a572"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Breaking Up A String Into Columns Using Regex In Pandas\n",
      "\n",
      "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n",
      "- **Date:** -\n",
      "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n",
      "- **Note:** Originally based on [this tutorial in nbviewer](http://nbviewer.ipython.org/github/swcarpentry/notebooks/blob/master/regex-intro.ipynb)."
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Import modules"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import re\n",
      "import pandas as pd"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 24
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Create a dataframe of raw strings"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a dataframe with a single column of strings\n",
      "data = {'raw': ['Arizona 1 2014-12-23       3242.0',\n",
      "                'Iowa 1 2010-02-23       3453.7',\n",
      "                'Oregon 0 2014-06-20       2123.0',\n",
      "                'Maryland 0 2014-03-14       1123.6',\n",
      "                'Florida 1 2013-01-15       2134.0',\n",
      "                'Georgia 0 2012-07-14       2345.6']}\n",
      "df = pd.DataFrame(data, columns = ['raw'])\n",
      "df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>raw</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>  Arizona 1 2014-12-23       3242.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>     Iowa 1 2010-02-23       3453.7</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>   Oregon 0 2014-06-20       2123.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> Maryland 0 2014-03-14       1123.6</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>  Florida 1 2013-01-15       2134.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
        "      <td>  Georgia 0 2012-07-14       2345.6</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>6 rows \u00d7 1 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 25,
       "text": [
        "                                  raw\n",
        "0   Arizona 1 2014-12-23       3242.0\n",
        "1      Iowa 1 2010-02-23       3453.7\n",
        "2    Oregon 0 2014-06-20       2123.0\n",
        "3  Maryland 0 2014-03-14       1123.6\n",
        "4   Florida 1 2013-01-15       2134.0\n",
        "5   Georgia 0 2012-07-14       2345.6\n",
        "\n",
        "[6 rows x 1 columns]"
       ]
      }
     ],
     "prompt_number": 25
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Search a column of strings for a pattern"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Which rows of df['raw'] contain 'xxxx-xx-xx'?\n",
      "df['raw'].str.contains('....-..-..', regex=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 26,
       "text": [
        "0    True\n",
        "1    True\n",
        "2    True\n",
        "3    True\n",
        "4    True\n",
        "5    True\n",
        "Name: raw, dtype: bool"
       ]
      }
     ],
     "prompt_number": 26
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Extract the column of single digits"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# In the column 'raw', extract single digit in the strings\n",
      "df['female'] = df['raw'].str.extract('(\\d)')\n",
      "df['female']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 27,
       "text": [
        "0    1\n",
        "1    1\n",
        "2    0\n",
        "3    0\n",
        "4    1\n",
        "5    0\n",
        "Name: female, dtype: object"
       ]
      }
     ],
     "prompt_number": 27
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Extract the column of dates"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# In the column 'raw', extract xxxx-xx-xx in the strings\n",
      "df['date'] = df['raw'].str.extract('(....-..-..)')\n",
      "df['date']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 28,
       "text": [
        "0    2014-12-23\n",
        "1    2010-02-23\n",
        "2    2014-06-20\n",
        "3    2014-03-14\n",
        "4    2013-01-15\n",
        "5    2012-07-14\n",
        "Name: date, dtype: object"
       ]
      }
     ],
     "prompt_number": 28
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Extract the column of thousands"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# In the column 'raw', extract ####.## in the strings\n",
      "df['score'] = df['raw'].str.extract('(\\d\\d\\d\\d\\.\\d)')\n",
      "df['score']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 29,
       "text": [
        "0    3242.0\n",
        "1    3453.7\n",
        "2    2123.0\n",
        "3    1123.6\n",
        "4    2134.0\n",
        "5    2345.6\n",
        "Name: score, dtype: object"
       ]
      }
     ],
     "prompt_number": 29
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Extract the column of words"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# In the column 'raw', extract the word in the strings\n",
      "df['state'] = df['raw'].str.extract('([A-Z]\\w{0,})')\n",
      "df['state']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 31,
       "text": [
        "0     Arizona\n",
        "1        Iowa\n",
        "2      Oregon\n",
        "3    Maryland\n",
        "4     Florida\n",
        "5     Georgia\n",
        "Name: state, dtype: object"
       ]
      }
     ],
     "prompt_number": 31
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### View the final dataframe"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>raw</th>\n",
        "      <th>female</th>\n",
        "      <th>date</th>\n",
        "      <th>score</th>\n",
        "      <th>state</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>  Arizona 1 2014-12-23       3242.0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 2014-12-23</td>\n",
        "      <td> 3242.0</td>\n",
        "      <td>  Arizona</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>     Iowa 1 2010-02-23       3453.7</td>\n",
        "      <td> 1</td>\n",
        "      <td> 2010-02-23</td>\n",
        "      <td> 3453.7</td>\n",
        "      <td>     Iowa</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>   Oregon 0 2014-06-20       2123.0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 2014-06-20</td>\n",
        "      <td> 2123.0</td>\n",
        "      <td>   Oregon</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> Maryland 0 2014-03-14       1123.6</td>\n",
        "      <td> 0</td>\n",
        "      <td> 2014-03-14</td>\n",
        "      <td> 1123.6</td>\n",
        "      <td> Maryland</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>  Florida 1 2013-01-15       2134.0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 2013-01-15</td>\n",
        "      <td> 2134.0</td>\n",
        "      <td>  Florida</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
        "      <td>  Georgia 0 2012-07-14       2345.6</td>\n",
        "      <td> 0</td>\n",
        "      <td> 2012-07-14</td>\n",
        "      <td> 2345.6</td>\n",
        "      <td>  Georgia</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>6 rows \u00d7 5 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 33,
       "text": [
        "                                  raw female        date   score     state\n",
        "0   Arizona 1 2014-12-23       3242.0      1  2014-12-23  3242.0   Arizona\n",
        "1      Iowa 1 2010-02-23       3453.7      1  2010-02-23  3453.7      Iowa\n",
        "2    Oregon 0 2014-06-20       2123.0      0  2014-06-20  2123.0    Oregon\n",
        "3  Maryland 0 2014-03-14       1123.6      0  2014-03-14  1123.6  Maryland\n",
        "4   Florida 1 2013-01-15       2134.0      1  2013-01-15  2134.0   Florida\n",
        "5   Georgia 0 2012-07-14       2345.6      0  2012-07-14  2345.6   Georgia\n",
        "\n",
        "[6 rows x 5 columns]"
       ]
      }
     ],
     "prompt_number": 33
    }
   ],
   "metadata": {}
  }
 ]
}