{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pop</th>\n",
       "      <th>state</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> 1.5</td>\n",
       "      <td> OH</td>\n",
       "      <td> 2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> 1.7</td>\n",
       "      <td> OH</td>\n",
       "      <td> 2001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> 3.6</td>\n",
       "      <td> OH</td>\n",
       "      <td> 2002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> 2.4</td>\n",
       "      <td> NV</td>\n",
       "      <td> 2001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> 2.9</td>\n",
       "      <td> NV</td>\n",
       "      <td> 2002</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pop state  year\n",
       "0  1.5    OH  2000\n",
       "1  1.7    OH  2001\n",
       "2  3.6    OH  2002\n",
       "3  2.4    NV  2001\n",
       "4  2.9    NV  2002"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    " '''\n",
    "Pandas is one of the most powerful contributions to python for quick and easy data analysis. Data Science is dominated by\n",
    "one common data structure - the table. Python never had a great native way to manipulate tables in ways that many analysts\n",
    "are used to (if you're at all familliar with spreadsheets or relational databases). The basic Pandas data structure is the Data \n",
    "Frame which, if you are an R user, should sound familliar.\n",
    "\n",
    "This module is a very high level treatment of basic data operations one typically uses when manipulating tables in Python. \n",
    "To really learn all of the details, refer to the book.\n",
    "'''\n",
    "\n",
    "#To import\n",
    "import pandas as pd #Its common to use pd as the abbreviation\n",
    "from pandas import Series, DataFrame #Wes McKinney recommends importing these separately - they are used so often and benefit from having their own namespace\n",
    "\n",
    "'''\n",
    "The Series - for this module we'll skip the Series (see book for details), but we will define it. A Series is a one dimensional\n",
    "array like object that has an array plus an index, which labels the array entries. Once we present a Data Frame, one can think\n",
    "of a series as similar to a Data Frame with just one column.\n",
    "'''\n",
    "\n",
    "'''\n",
    "A simple example of the DataFrame - building one from a dictionary\n",
    "(note for this to work each list has to be the same length)\n",
    "'''\n",
    "data = {'state':['OH', 'OH', 'OH', 'NV', 'NV'], \n",
    "        'year':[2000, 2001, 2002, 2001, 2002],\n",
    "        'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}\n",
    "\n",
    "frame = pd.DataFrame(data) #This function will turn the dict to the data frame. Notice that the keys become columns and an index is created\n",
    "\n",
    "frame\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0    OH\n",
       " 1    OH\n",
       " 2    OH\n",
       " 3    NV\n",
       " 4    NV\n",
       " Name: state, dtype: object, 0    OH\n",
       " 1    OH\n",
       " 2    OH\n",
       " 3    NV\n",
       " 4    NV\n",
       " Name: state, dtype: object)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#To retrieve columns...use dict-like notation or use the column name as an attribute\n",
    "frame['state'], frame.state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(   pop state  year\n",
       " 1  1.7    OH  2001, pop       1.7\n",
       " state      OH\n",
       " year     2001\n",
       " Name: 1, dtype: object)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#To retrieve a row, you can index it like a list, or use the actual row index name using the .ix method\n",
    "frame[1:2], frame.ix[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pop</th>\n",
       "      <th>state</th>\n",
       "      <th>year</th>\n",
       "      <th>big_pop</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> 1.5</td>\n",
       "      <td> OH</td>\n",
       "      <td> 2000</td>\n",
       "      <td> False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> 1.7</td>\n",
       "      <td> OH</td>\n",
       "      <td> 2001</td>\n",
       "      <td> False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> 3.6</td>\n",
       "      <td> OH</td>\n",
       "      <td> 2002</td>\n",
       "      <td>  True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> 2.4</td>\n",
       "      <td> NV</td>\n",
       "      <td> 2001</td>\n",
       "      <td> False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> 2.9</td>\n",
       "      <td> NV</td>\n",
       "      <td> 2002</td>\n",
       "      <td> False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pop state  year big_pop\n",
       "0  1.5    OH  2000   False\n",
       "1  1.7    OH  2001   False\n",
       "2  3.6    OH  2002    True\n",
       "3  2.4    NV  2001   False\n",
       "4  2.9    NV  2002   False"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Assigning a new column is easy too\n",
    "frame['big_pop'] = (frame['pop']>3)\n",
    "frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rand1</th>\n",
       "      <th>OrigOrd</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> 0.643863</td>\n",
       "      <td> 2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> 0.298427</td>\n",
       "      <td> 4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td> 0.118110</td>\n",
       "      <td> 7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>-0.220349</td>\n",
       "      <td> 5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>-0.393215</td>\n",
       "      <td> 9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.452396</td>\n",
       "      <td> 0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.502908</td>\n",
       "      <td> 3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>-0.585668</td>\n",
       "      <td> 6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.987663</td>\n",
       "      <td> 1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>-1.774430</td>\n",
       "      <td> 8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Rand1  OrigOrd\n",
       "2  0.643863        2\n",
       "4  0.298427        4\n",
       "7  0.118110        7\n",
       "5 -0.220349        5\n",
       "9 -0.393215        9\n",
       "0 -0.452396        0\n",
       "3 -0.502908        3\n",
       "6 -0.585668        6\n",
       "1 -0.987663        1\n",
       "8 -1.774430        8"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "One operation on data that is frequent enough to highlight here is sorting\n",
    "'''\n",
    "import numpy as np\n",
    "\n",
    "df = pd.DataFrame(np.random.randn(10,1), columns = ['Rand1'])\n",
    "df['OrigOrd'] = df.index.values\n",
    "df = df.sort_index(by = 'Rand1', ascending = False) #Sorting by a particular column\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rand1</th>\n",
       "      <th>OrigOrd</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.452396</td>\n",
       "      <td> 0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.987663</td>\n",
       "      <td> 1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> 0.643863</td>\n",
       "      <td> 2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.502908</td>\n",
       "      <td> 3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> 0.298427</td>\n",
       "      <td> 4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>-0.220349</td>\n",
       "      <td> 5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>-0.585668</td>\n",
       "      <td> 6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td> 0.118110</td>\n",
       "      <td> 7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>-1.774430</td>\n",
       "      <td> 8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>-0.393215</td>\n",
       "      <td> 9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Rand1  OrigOrd\n",
       "0 -0.452396        0\n",
       "1 -0.987663        1\n",
       "2  0.643863        2\n",
       "3 -0.502908        3\n",
       "4  0.298427        4\n",
       "5 -0.220349        5\n",
       "6 -0.585668        6\n",
       "7  0.118110        7\n",
       "8 -1.774430        8\n",
       "9 -0.393215        9"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df.sort_index() #Now sorting back, using the index\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>rand_float</th>\n",
       "      <th>rand_int</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> a</td>\n",
       "      <td>-1.253479</td>\n",
       "      <td> 2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> b</td>\n",
       "      <td> 1.285238</td>\n",
       "      <td> 1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> c</td>\n",
       "      <td> 0.312605</td>\n",
       "      <td> 2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> d</td>\n",
       "      <td>-1.128266</td>\n",
       "      <td> 2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> e</td>\n",
       "      <td>-0.096089</td>\n",
       "      <td> 0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td> f</td>\n",
       "      <td>-0.920994</td>\n",
       "      <td> 3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td> g</td>\n",
       "      <td>-1.790160</td>\n",
       "      <td> 4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td> h</td>\n",
       "      <td>-0.273019</td>\n",
       "      <td> 4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td> i</td>\n",
       "      <td>-0.922280</td>\n",
       "      <td> 3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td> j</td>\n",
       "      <td>-0.063663</td>\n",
       "      <td> 0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  key  rand_float  rand_int\n",
       "0   a   -1.253479         2\n",
       "1   b    1.285238         1\n",
       "2   c    0.312605         2\n",
       "3   d   -1.128266         2\n",
       "4   e   -0.096089         0\n",
       "5   f   -0.920994         3\n",
       "6   g   -1.790160         4\n",
       "7   h   -0.273019         4\n",
       "8   i   -0.922280         3\n",
       "9   j   -0.063663         0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "Some of the real power we are after is the ability to condense, merge and concatenate data sets. This is where we\n",
    "want Python to have the same data munging functionality we usually get from executing SQL statements on relational\n",
    "databases.\n",
    "'''\n",
    "\n",
    "alpha = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']\n",
    "df1 = DataFrame({'rand_float':np.random.randn(10), 'key':alpha})\n",
    "df2 = DataFrame({'rand_int':np.random.randint(0, 5, size = 10), 'key':alpha})\n",
    "\n",
    "'''\n",
    "So we have two dataframes that share indexes (in this case all of them). We want to combine them. In sql we would execute\n",
    "a join, such as Select * from table1 a join table2 b on a.key=b.key;\n",
    "'''\n",
    "df_merge = pd.merge(df1,df2,on='key')\n",
    "df_merge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rand_float</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rand_int</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.079876</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> 1.285238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.689713</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.921637</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1.031590</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          rand_float\n",
       "rand_int            \n",
       "0          -0.079876\n",
       "1           1.285238\n",
       "2          -0.689713\n",
       "3          -0.921637\n",
       "4          -1.031590"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "Now that we have this merged table, we might want to summarize it within a key grouping\n",
    "'''\n",
    "df_merge.groupby('rand_int').mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"4\" halign=\"left\">rand_float</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>sum</th>\n",
       "      <th>mean</th>\n",
       "      <th>len</th>\n",
       "      <th>std</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rand_int</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.159752</td>\n",
       "      <td>-0.079876</td>\n",
       "      <td> 2</td>\n",
       "      <td> 0.022929</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> 1.285238</td>\n",
       "      <td> 1.285238</td>\n",
       "      <td> 1</td>\n",
       "      <td>      NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-2.069140</td>\n",
       "      <td>-0.689713</td>\n",
       "      <td> 3</td>\n",
       "      <td> 0.870288</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-1.843274</td>\n",
       "      <td>-0.921637</td>\n",
       "      <td> 2</td>\n",
       "      <td> 0.000909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-2.063179</td>\n",
       "      <td>-1.031590</td>\n",
       "      <td> 2</td>\n",
       "      <td> 1.072780</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          rand_float                         \n",
       "                 sum      mean  len       std\n",
       "rand_int                                     \n",
       "0          -0.159752 -0.079876    2  0.022929\n",
       "1           1.285238  1.285238    1       NaN\n",
       "2          -2.069140 -0.689713    3  0.870288\n",
       "3          -1.843274 -0.921637    2  0.000909\n",
       "4          -2.063179 -1.031590    2  1.072780"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "You can have multiple aggregation functions, but the syntax isn't the same\n",
    "'''\n",
    "df_merge.groupby('rand_int').agg([np.sum, np.mean, len, np.std])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}