{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random\n",
    "import string\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'KXDIWQKMP'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "randStr = lambda str_type,length:\"\".join(np.random.choice(np.array(list(eval(str_type))),length))\n",
    "randStr(\"string.ascii_uppercase\",9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>iovrrx</th>\n",
       "      <th>nfinsu</th>\n",
       "      <th>mvdfjc</th>\n",
       "      <th>idjges</th>\n",
       "      <th>fubmrg</th>\n",
       "      <th>lvuhfv</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.987654</td>\n",
       "      <td>0.206104</td>\n",
       "      <td>0.802920</td>\n",
       "      <td>0.011157</td>\n",
       "      <td>0.860618</td>\n",
       "      <td>0.575871</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.706397</td>\n",
       "      <td>0.860083</td>\n",
       "      <td>0.939230</td>\n",
       "      <td>0.436194</td>\n",
       "      <td>0.557081</td>\n",
       "      <td>0.706964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.043139</td>\n",
       "      <td>0.729435</td>\n",
       "      <td>0.597488</td>\n",
       "      <td>0.700998</td>\n",
       "      <td>0.974193</td>\n",
       "      <td>0.917758</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.316080</td>\n",
       "      <td>0.461547</td>\n",
       "      <td>0.844540</td>\n",
       "      <td>0.510143</td>\n",
       "      <td>0.908475</td>\n",
       "      <td>0.877330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.828839</td>\n",
       "      <td>0.177670</td>\n",
       "      <td>0.610833</td>\n",
       "      <td>0.328238</td>\n",
       "      <td>0.327697</td>\n",
       "      <td>0.689756</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     iovrrx    nfinsu    mvdfjc    idjges    fubmrg    lvuhfv\n",
       "0  0.987654  0.206104  0.802920  0.011157  0.860618  0.575871\n",
       "1  0.706397  0.860083  0.939230  0.436194  0.557081  0.706964\n",
       "2  0.043139  0.729435  0.597488  0.700998  0.974193  0.917758\n",
       "3  0.316080  0.461547  0.844540  0.510143  0.908475  0.877330\n",
       "4  0.828839  0.177670  0.610833  0.328238  0.327697  0.689756"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def mkRandDf(length=None, width=None, str_kind=None, str_len=None):\n",
    "    \"\"\"\n",
    "    Parameters\n",
    "    ----------\n",
    "    length : int\n",
    "    width : int\n",
    "    str_kind : str\n",
    "        Any of the types of strings in strings module e.g.\n",
    "            \"acscii_lower\", \"ascii_upper\", \"digits\", ect...\n",
    "    str_len : int\n",
    "    \"\"\"\n",
    "    str_kind = str_kind or \"ascii_uppercase\"\n",
    "    str_kind = \"string.\"+ str_kind\n",
    "    str_len = width or 10\n",
    "    \n",
    "    randStr = lambda str_type, length :\"\".join(np.random.choice(np.array(list(eval(str_type))),length))\n",
    "    \n",
    "    column_names = [randStr(str_kind, width) for i in range(width)]\n",
    "    \n",
    "    rand_df = pd.DataFrame(np.random.rand(length, width))\n",
    "    rand_df.columns = column_names\n",
    "    \n",
    "    return rand_df\n",
    "\n",
    "my_df = mkRandDf(5, 6, str_kind=\"ascii_lowercase\")\n",
    "my_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "--- \n",
    "# `ix` to slice up like a `numpy` array."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.046928\n",
       "1    0.736357\n",
       "2    0.020734\n",
       "3    0.973336\n",
       "4    0.827972\n",
       "Name: aekbyu, dtype: float64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_df.ix[:,0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use `to_frame` to make a sliced series into a data frame of its own."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>aekbyu</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.046928</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.736357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.020734</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.973336</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.827972</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     aekbyu\n",
       "0  0.046928\n",
       "1  0.736357\n",
       "2  0.020734\n",
       "3  0.973336\n",
       "4  0.827972"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_df.ix[:,0].to_frame()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "### MultiIndexing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>left</th>\n",
       "      <th>middle</th>\n",
       "      <th>right</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.912992</td>\n",
       "      <td>0.544838</td>\n",
       "      <td>0.489038</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.977553</td>\n",
       "      <td>0.265588</td>\n",
       "      <td>0.342345</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.266270</td>\n",
       "      <td>0.121333</td>\n",
       "      <td>0.871679</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       left    middle     right\n",
       "0  0.912992  0.544838  0.489038\n",
       "1  0.977553  0.265588  0.342345\n",
       "2  0.266270  0.121333  0.871679"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_array = np.random.rand(3,3)\n",
    "data = pd.DataFrame(data_array,columns = [\"left\",\"middle\",\"right\"])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def superGroup(dataframe=None,new_level=None):\n",
    "    if type(dataframe.columns) == pd.indexes.base.Index:\n",
    "        out_df = pd.DataFrame(dataframe.values,index=dataframe.index,columns=pd.MultiIndex.from_product([[new_level],dataframe.columns]))\n",
    "        return out_df\n",
    "    if type(dataframe.columns) == pd.indexes.multi.MultiIndex:\n",
    "        levels = [list(i.values) for i in dataframe.columns.levels]\n",
    "        levels = [[new_level]]+levels\n",
    "        out_df = pd.DataFrame(dataframe.values, index = dataframe.index, columns = pd.MultiIndex.from_product(levels))\n",
    "        return out_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"3\" halign=\"left\">Most New</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"3\" halign=\"left\">Newest</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"3\" halign=\"left\">New</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>left</th>\n",
       "      <th>middle</th>\n",
       "      <th>right</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.394329</td>\n",
       "      <td>0.298836</td>\n",
       "      <td>0.683910</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.847183</td>\n",
       "      <td>0.630419</td>\n",
       "      <td>0.171109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.747384</td>\n",
       "      <td>0.058720</td>\n",
       "      <td>0.129397</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Most New                    \n",
       "     Newest                    \n",
       "        New                    \n",
       "       left    middle     right\n",
       "0  0.394329  0.298836  0.683910\n",
       "1  0.847183  0.630419  0.171109\n",
       "2  0.747384  0.058720  0.129397"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hey = superGroup(data,\"New\")\n",
    "works = superGroup(hey,\"Newest\")\n",
    "superGroup(works,\"Most New\")\n",
    "# hey.columns.labels"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "code review question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10000 loops, best of 3: 25.3 µs per loop\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "color = [\"1xYellow ; 2xRed \",\n",
    "         \"2xYellow ; 1xOrange \",\n",
    "         \"3xYellow ; 2xGreen \",\n",
    "         \"1xYellow ; 1xRed \",\n",
    "         \"2xYellow ; 1xRed \"]\n",
    "numbers = np.random.rand(len(color))\n",
    "ex_df = pd.DataFrame(np.array([numbers,color]).T,\n",
    "                     columns = [\"numbers\",\"colors\"])\n",
    "rx = re.compile(\"x(\\w+)\\s\")\n",
    "just_colors = ex_df.colors.apply(rx.findall)\n",
    "%timeit present_colors = set(sum(just_colors,[]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# present_colors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Green ', 'Orange ', 'Red ', 'Yellow '}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "present_colors = set()\n",
    "for value in ex_df['colors'].values:\n",
    "    for color in [x.strip() for x in value.split(';')]:\n",
    "        present_colors.add(color.split('x')[-1])\n",
    "\n",
    "present_colors = set(sum(([color.split('x')[-1] for color in value.split(';')]\n",
    "                          for value in ex_df['colors'].values), []))\n",
    "present_colors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "---\n",
    "## [CODEREVIEW - Is there a more efficient way to subtract multiple columns in a PANDAS DataFrame by a series (or single column)?](http://codereview.stackexchange.com/questions/156447/is-there-a-more-efficient-way-to-subtract-multiple-columns-in-a-pandas-dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     iovrrx    nfinsu    mvdfjc    idjges    fubmrg    lvuhfv\n",
      "0  0.987654  0.206104  0.802920  0.011157  0.860618  0.575871\n",
      "1  0.706397  0.860083  0.939230  0.436194  0.557081  0.706964\n",
      "2  0.043139  0.729435  0.597488  0.700998  0.974193  0.917758\n",
      "3  0.316080  0.461547  0.844540  0.510143  0.908475  0.877330\n",
      "4  0.828839  0.177670  0.610833  0.328238  0.327697  0.689756\n"
     ]
    }
   ],
   "source": [
    "print(my_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "section_A = my_df.ix[:,:3]\n",
    "section_B = my_df.ix[:,3:]\n",
    "# section_B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     iovrrx    nfinsu    mvdfjc    idjges    fubmrg    lvuhfv\n",
      "0  1.667378 -0.593258  1.368628 -4.800610  1.468744  0.889117\n",
      "1  0.056992  0.340988  0.467991 -0.638518 -0.285601  0.058149\n",
      "2 -3.467018  0.612699  0.324830  0.555330  1.030127  0.944032\n",
      "3 -0.941776 -0.395590  0.476099 -0.251165  0.581380  0.531053\n",
      "4  0.933714 -1.288174  0.493400 -0.402633 -0.405015  0.668708\n"
     ]
    }
   ],
   "source": [
    "log_div_ave = my_df.apply(np.log2).values.T - my_df.apply(np.log2).mean(axis=1).values\n",
    "\n",
    "log_div_ave = pd.DataFrame(log_div_ave.T,columns=my_df.columns)\n",
    "\n",
    "print(log_div_ave)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}