{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# stdlib!\n",
    "\n",
    "We're going to import the data, including our own type converter:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "conversion_map = {\n",
    "    'carat': float,\n",
    "    'depth': float,\n",
    "    'price': int,\n",
    "    'table': float,\n",
    "    'x': float,\n",
    "    'y': float,\n",
    "    'z': float\n",
    "}\n",
    "def converter(type_map, row):\n",
    "    \"\"\"Yep, we need to roll our own type conversions.\"\"\"\n",
    "    converted_row = {}\n",
    "    for col, val in row.items():\n",
    "        converter = type_map.get(col)\n",
    "        if converter:\n",
    "            converted_row[col] = converter(val)\n",
    "        else:\n",
    "            converted_row[col] = val\n",
    "    return converted_row\n",
    "\n",
    "with open('diamonds.csv', 'r') as f:\n",
    "    reader = csv.DictReader(f)\n",
    "    diamonds = [converter(conversion_map, r) for r in reader]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'': '1',\n",
       "  'carat': 0.23,\n",
       "  'clarity': 'SI2',\n",
       "  'color': 'E',\n",
       "  'cut': 'Ideal',\n",
       "  'depth': 61.5,\n",
       "  'price': 326,\n",
       "  'table': 55.0,\n",
       "  'x': 3.95,\n",
       "  'y': 3.98,\n",
       "  'z': 2.43},\n",
       " {'': '2',\n",
       "  'carat': 0.21,\n",
       "  'clarity': 'SI1',\n",
       "  'color': 'E',\n",
       "  'cut': 'Premium',\n",
       "  'depth': 59.8,\n",
       "  'price': 326,\n",
       "  'table': 61.0,\n",
       "  'x': 3.89,\n",
       "  'y': 3.84,\n",
       "  'z': 2.31}]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diamonds[:2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Adding things up is easy enough..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "43040.86999999912"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_total_carats():\n",
    "    total_carats = 0\n",
    "    for row in diamonds:\n",
    "        total_carats += row['carat']\n",
    "    return total_carats\n",
    "get_total_carats()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "43040.86999999912"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Faster/more compact: Generator expression!\n",
    "sum(row['carat'] for row in diamonds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100 loops, best of 3: 5.29 ms per loop\n"
     ]
    }
   ],
   "source": [
    "# Which is faster? \n",
    "%timeit get_total_carats()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100 loops, best of 3: 5.7 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit sum(row['carat'] for row in diamonds)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "But what if we want to group, then add?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# defaultdict is awesome. defaultdict is awesome.\n",
    "from collections import defaultdict\n",
    "\n",
    "def grouper(grouping_col, seq):\n",
    "    \"\"\"People have definitely written a faster version than what I'm about to write\"\"\"\n",
    "    groups = defaultdict(lambda: defaultdict(list))\n",
    "    for row in seq:\n",
    "        group = groups[row[grouping_col]]\n",
    "        for k, v in row.items():\n",
    "            if k != grouping_col:\n",
    "                group[k].append(v)\n",
    "    return groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "groups = grouper('cut', diamonds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# check that the groups include all the rows from the original data\n",
    "assert sum(len(g['price']) for g in groups.values()) == len(diamonds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Fair', 'Good', 'Ideal', 'Premium', 'Very Good'}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(groups)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "What if I wanted to do something like \n",
    "```sql\n",
    "select cut, mean(price)\n",
    "from diamonds\n",
    "group by cut;\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Fair': 4358.757763975155,\n",
       " 'Good': 3928.864451691806,\n",
       " 'Ideal': 3457.541970210199,\n",
       " 'Premium': 4584.2577042999055,\n",
       " 'Very Good': 3981.7598907465654}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary = {}\n",
    "for group, values in groups.items():\n",
    "    summary[group] = sum(values['price']) / len(values['price'])\n",
    "summary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "or how about this one:\n",
    "```sql\n",
    "select max(price)\n",
    "from diamonds\n",
    "where carat > 1;\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18823"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_max_price():\n",
    "    max_price = 0\n",
    "    for row in diamonds:\n",
    "        if row['carat'] > 1 and row['price'] > max_price:\n",
    "            max_price = row['price']\n",
    "    return max_price\n",
    "get_max_price()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18823"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# More compact yet again: generator expression!\n",
    "max(row['price'] for row in diamonds if row['carat'] > 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100 loops, best of 3: 6.82 ms per loop\n"
     ]
    }
   ],
   "source": [
    "# Which is faster?\n",
    "%timeit get_max_price()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100 loops, best of 3: 7.36 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit max(row['price'] for row in diamonds if row['carat'] > 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### itertools interlude\n",
    "\n",
    "The itertools module does lots of nice things. You should be aware of it, and use it where you would be writing your own bespoke counting thing. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Toolz!\n",
    "\n",
    "Lets see what it looks like to repeat some of this analysis using the toolz library. PSA: toolz has a pretty huge API, and it's worth reading through the docs: http://toolz.readthedocs.org/en/latest/api.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import toolz as tz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some quick toolz fun things:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 2, 3, 4, 5, 6]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(tz.concat(([1, 2, 3], (4, 5, 6))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['A-foo', 'B-foo', 'c-foo', 'd-foo', 'bar-foo', 'baz-foo']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(tz.mapcat(lambda r: [x + \"-foo\" for x in r],\n",
    "                          [[\"A\", \"B\"], (\"c\", \"d\"), (\"bar\", \"baz\")]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'D': 6775, 'E': 9797, 'F': 9542, 'G': 11292, 'H': 8304, 'I': 5422, 'J': 2808}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tz.frequencies([r['color'] for r in diamonds])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Remember that toolz is lazy- functions will return generator-like things:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<itertools.islice at 0x11176f260>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "take_2 = tz.take(2, diamonds)\n",
    "take_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<generator object unique at 0x11176b4b0>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unique_clarity = tz.unique(diamonds, key=lambda x: x.get('clarity'))\n",
    "unique_clarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'': '1',\n",
       "  'carat': 0.23,\n",
       "  'clarity': 'SI2',\n",
       "  'color': 'E',\n",
       "  'cut': 'Ideal',\n",
       "  'depth': 61.5,\n",
       "  'price': 326,\n",
       "  'table': 55.0,\n",
       "  'x': 3.95,\n",
       "  'y': 3.98,\n",
       "  'z': 2.43},\n",
       " {'': '2',\n",
       "  'carat': 0.21,\n",
       "  'clarity': 'SI1',\n",
       "  'color': 'E',\n",
       "  'cut': 'Premium',\n",
       "  'depth': 59.8,\n",
       "  'price': 326,\n",
       "  'table': 61.0,\n",
       "  'x': 3.89,\n",
       "  'y': 3.84,\n",
       "  'z': 2.31}]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(take_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'': '1',\n",
       "  'carat': 0.23,\n",
       "  'clarity': 'SI2',\n",
       "  'color': 'E',\n",
       "  'cut': 'Ideal',\n",
       "  'depth': 61.5,\n",
       "  'price': 326,\n",
       "  'table': 55.0,\n",
       "  'x': 3.95,\n",
       "  'y': 3.98,\n",
       "  'z': 2.43},\n",
       " {'': '2',\n",
       "  'carat': 0.21,\n",
       "  'clarity': 'SI1',\n",
       "  'color': 'E',\n",
       "  'cut': 'Premium',\n",
       "  'depth': 59.8,\n",
       "  'price': 326,\n",
       "  'table': 61.0,\n",
       "  'x': 3.89,\n",
       "  'y': 3.84,\n",
       "  'z': 2.31},\n",
       " {'': '3',\n",
       "  'carat': 0.23,\n",
       "  'clarity': 'VS1',\n",
       "  'color': 'E',\n",
       "  'cut': 'Good',\n",
       "  'depth': 56.9,\n",
       "  'price': 327,\n",
       "  'table': 65.0,\n",
       "  'x': 4.05,\n",
       "  'y': 4.07,\n",
       "  'z': 2.31},\n",
       " {'': '4',\n",
       "  'carat': 0.29,\n",
       "  'clarity': 'VS2',\n",
       "  'color': 'I',\n",
       "  'cut': 'Premium',\n",
       "  'depth': 62.4,\n",
       "  'price': 334,\n",
       "  'table': 58.0,\n",
       "  'x': 4.2,\n",
       "  'y': 4.23,\n",
       "  'z': 2.63},\n",
       " {'': '6',\n",
       "  'carat': 0.24,\n",
       "  'clarity': 'VVS2',\n",
       "  'color': 'J',\n",
       "  'cut': 'Very Good',\n",
       "  'depth': 62.8,\n",
       "  'price': 336,\n",
       "  'table': 57.0,\n",
       "  'x': 3.94,\n",
       "  'y': 3.96,\n",
       "  'z': 2.48},\n",
       " {'': '7',\n",
       "  'carat': 0.24,\n",
       "  'clarity': 'VVS1',\n",
       "  'color': 'I',\n",
       "  'cut': 'Very Good',\n",
       "  'depth': 62.3,\n",
       "  'price': 336,\n",
       "  'table': 57.0,\n",
       "  'x': 3.95,\n",
       "  'y': 3.98,\n",
       "  'z': 2.47},\n",
       " {'': '16',\n",
       "  'carat': 0.32,\n",
       "  'clarity': 'I1',\n",
       "  'color': 'E',\n",
       "  'cut': 'Premium',\n",
       "  'depth': 60.9,\n",
       "  'price': 345,\n",
       "  'table': 58.0,\n",
       "  'x': 4.38,\n",
       "  'y': 4.42,\n",
       "  'z': 2.68},\n",
       " {'': '230',\n",
       "  'carat': 0.52,\n",
       "  'clarity': 'IF',\n",
       "  'color': 'F',\n",
       "  'cut': 'Ideal',\n",
       "  'depth': 62.2,\n",
       "  'price': 2783,\n",
       "  'table': 55.0,\n",
       "  'x': 5.14,\n",
       "  'y': 5.18,\n",
       "  'z': 3.21}]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Note that this returns the entire object\n",
    "list(unique_clarity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'I1': 741,\n",
       " 'IF': 1790,\n",
       " 'SI1': 13065,\n",
       " 'SI2': 9194,\n",
       " 'VS1': 8171,\n",
       " 'VS2': 12258,\n",
       " 'VVS1': 3655,\n",
       " 'VVS2': 5066}"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# What are our clarity counts?\n",
    "tz.countby(lambda x: x['clarity'], diamonds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18823"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# What about our max price from above? Reduction!\n",
    "def comparo(accum, row):\n",
    "    price = row['price']\n",
    "    if price > accum:\n",
    "        return price\n",
    "    else:\n",
    "        return accum\n",
    "    \n",
    "tz.reduce(comparo, diamonds, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18823"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We could have also threaded here\n",
    "tz.thread_last(diamonds, \n",
    "               (tz.map, lambda x: x['price']), \n",
    "               max)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100 loops, best of 3: 8.6 ms per loop\n"
     ]
    }
   ],
   "source": [
    "# Which is faster?\n",
    "%timeit tz.reduce(comparo, diamonds, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100 loops, best of 3: 11.2 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit tz.thread_last(diamonds, (tz.map, lambda x: x['price']), max)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's look at another SQL query- say we want clarity and carats where price > 1000:\n",
    "```sql\n",
    "select count(1)\n",
    "from diamonds\n",
    "where price > 1000\n",
    "group by clarity;\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'I1': 675,\n",
       " 'IF': 1042,\n",
       " 'SI1': 9978,\n",
       " 'SI2': 8118,\n",
       " 'VS1': 5702,\n",
       " 'VS2': 8647,\n",
       " 'VVS1': 2108,\n",
       " 'VVS2': 3146}"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Toolz has currying!\n",
    "import toolz.curried as tzc\n",
    "tzc.pipe(diamonds, \n",
    "         tzc.filter(lambda r: r['price'] > 1000),\n",
    "         tzc.map(lambda r: (r['clarity'],)),\n",
    "         tzc.countby(lambda r: r[0]),\n",
    "         dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'I1': 675,\n",
       " 'IF': 1042,\n",
       " 'SI1': 9978,\n",
       " 'SI2': 8118,\n",
       " 'VS1': 5702,\n",
       " 'VS2': 8647,\n",
       " 'VVS1': 2108,\n",
       " 'VVS2': 3146}"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We can go about this another way as well:\n",
    "def filter_and_count(kv):\n",
    "    f_and_c = tz.thread_last(kv[1],\n",
    "                             (tz.filter, lambda r: r['price'] > 1000),\n",
    "                             tz.count)\n",
    "                                          \n",
    "    return kv[0], f_and_c\n",
    "\n",
    "tz.thread_last(diamonds,\n",
    "               (tz.groupby, 'clarity'),\n",
    "               (tz.itemmap, filter_and_count))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'I1': 675,\n",
       " 'IF': 1042,\n",
       " 'SI1': 9978,\n",
       " 'SI2': 8118,\n",
       " 'VS1': 5702,\n",
       " 'VS2': 8647,\n",
       " 'VVS1': 2108,\n",
       " 'VVS2': 3146}"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cleanest/best way: reduceby: Groupby + reduce\n",
    "def increment(accum, row):\n",
    "    if row['price'] > 1000:\n",
    "        return accum + 1\n",
    "    else:\n",
    "        return accum\n",
    "\n",
    "tz.reduceby('clarity', \n",
    "            increment,\n",
    "            diamonds, 0)                               "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Pandas!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# We don't need this to use Pandas, FYI\n",
    "import numpy as np\n",
    "\n",
    "# CSV reader is fast!\n",
    "df = pd.read_csv('diamonds.csv', index_col=0)\n",
    "# Keep this for later, we're going to overwrite df\n",
    "df_diamonds = df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>carat</th>\n",
       "      <th>cut</th>\n",
       "      <th>color</th>\n",
       "      <th>clarity</th>\n",
       "      <th>depth</th>\n",
       "      <th>table</th>\n",
       "      <th>price</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.23</td>\n",
       "      <td>Ideal</td>\n",
       "      <td>E</td>\n",
       "      <td>SI2</td>\n",
       "      <td>61.5</td>\n",
       "      <td>55</td>\n",
       "      <td>326</td>\n",
       "      <td>3.95</td>\n",
       "      <td>3.98</td>\n",
       "      <td>2.43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.21</td>\n",
       "      <td>Premium</td>\n",
       "      <td>E</td>\n",
       "      <td>SI1</td>\n",
       "      <td>59.8</td>\n",
       "      <td>61</td>\n",
       "      <td>326</td>\n",
       "      <td>3.89</td>\n",
       "      <td>3.84</td>\n",
       "      <td>2.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.23</td>\n",
       "      <td>Good</td>\n",
       "      <td>E</td>\n",
       "      <td>VS1</td>\n",
       "      <td>56.9</td>\n",
       "      <td>65</td>\n",
       "      <td>327</td>\n",
       "      <td>4.05</td>\n",
       "      <td>4.07</td>\n",
       "      <td>2.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.29</td>\n",
       "      <td>Premium</td>\n",
       "      <td>I</td>\n",
       "      <td>VS2</td>\n",
       "      <td>62.4</td>\n",
       "      <td>58</td>\n",
       "      <td>334</td>\n",
       "      <td>4.20</td>\n",
       "      <td>4.23</td>\n",
       "      <td>2.63</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.31</td>\n",
       "      <td>Good</td>\n",
       "      <td>J</td>\n",
       "      <td>SI2</td>\n",
       "      <td>63.3</td>\n",
       "      <td>58</td>\n",
       "      <td>335</td>\n",
       "      <td>4.34</td>\n",
       "      <td>4.35</td>\n",
       "      <td>2.75</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   carat      cut color clarity  depth  table  price     x     y     z\n",
       "1   0.23    Ideal     E     SI2   61.5     55    326  3.95  3.98  2.43\n",
       "2   0.21  Premium     E     SI1   59.8     61    326  3.89  3.84  2.31\n",
       "3   0.23     Good     E     VS1   56.9     65    327  4.05  4.07  2.31\n",
       "4   0.29  Premium     I     VS2   62.4     58    334  4.20  4.23  2.63\n",
       "5   0.31     Good     J     SI2   63.3     58    335  4.34  4.35  2.75"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>carat</th>\n",
       "      <th>depth</th>\n",
       "      <th>table</th>\n",
       "      <th>price</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.797940</td>\n",
       "      <td>61.749405</td>\n",
       "      <td>57.457184</td>\n",
       "      <td>3932.799722</td>\n",
       "      <td>5.731157</td>\n",
       "      <td>5.734526</td>\n",
       "      <td>3.538734</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.474011</td>\n",
       "      <td>1.432621</td>\n",
       "      <td>2.234491</td>\n",
       "      <td>3989.439738</td>\n",
       "      <td>1.121761</td>\n",
       "      <td>1.142135</td>\n",
       "      <td>0.705699</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.200000</td>\n",
       "      <td>43.000000</td>\n",
       "      <td>43.000000</td>\n",
       "      <td>326.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.400000</td>\n",
       "      <td>61.000000</td>\n",
       "      <td>56.000000</td>\n",
       "      <td>950.000000</td>\n",
       "      <td>4.710000</td>\n",
       "      <td>4.720000</td>\n",
       "      <td>2.910000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.700000</td>\n",
       "      <td>61.800000</td>\n",
       "      <td>57.000000</td>\n",
       "      <td>2401.000000</td>\n",
       "      <td>5.700000</td>\n",
       "      <td>5.710000</td>\n",
       "      <td>3.530000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>1.040000</td>\n",
       "      <td>62.500000</td>\n",
       "      <td>59.000000</td>\n",
       "      <td>5324.250000</td>\n",
       "      <td>6.540000</td>\n",
       "      <td>6.540000</td>\n",
       "      <td>4.040000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>5.010000</td>\n",
       "      <td>79.000000</td>\n",
       "      <td>95.000000</td>\n",
       "      <td>18823.000000</td>\n",
       "      <td>10.740000</td>\n",
       "      <td>58.900000</td>\n",
       "      <td>31.800000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              carat         depth         table         price             x  \\\n",
       "count  53940.000000  53940.000000  53940.000000  53940.000000  53940.000000   \n",
       "mean       0.797940     61.749405     57.457184   3932.799722      5.731157   \n",
       "std        0.474011      1.432621      2.234491   3989.439738      1.121761   \n",
       "min        0.200000     43.000000     43.000000    326.000000      0.000000   \n",
       "25%        0.400000     61.000000     56.000000    950.000000      4.710000   \n",
       "50%        0.700000     61.800000     57.000000   2401.000000      5.700000   \n",
       "75%        1.040000     62.500000     59.000000   5324.250000      6.540000   \n",
       "max        5.010000     79.000000     95.000000  18823.000000     10.740000   \n",
       "\n",
       "                  y             z  \n",
       "count  53940.000000  53940.000000  \n",
       "mean       5.734526      3.538734  \n",
       "std        1.142135      0.705699  \n",
       "min        0.000000      0.000000  \n",
       "25%        4.720000      2.910000  \n",
       "50%        5.710000      3.530000  \n",
       "75%        6.540000      4.040000  \n",
       "max       58.900000     31.800000  "
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>carat</th>\n",
       "      <th>depth</th>\n",
       "      <th>table</th>\n",
       "      <th>price</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>clarity</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>I1</th>\n",
       "      <td>1.283846</td>\n",
       "      <td>62.734278</td>\n",
       "      <td>58.303779</td>\n",
       "      <td>3924.168691</td>\n",
       "      <td>6.761093</td>\n",
       "      <td>6.709379</td>\n",
       "      <td>4.207908</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>IF</th>\n",
       "      <td>0.505123</td>\n",
       "      <td>61.510615</td>\n",
       "      <td>56.507207</td>\n",
       "      <td>2864.839106</td>\n",
       "      <td>4.968402</td>\n",
       "      <td>4.989827</td>\n",
       "      <td>3.061659</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SI1</th>\n",
       "      <td>0.850482</td>\n",
       "      <td>61.853042</td>\n",
       "      <td>57.662541</td>\n",
       "      <td>3996.001148</td>\n",
       "      <td>5.888383</td>\n",
       "      <td>5.888256</td>\n",
       "      <td>3.639845</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SI2</th>\n",
       "      <td>1.077648</td>\n",
       "      <td>61.772167</td>\n",
       "      <td>57.927181</td>\n",
       "      <td>5063.028606</td>\n",
       "      <td>6.401370</td>\n",
       "      <td>6.397826</td>\n",
       "      <td>3.948478</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>VS1</th>\n",
       "      <td>0.727158</td>\n",
       "      <td>61.667458</td>\n",
       "      <td>57.315151</td>\n",
       "      <td>3839.455391</td>\n",
       "      <td>5.572178</td>\n",
       "      <td>5.581828</td>\n",
       "      <td>3.441007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>VS2</th>\n",
       "      <td>0.763935</td>\n",
       "      <td>61.724417</td>\n",
       "      <td>57.417401</td>\n",
       "      <td>3924.989395</td>\n",
       "      <td>5.657709</td>\n",
       "      <td>5.658859</td>\n",
       "      <td>3.491478</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>VVS1</th>\n",
       "      <td>0.503321</td>\n",
       "      <td>61.624651</td>\n",
       "      <td>56.884460</td>\n",
       "      <td>2523.114637</td>\n",
       "      <td>4.960364</td>\n",
       "      <td>4.975075</td>\n",
       "      <td>3.061294</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>VVS2</th>\n",
       "      <td>0.596202</td>\n",
       "      <td>61.663778</td>\n",
       "      <td>57.024990</td>\n",
       "      <td>3283.737071</td>\n",
       "      <td>5.218454</td>\n",
       "      <td>5.232118</td>\n",
       "      <td>3.221465</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            carat      depth      table        price         x         y  \\\n",
       "clarity                                                                    \n",
       "I1       1.283846  62.734278  58.303779  3924.168691  6.761093  6.709379   \n",
       "IF       0.505123  61.510615  56.507207  2864.839106  4.968402  4.989827   \n",
       "SI1      0.850482  61.853042  57.662541  3996.001148  5.888383  5.888256   \n",
       "SI2      1.077648  61.772167  57.927181  5063.028606  6.401370  6.397826   \n",
       "VS1      0.727158  61.667458  57.315151  3839.455391  5.572178  5.581828   \n",
       "VS2      0.763935  61.724417  57.417401  3924.989395  5.657709  5.658859   \n",
       "VVS1     0.503321  61.624651  56.884460  2523.114637  4.960364  4.975075   \n",
       "VVS2     0.596202  61.663778  57.024990  3283.737071  5.218454  5.232118   \n",
       "\n",
       "                z  \n",
       "clarity            \n",
       "I1       4.207908  \n",
       "IF       3.061659  \n",
       "SI1      3.639845  \n",
       "SI2      3.948478  \n",
       "VS1      3.441007  \n",
       "VS2      3.491478  \n",
       "VVS1     3.061294  \n",
       "VVS2     3.221465  "
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby('clarity').mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Our previous queries:\n",
    "```sql\n",
    "select cut, mean(price)\n",
    "from diamonds\n",
    "group by cut;\n",
    "\n",
    "select count(carat)\n",
    "from diamonds\n",
    "where price > 1000\n",
    "group by clarity;\n",
    "\n",
    "select max(price)\n",
    "from diamonds\n",
    "where carat > 1;\n",
    "\n",
    "select cut, price\n",
    "from diamonds\n",
    "where cut in ('Ideal', 'Premium')\n",
    "order by price desc\n",
    "limit 10;\n",
    "```\n",
    "\n",
    "Are pretty trivial operations in Pandas:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "cut\n",
       "Fair         4358.757764\n",
       "Good         3928.864452\n",
       "Ideal        3457.541970\n",
       "Premium      4584.257704\n",
       "Very Good    3981.759891\n",
       "Name: price, dtype: float64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby('cut')['price'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "clarity\n",
       "I1       675\n",
       "IF      1042\n",
       "SI1     9978\n",
       "SI2     8118\n",
       "VS1     5702\n",
       "VS2     8647\n",
       "VVS1    2108\n",
       "VVS2    3146\n",
       "Name: carat, dtype: int64"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['price'] > 1000].groupby('clarity')['carat'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18823"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['carat'] > 1]['price'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>carat</th>\n",
       "      <th>cut</th>\n",
       "      <th>color</th>\n",
       "      <th>clarity</th>\n",
       "      <th>depth</th>\n",
       "      <th>table</th>\n",
       "      <th>price</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>27750</th>\n",
       "      <td>2.29</td>\n",
       "      <td>Premium</td>\n",
       "      <td>I</td>\n",
       "      <td>VS2</td>\n",
       "      <td>60.8</td>\n",
       "      <td>60</td>\n",
       "      <td>18823</td>\n",
       "      <td>8.50</td>\n",
       "      <td>8.47</td>\n",
       "      <td>5.16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27748</th>\n",
       "      <td>1.51</td>\n",
       "      <td>Ideal</td>\n",
       "      <td>G</td>\n",
       "      <td>IF</td>\n",
       "      <td>61.7</td>\n",
       "      <td>55</td>\n",
       "      <td>18806</td>\n",
       "      <td>7.37</td>\n",
       "      <td>7.41</td>\n",
       "      <td>4.56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27747</th>\n",
       "      <td>2.07</td>\n",
       "      <td>Ideal</td>\n",
       "      <td>G</td>\n",
       "      <td>SI2</td>\n",
       "      <td>62.5</td>\n",
       "      <td>55</td>\n",
       "      <td>18804</td>\n",
       "      <td>8.20</td>\n",
       "      <td>8.13</td>\n",
       "      <td>5.11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27745</th>\n",
       "      <td>2.29</td>\n",
       "      <td>Premium</td>\n",
       "      <td>I</td>\n",
       "      <td>SI1</td>\n",
       "      <td>61.8</td>\n",
       "      <td>59</td>\n",
       "      <td>18797</td>\n",
       "      <td>8.52</td>\n",
       "      <td>8.45</td>\n",
       "      <td>5.24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27744</th>\n",
       "      <td>2.00</td>\n",
       "      <td>Premium</td>\n",
       "      <td>I</td>\n",
       "      <td>VS1</td>\n",
       "      <td>60.8</td>\n",
       "      <td>59</td>\n",
       "      <td>18795</td>\n",
       "      <td>8.13</td>\n",
       "      <td>8.02</td>\n",
       "      <td>4.91</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27743</th>\n",
       "      <td>2.04</td>\n",
       "      <td>Premium</td>\n",
       "      <td>H</td>\n",
       "      <td>SI1</td>\n",
       "      <td>58.1</td>\n",
       "      <td>60</td>\n",
       "      <td>18795</td>\n",
       "      <td>8.37</td>\n",
       "      <td>8.28</td>\n",
       "      <td>4.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27742</th>\n",
       "      <td>2.15</td>\n",
       "      <td>Ideal</td>\n",
       "      <td>G</td>\n",
       "      <td>SI2</td>\n",
       "      <td>62.6</td>\n",
       "      <td>54</td>\n",
       "      <td>18791</td>\n",
       "      <td>8.29</td>\n",
       "      <td>8.35</td>\n",
       "      <td>5.21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27741</th>\n",
       "      <td>1.71</td>\n",
       "      <td>Premium</td>\n",
       "      <td>F</td>\n",
       "      <td>VS2</td>\n",
       "      <td>62.3</td>\n",
       "      <td>59</td>\n",
       "      <td>18791</td>\n",
       "      <td>7.57</td>\n",
       "      <td>7.53</td>\n",
       "      <td>4.70</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27739</th>\n",
       "      <td>2.05</td>\n",
       "      <td>Ideal</td>\n",
       "      <td>G</td>\n",
       "      <td>SI1</td>\n",
       "      <td>61.9</td>\n",
       "      <td>57</td>\n",
       "      <td>18787</td>\n",
       "      <td>8.10</td>\n",
       "      <td>8.16</td>\n",
       "      <td>5.03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27738</th>\n",
       "      <td>2.05</td>\n",
       "      <td>Premium</td>\n",
       "      <td>F</td>\n",
       "      <td>SI2</td>\n",
       "      <td>60.2</td>\n",
       "      <td>59</td>\n",
       "      <td>18784</td>\n",
       "      <td>8.28</td>\n",
       "      <td>8.33</td>\n",
       "      <td>5.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       carat      cut color clarity  depth  table  price     x     y     z\n",
       "27750   2.29  Premium     I     VS2   60.8     60  18823  8.50  8.47  5.16\n",
       "27748   1.51    Ideal     G      IF   61.7     55  18806  7.37  7.41  4.56\n",
       "27747   2.07    Ideal     G     SI2   62.5     55  18804  8.20  8.13  5.11\n",
       "27745   2.29  Premium     I     SI1   61.8     59  18797  8.52  8.45  5.24\n",
       "27744   2.00  Premium     I     VS1   60.8     59  18795  8.13  8.02  4.91\n",
       "27743   2.04  Premium     H     SI1   58.1     60  18795  8.37  8.28  4.84\n",
       "27742   2.15    Ideal     G     SI2   62.6     54  18791  8.29  8.35  5.21\n",
       "27741   1.71  Premium     F     VS2   62.3     59  18791  7.57  7.53  4.70\n",
       "27739   2.05    Ideal     G     SI1   61.9     57  18787  8.10  8.16  5.03\n",
       "27738   2.05  Premium     F     SI2   60.2     59  18784  8.28  8.33  5.00"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In which I do a bunch of cool Pandas things without a real goal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>...</th>\n",
       "      <th>53931</th>\n",
       "      <th>53932</th>\n",
       "      <th>53933</th>\n",
       "      <th>53934</th>\n",
       "      <th>53935</th>\n",
       "      <th>53936</th>\n",
       "      <th>53937</th>\n",
       "      <th>53938</th>\n",
       "      <th>53939</th>\n",
       "      <th>53940</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>carat</th>\n",
       "      <td>0.23</td>\n",
       "      <td>0.21</td>\n",
       "      <td>0.23</td>\n",
       "      <td>0.29</td>\n",
       "      <td>0.31</td>\n",
       "      <td>0.24</td>\n",
       "      <td>0.24</td>\n",
       "      <td>0.26</td>\n",
       "      <td>0.22</td>\n",
       "      <td>0.23</td>\n",
       "      <td>...</td>\n",
       "      <td>0.71</td>\n",
       "      <td>0.71</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.86</td>\n",
       "      <td>0.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cut</th>\n",
       "      <td>Ideal</td>\n",
       "      <td>Premium</td>\n",
       "      <td>Good</td>\n",
       "      <td>Premium</td>\n",
       "      <td>Good</td>\n",
       "      <td>Very Good</td>\n",
       "      <td>Very Good</td>\n",
       "      <td>Very Good</td>\n",
       "      <td>Fair</td>\n",
       "      <td>Very Good</td>\n",
       "      <td>...</td>\n",
       "      <td>Premium</td>\n",
       "      <td>Premium</td>\n",
       "      <td>Very Good</td>\n",
       "      <td>Very Good</td>\n",
       "      <td>Premium</td>\n",
       "      <td>Ideal</td>\n",
       "      <td>Good</td>\n",
       "      <td>Very Good</td>\n",
       "      <td>Premium</td>\n",
       "      <td>Ideal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>color</th>\n",
       "      <td>E</td>\n",
       "      <td>E</td>\n",
       "      <td>E</td>\n",
       "      <td>I</td>\n",
       "      <td>J</td>\n",
       "      <td>J</td>\n",
       "      <td>I</td>\n",
       "      <td>H</td>\n",
       "      <td>E</td>\n",
       "      <td>H</td>\n",
       "      <td>...</td>\n",
       "      <td>E</td>\n",
       "      <td>F</td>\n",
       "      <td>E</td>\n",
       "      <td>E</td>\n",
       "      <td>D</td>\n",
       "      <td>D</td>\n",
       "      <td>D</td>\n",
       "      <td>D</td>\n",
       "      <td>H</td>\n",
       "      <td>D</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>clarity</th>\n",
       "      <td>SI2</td>\n",
       "      <td>SI1</td>\n",
       "      <td>VS1</td>\n",
       "      <td>VS2</td>\n",
       "      <td>SI2</td>\n",
       "      <td>VVS2</td>\n",
       "      <td>VVS1</td>\n",
       "      <td>SI1</td>\n",
       "      <td>VS2</td>\n",
       "      <td>VS1</td>\n",
       "      <td>...</td>\n",
       "      <td>SI1</td>\n",
       "      <td>SI1</td>\n",
       "      <td>VS2</td>\n",
       "      <td>VS2</td>\n",
       "      <td>SI1</td>\n",
       "      <td>SI1</td>\n",
       "      <td>SI1</td>\n",
       "      <td>SI1</td>\n",
       "      <td>SI2</td>\n",
       "      <td>SI2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>depth</th>\n",
       "      <td>61.5</td>\n",
       "      <td>59.8</td>\n",
       "      <td>56.9</td>\n",
       "      <td>62.4</td>\n",
       "      <td>63.3</td>\n",
       "      <td>62.8</td>\n",
       "      <td>62.3</td>\n",
       "      <td>61.9</td>\n",
       "      <td>65.1</td>\n",
       "      <td>59.4</td>\n",
       "      <td>...</td>\n",
       "      <td>60.5</td>\n",
       "      <td>59.8</td>\n",
       "      <td>60.5</td>\n",
       "      <td>61.2</td>\n",
       "      <td>62.7</td>\n",
       "      <td>60.8</td>\n",
       "      <td>63.1</td>\n",
       "      <td>62.8</td>\n",
       "      <td>61</td>\n",
       "      <td>62.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>table</th>\n",
       "      <td>55</td>\n",
       "      <td>61</td>\n",
       "      <td>65</td>\n",
       "      <td>58</td>\n",
       "      <td>58</td>\n",
       "      <td>57</td>\n",
       "      <td>57</td>\n",
       "      <td>55</td>\n",
       "      <td>61</td>\n",
       "      <td>61</td>\n",
       "      <td>...</td>\n",
       "      <td>55</td>\n",
       "      <td>62</td>\n",
       "      <td>59</td>\n",
       "      <td>59</td>\n",
       "      <td>59</td>\n",
       "      <td>57</td>\n",
       "      <td>55</td>\n",
       "      <td>60</td>\n",
       "      <td>58</td>\n",
       "      <td>55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>price</th>\n",
       "      <td>326</td>\n",
       "      <td>326</td>\n",
       "      <td>327</td>\n",
       "      <td>334</td>\n",
       "      <td>335</td>\n",
       "      <td>336</td>\n",
       "      <td>336</td>\n",
       "      <td>337</td>\n",
       "      <td>337</td>\n",
       "      <td>338</td>\n",
       "      <td>...</td>\n",
       "      <td>2756</td>\n",
       "      <td>2756</td>\n",
       "      <td>2757</td>\n",
       "      <td>2757</td>\n",
       "      <td>2757</td>\n",
       "      <td>2757</td>\n",
       "      <td>2757</td>\n",
       "      <td>2757</td>\n",
       "      <td>2757</td>\n",
       "      <td>2757</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>x</th>\n",
       "      <td>3.95</td>\n",
       "      <td>3.89</td>\n",
       "      <td>4.05</td>\n",
       "      <td>4.2</td>\n",
       "      <td>4.34</td>\n",
       "      <td>3.94</td>\n",
       "      <td>3.95</td>\n",
       "      <td>4.07</td>\n",
       "      <td>3.87</td>\n",
       "      <td>4</td>\n",
       "      <td>...</td>\n",
       "      <td>5.79</td>\n",
       "      <td>5.74</td>\n",
       "      <td>5.71</td>\n",
       "      <td>5.69</td>\n",
       "      <td>5.69</td>\n",
       "      <td>5.75</td>\n",
       "      <td>5.69</td>\n",
       "      <td>5.66</td>\n",
       "      <td>6.15</td>\n",
       "      <td>5.83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>y</th>\n",
       "      <td>3.98</td>\n",
       "      <td>3.84</td>\n",
       "      <td>4.07</td>\n",
       "      <td>4.23</td>\n",
       "      <td>4.35</td>\n",
       "      <td>3.96</td>\n",
       "      <td>3.98</td>\n",
       "      <td>4.11</td>\n",
       "      <td>3.78</td>\n",
       "      <td>4.05</td>\n",
       "      <td>...</td>\n",
       "      <td>5.74</td>\n",
       "      <td>5.73</td>\n",
       "      <td>5.76</td>\n",
       "      <td>5.72</td>\n",
       "      <td>5.73</td>\n",
       "      <td>5.76</td>\n",
       "      <td>5.75</td>\n",
       "      <td>5.68</td>\n",
       "      <td>6.12</td>\n",
       "      <td>5.87</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>z</th>\n",
       "      <td>2.43</td>\n",
       "      <td>2.31</td>\n",
       "      <td>2.31</td>\n",
       "      <td>2.63</td>\n",
       "      <td>2.75</td>\n",
       "      <td>2.48</td>\n",
       "      <td>2.47</td>\n",
       "      <td>2.53</td>\n",
       "      <td>2.49</td>\n",
       "      <td>2.39</td>\n",
       "      <td>...</td>\n",
       "      <td>3.49</td>\n",
       "      <td>3.43</td>\n",
       "      <td>3.47</td>\n",
       "      <td>3.49</td>\n",
       "      <td>3.58</td>\n",
       "      <td>3.5</td>\n",
       "      <td>3.61</td>\n",
       "      <td>3.56</td>\n",
       "      <td>3.74</td>\n",
       "      <td>3.64</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10 rows × 53940 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         1        2     3        4     5          6          7          8      \\\n",
       "carat     0.23     0.21  0.23     0.29  0.31       0.24       0.24       0.26   \n",
       "cut      Ideal  Premium  Good  Premium  Good  Very Good  Very Good  Very Good   \n",
       "color        E        E     E        I     J          J          I          H   \n",
       "clarity    SI2      SI1   VS1      VS2   SI2       VVS2       VVS1        SI1   \n",
       "depth     61.5     59.8  56.9     62.4  63.3       62.8       62.3       61.9   \n",
       "table       55       61    65       58    58         57         57         55   \n",
       "price      326      326   327      334   335        336        336        337   \n",
       "x         3.95     3.89  4.05      4.2  4.34       3.94       3.95       4.07   \n",
       "y         3.98     3.84  4.07     4.23  4.35       3.96       3.98       4.11   \n",
       "z         2.43     2.31  2.31     2.63  2.75       2.48       2.47       2.53   \n",
       "\n",
       "        9          10     ...      53931    53932      53933      53934  \\\n",
       "carat    0.22       0.23  ...       0.71     0.71        0.7        0.7   \n",
       "cut      Fair  Very Good  ...    Premium  Premium  Very Good  Very Good   \n",
       "color       E          H  ...          E        F          E          E   \n",
       "clarity   VS2        VS1  ...        SI1      SI1        VS2        VS2   \n",
       "depth    65.1       59.4  ...       60.5     59.8       60.5       61.2   \n",
       "table      61         61  ...         55       62         59         59   \n",
       "price     337        338  ...       2756     2756       2757       2757   \n",
       "x        3.87          4  ...       5.79     5.74       5.71       5.69   \n",
       "y        3.78       4.05  ...       5.74     5.73       5.76       5.72   \n",
       "z        2.49       2.39  ...       3.49     3.43       3.47       3.49   \n",
       "\n",
       "           53935  53936 53937      53938    53939  53940  \n",
       "carat       0.72   0.72  0.72        0.7     0.86   0.75  \n",
       "cut      Premium  Ideal  Good  Very Good  Premium  Ideal  \n",
       "color          D      D     D          D        H      D  \n",
       "clarity      SI1    SI1   SI1        SI1      SI2    SI2  \n",
       "depth       62.7   60.8  63.1       62.8       61   62.2  \n",
       "table         59     57    55         60       58     55  \n",
       "price       2757   2757  2757       2757     2757   2757  \n",
       "x           5.69   5.75  5.69       5.66     6.15   5.83  \n",
       "y           5.73   5.76  5.75       5.68     6.12   5.87  \n",
       "z           3.58    3.5  3.61       3.56     3.74   3.64  \n",
       "\n",
       "[10 rows x 53940 columns]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# I can Transpose things!\n",
    "df.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>carat</th>\n",
       "      <th>cut</th>\n",
       "      <th>color</th>\n",
       "      <th>clarity</th>\n",
       "      <th>depth</th>\n",
       "      <th>table</th>\n",
       "      <th>price</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>27750</th>\n",
       "      <td>2.29</td>\n",
       "      <td>Premium</td>\n",
       "      <td>I</td>\n",
       "      <td>VS2</td>\n",
       "      <td>60.8</td>\n",
       "      <td>60</td>\n",
       "      <td>18823</td>\n",
       "      <td>8.50</td>\n",
       "      <td>8.47</td>\n",
       "      <td>5.16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27749</th>\n",
       "      <td>2.00</td>\n",
       "      <td>Very Good</td>\n",
       "      <td>G</td>\n",
       "      <td>SI1</td>\n",
       "      <td>63.5</td>\n",
       "      <td>56</td>\n",
       "      <td>18818</td>\n",
       "      <td>7.90</td>\n",
       "      <td>7.97</td>\n",
       "      <td>5.04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27748</th>\n",
       "      <td>1.51</td>\n",
       "      <td>Ideal</td>\n",
       "      <td>G</td>\n",
       "      <td>IF</td>\n",
       "      <td>61.7</td>\n",
       "      <td>55</td>\n",
       "      <td>18806</td>\n",
       "      <td>7.37</td>\n",
       "      <td>7.41</td>\n",
       "      <td>4.56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27747</th>\n",
       "      <td>2.07</td>\n",
       "      <td>Ideal</td>\n",
       "      <td>G</td>\n",
       "      <td>SI2</td>\n",
       "      <td>62.5</td>\n",
       "      <td>55</td>\n",
       "      <td>18804</td>\n",
       "      <td>8.20</td>\n",
       "      <td>8.13</td>\n",
       "      <td>5.11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27746</th>\n",
       "      <td>2.00</td>\n",
       "      <td>Very Good</td>\n",
       "      <td>H</td>\n",
       "      <td>SI1</td>\n",
       "      <td>62.8</td>\n",
       "      <td>57</td>\n",
       "      <td>18803</td>\n",
       "      <td>7.95</td>\n",
       "      <td>8.00</td>\n",
       "      <td>5.01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       carat        cut color clarity  depth  table  price     x     y     z\n",
       "27750   2.29    Premium     I     VS2   60.8     60  18823  8.50  8.47  5.16\n",
       "27749   2.00  Very Good     G     SI1   63.5     56  18818  7.90  7.97  5.04\n",
       "27748   1.51      Ideal     G      IF   61.7     55  18806  7.37  7.41  4.56\n",
       "27747   2.07      Ideal     G     SI2   62.5     55  18804  8.20  8.13  5.11\n",
       "27746   2.00  Very Good     H     SI1   62.8     57  18803  7.95  8.00  5.01"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# SORT ALL THE THINGS!\n",
    "df.sort(['price', 'carat'], ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>float_col</th>\n",
       "      <th>int_col</th>\n",
       "      <th>str_col</th>\n",
       "      <th>time_col</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>1.5</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>2015-01-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>2.5</td>\n",
       "      <td>2</td>\n",
       "      <td>b</td>\n",
       "      <td>2015-01-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>3.5</td>\n",
       "      <td>3</td>\n",
       "      <td>c</td>\n",
       "      <td>2015-01-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>4.5</td>\n",
       "      <td>4</td>\n",
       "      <td>d</td>\n",
       "      <td>2015-01-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>5.5</td>\n",
       "      <td>5</td>\n",
       "      <td>e</td>\n",
       "      <td>2015-01-05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>6.5</td>\n",
       "      <td>6</td>\n",
       "      <td>f</td>\n",
       "      <td>2015-01-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>7.5</td>\n",
       "      <td>7</td>\n",
       "      <td>g</td>\n",
       "      <td>2015-01-07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>8.5</td>\n",
       "      <td>8</td>\n",
       "      <td>h</td>\n",
       "      <td>2015-01-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>9.5</td>\n",
       "      <td>9</td>\n",
       "      <td>i</td>\n",
       "      <td>2015-01-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>10.5</td>\n",
       "      <td>10</td>\n",
       "      <td>j</td>\n",
       "      <td>2015-01-10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   float_col  int_col str_col    time_col\n",
       "a        1.5        1       a  2015-01-01\n",
       "b        2.5        2       b  2015-01-02\n",
       "c        3.5        3       c  2015-01-03\n",
       "a        4.5        4       d  2015-01-04\n",
       "b        5.5        5       e  2015-01-05\n",
       "c        6.5        6       f  2015-01-06\n",
       "a        7.5        7       g  2015-01-07\n",
       "b        8.5        8       h  2015-01-08\n",
       "c        9.5        9       i  2015-01-09\n",
       "a       10.5       10       j  2015-01-10"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Lets use some fake data to show off some stuff:\n",
    "simple_data_1 = {\"int_col\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
    "                 \"str_col\": [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\"],\n",
    "                 \"float_col\": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5],\n",
    "                 \"time_col\": [\"2015-01-01\", \"2015-01-02\", \"2015-01-03\", \"2015-01-04\", \"2015-01-05\",\n",
    "                              \"2015-01-06\", \"2015-01-07\", \"2015-01-08\", \"2015-01-09\", \"2015-01-10\"]}\n",
    "my_index = [\"a\", \"b\", \"c\", \"a\", \"b\", \"c\", \"a\", \"b\", \"c\", \"a\"]\n",
    "df = pd.DataFrame(simple_data_1, index=my_index)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>float_col</th>\n",
       "      <th>int_col</th>\n",
       "      <th>str_col</th>\n",
       "      <th>time_col</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>1.5</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>2015-01-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>4.5</td>\n",
       "      <td>4</td>\n",
       "      <td>d</td>\n",
       "      <td>2015-01-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>7.5</td>\n",
       "      <td>7</td>\n",
       "      <td>g</td>\n",
       "      <td>2015-01-07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>10.5</td>\n",
       "      <td>10</td>\n",
       "      <td>j</td>\n",
       "      <td>2015-01-10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   float_col  int_col str_col    time_col\n",
       "a        1.5        1       a  2015-01-01\n",
       "a        4.5        4       d  2015-01-04\n",
       "a        7.5        7       g  2015-01-07\n",
       "a       10.5       10       j  2015-01-10"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# DataFrames have indices that can be gotten by label or position\n",
    "df.loc['a']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "float_col           6.5\n",
       "int_col               6\n",
       "str_col               f\n",
       "time_col     2015-01-06\n",
       "Name: c, dtype: object"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.iloc[5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>float_col</th>\n",
       "      <th>int_col</th>\n",
       "      <th>str_col</th>\n",
       "      <th>time_col</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>3.5</td>\n",
       "      <td>3</td>\n",
       "      <td>c</td>\n",
       "      <td>2015-01-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>4.5</td>\n",
       "      <td>4</td>\n",
       "      <td>d</td>\n",
       "      <td>2015-01-04</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   float_col  int_col str_col    time_col\n",
       "c        3.5        3       c  2015-01-03\n",
       "a        4.5        4       d  2015-01-04"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[2:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>float_col</th>\n",
       "      <th>int_col</th>\n",
       "      <th>str_col</th>\n",
       "      <th>time_col</th>\n",
       "      <th>new_col</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>1.5</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>2015-01-01</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>2.5</td>\n",
       "      <td>2</td>\n",
       "      <td>b</td>\n",
       "      <td>2015-01-02</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>3.5</td>\n",
       "      <td>3</td>\n",
       "      <td>c</td>\n",
       "      <td>2015-01-03</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>4.5</td>\n",
       "      <td>4</td>\n",
       "      <td>d</td>\n",
       "      <td>2015-01-04</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>5.5</td>\n",
       "      <td>5</td>\n",
       "      <td>e</td>\n",
       "      <td>2015-01-05</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>6.5</td>\n",
       "      <td>6</td>\n",
       "      <td>f</td>\n",
       "      <td>2015-01-06</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>7.5</td>\n",
       "      <td>7</td>\n",
       "      <td>g</td>\n",
       "      <td>2015-01-07</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>8.5</td>\n",
       "      <td>8</td>\n",
       "      <td>h</td>\n",
       "      <td>2015-01-08</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>9.5</td>\n",
       "      <td>9</td>\n",
       "      <td>i</td>\n",
       "      <td>2015-01-09</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>10.5</td>\n",
       "      <td>10</td>\n",
       "      <td>j</td>\n",
       "      <td>2015-01-10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   float_col  int_col str_col    time_col  new_col\n",
       "a        1.5        1       a  2015-01-01      NaN\n",
       "b        2.5        2       b  2015-01-02      NaN\n",
       "c        3.5        3       c  2015-01-03        1\n",
       "a        4.5        4       d  2015-01-04        2\n",
       "b        5.5        5       e  2015-01-05      NaN\n",
       "c        6.5        6       f  2015-01-06        4\n",
       "a        7.5        7       g  2015-01-07        5\n",
       "b        8.5        8       h  2015-01-08        9\n",
       "c        9.5        9       i  2015-01-09      NaN\n",
       "a       10.5       10       j  2015-01-10       10"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# New Column! With Missing Data!\n",
    "df['new_col'] = [np.nan, np.nan, 1.0, 2.0, np.nan, 4.0, 5.0, 9.0, np.nan, 10.0]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>float_col</th>\n",
       "      <th>int_col</th>\n",
       "      <th>str_col</th>\n",
       "      <th>time_col</th>\n",
       "      <th>new_col</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>3.5</td>\n",
       "      <td>3</td>\n",
       "      <td>c</td>\n",
       "      <td>2015-01-03</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>4.5</td>\n",
       "      <td>4</td>\n",
       "      <td>d</td>\n",
       "      <td>2015-01-04</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>6.5</td>\n",
       "      <td>6</td>\n",
       "      <td>f</td>\n",
       "      <td>2015-01-06</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>7.5</td>\n",
       "      <td>7</td>\n",
       "      <td>g</td>\n",
       "      <td>2015-01-07</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>8.5</td>\n",
       "      <td>8</td>\n",
       "      <td>h</td>\n",
       "      <td>2015-01-08</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>10.5</td>\n",
       "      <td>10</td>\n",
       "      <td>j</td>\n",
       "      <td>2015-01-10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   float_col  int_col str_col    time_col  new_col\n",
       "c        3.5        3       c  2015-01-03        1\n",
       "a        4.5        4       d  2015-01-04        2\n",
       "c        6.5        6       f  2015-01-06        4\n",
       "a        7.5        7       g  2015-01-07        5\n",
       "b        8.5        8       h  2015-01-08        9\n",
       "a       10.5       10       j  2015-01-10       10"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Removing missing data!\n",
    "df.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>float_col</th>\n",
       "      <th>int_col</th>\n",
       "      <th>str_col</th>\n",
       "      <th>time_col</th>\n",
       "      <th>new_col</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>1.5</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>2015-01-01</td>\n",
       "      <td>FOO!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>2.5</td>\n",
       "      <td>2</td>\n",
       "      <td>b</td>\n",
       "      <td>2015-01-02</td>\n",
       "      <td>FOO!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>3.5</td>\n",
       "      <td>3</td>\n",
       "      <td>c</td>\n",
       "      <td>2015-01-03</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>4.5</td>\n",
       "      <td>4</td>\n",
       "      <td>d</td>\n",
       "      <td>2015-01-04</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>5.5</td>\n",
       "      <td>5</td>\n",
       "      <td>e</td>\n",
       "      <td>2015-01-05</td>\n",
       "      <td>FOO!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>6.5</td>\n",
       "      <td>6</td>\n",
       "      <td>f</td>\n",
       "      <td>2015-01-06</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>7.5</td>\n",
       "      <td>7</td>\n",
       "      <td>g</td>\n",
       "      <td>2015-01-07</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>8.5</td>\n",
       "      <td>8</td>\n",
       "      <td>h</td>\n",
       "      <td>2015-01-08</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>9.5</td>\n",
       "      <td>9</td>\n",
       "      <td>i</td>\n",
       "      <td>2015-01-09</td>\n",
       "      <td>FOO!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>10.5</td>\n",
       "      <td>10</td>\n",
       "      <td>j</td>\n",
       "      <td>2015-01-10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   float_col  int_col str_col    time_col new_col\n",
       "a        1.5        1       a  2015-01-01    FOO!\n",
       "b        2.5        2       b  2015-01-02    FOO!\n",
       "c        3.5        3       c  2015-01-03       1\n",
       "a        4.5        4       d  2015-01-04       2\n",
       "b        5.5        5       e  2015-01-05    FOO!\n",
       "c        6.5        6       f  2015-01-06       4\n",
       "a        7.5        7       g  2015-01-07       5\n",
       "b        8.5        8       h  2015-01-08       9\n",
       "c        9.5        9       i  2015-01-09    FOO!\n",
       "a       10.5       10       j  2015-01-10      10"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fill missing data!\n",
    "df.fillna(\"FOO!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>float_col</th>\n",
       "      <th>int_col</th>\n",
       "      <th>str_col</th>\n",
       "      <th>time_col</th>\n",
       "      <th>new_col</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>1.5</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>2015-01-01</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>2.5</td>\n",
       "      <td>2</td>\n",
       "      <td>b</td>\n",
       "      <td>2015-01-02</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>3.5</td>\n",
       "      <td>3</td>\n",
       "      <td>c</td>\n",
       "      <td>2015-01-03</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>4.5</td>\n",
       "      <td>4</td>\n",
       "      <td>d</td>\n",
       "      <td>2015-01-04</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>5.5</td>\n",
       "      <td>5</td>\n",
       "      <td>e</td>\n",
       "      <td>2015-01-05</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>6.5</td>\n",
       "      <td>6</td>\n",
       "      <td>f</td>\n",
       "      <td>2015-01-06</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>7.5</td>\n",
       "      <td>7</td>\n",
       "      <td>g</td>\n",
       "      <td>2015-01-07</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>8.5</td>\n",
       "      <td>8</td>\n",
       "      <td>h</td>\n",
       "      <td>2015-01-08</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>9.5</td>\n",
       "      <td>9</td>\n",
       "      <td>i</td>\n",
       "      <td>2015-01-09</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>10.5</td>\n",
       "      <td>10</td>\n",
       "      <td>j</td>\n",
       "      <td>2015-01-10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   float_col  int_col str_col    time_col  new_col\n",
       "a        1.5        1       a  2015-01-01        1\n",
       "b        2.5        2       b  2015-01-02        1\n",
       "c        3.5        3       c  2015-01-03        1\n",
       "a        4.5        4       d  2015-01-04        2\n",
       "b        5.5        5       e  2015-01-05        4\n",
       "c        6.5        6       f  2015-01-06        4\n",
       "a        7.5        7       g  2015-01-07        5\n",
       "b        8.5        8       h  2015-01-08        9\n",
       "c        9.5        9       i  2015-01-09       10\n",
       "a       10.5       10       j  2015-01-10       10"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Backfill missing data!\n",
    "df.fillna(method='bfill')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>float_col</th>\n",
       "      <th>int_col</th>\n",
       "      <th>str_col</th>\n",
       "      <th>time_col</th>\n",
       "      <th>new_col</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>1.5</td>\n",
       "      <td>1</td>\n",
       "      <td>FOO</td>\n",
       "      <td>2015-01-01</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>2.5</td>\n",
       "      <td>2</td>\n",
       "      <td>FoO</td>\n",
       "      <td>2015-01-02</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>3.5</td>\n",
       "      <td>3</td>\n",
       "      <td>Foo</td>\n",
       "      <td>2015-01-03</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>4.5</td>\n",
       "      <td>4</td>\n",
       "      <td>Fo o</td>\n",
       "      <td>2015-01-04</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>5.5</td>\n",
       "      <td>5</td>\n",
       "      <td>FOO</td>\n",
       "      <td>2015-01-05</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>6.5</td>\n",
       "      <td>6</td>\n",
       "      <td>fOO</td>\n",
       "      <td>2015-01-06</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>7.5</td>\n",
       "      <td>7</td>\n",
       "      <td>fOo</td>\n",
       "      <td>2015-01-07</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>8.5</td>\n",
       "      <td>8</td>\n",
       "      <td>FoO</td>\n",
       "      <td>2015-01-08</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>9.5</td>\n",
       "      <td>9</td>\n",
       "      <td>foO</td>\n",
       "      <td>2015-01-09</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>10.5</td>\n",
       "      <td>10</td>\n",
       "      <td>Foo</td>\n",
       "      <td>2015-01-10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   float_col  int_col str_col    time_col  new_col\n",
       "a        1.5        1     FOO  2015-01-01      NaN\n",
       "b        2.5        2     FoO  2015-01-02      NaN\n",
       "c        3.5        3     Foo  2015-01-03        1\n",
       "a        4.5        4    Fo o  2015-01-04        2\n",
       "b        5.5        5     FOO  2015-01-05      NaN\n",
       "c        6.5        6     fOO  2015-01-06        4\n",
       "a        7.5        7     fOo  2015-01-07        5\n",
       "b        8.5        8     FoO  2015-01-08        9\n",
       "c        9.5        9     foO  2015-01-09      NaN\n",
       "a       10.5       10  Foo     2015-01-10       10"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vectorized string methods!\n",
    "df['str_col'] = [\"FOO\", \"FoO\", \"Foo\", \"Fo o\", \"FOO\", \"fOO\", \"fOo\", \"FoO\", \"foO\", \"Foo   \"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    foo\n",
       "b    foo\n",
       "c    foo\n",
       "a    foo\n",
       "b    foo\n",
       "c    foo\n",
       "a    foo\n",
       "b    foo\n",
       "c    foo\n",
       "a    foo\n",
       "Name: str_col, dtype: object"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['str_col'].str.lower().str.replace(' ', '')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>lval</th>\n",
       "      <th>rval</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>foo</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>bar</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   key  lval  rval\n",
       "0  foo     1     4\n",
       "1  bar     2     5"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Database style joins!!\n",
    "left = pd.DataFrame({'key': ['foo', 'bar', 'fizz'], 'lval': [1, 2, 3]})\n",
    "right = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'qux'], \n",
    "                      'rval': [4, 5, 6, 7]})\n",
    "left.merge(right, how='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>rval</th>\n",
       "      <th>lval</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>foo</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>bar</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>baz</td>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>qux</td>\n",
       "      <td>7</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>fizz</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    key  rval  lval\n",
       "0   foo     4     1\n",
       "1   bar     5     2\n",
       "2   baz     6   NaN\n",
       "3   qux     7   NaN\n",
       "4  fizz   NaN     3"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "right.merge(left, how='outer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>lval</th>\n",
       "      <th>rval</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>foo</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>bar</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>fizz</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    key  lval  rval\n",
       "0   foo     1     4\n",
       "1   bar     2     5\n",
       "2  fizz     3   NaN"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "left.merge(right, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatetimeIndex(['2015-03-25 00:00:00', '2015-03-25 01:00:00',\n",
       "               '2015-03-25 02:00:00', '2015-03-25 03:00:00',\n",
       "               '2015-03-25 04:00:00', '2015-03-25 05:00:00',\n",
       "               '2015-03-25 06:00:00', '2015-03-25 07:00:00',\n",
       "               '2015-03-25 08:00:00', '2015-03-25 09:00:00', \n",
       "               ...\n",
       "               '2015-03-30 20:00:00', '2015-03-30 21:00:00',\n",
       "               '2015-03-30 22:00:00', '2015-03-30 23:00:00',\n",
       "               '2015-03-31 00:00:00', '2015-03-31 01:00:00',\n",
       "               '2015-03-31 02:00:00', '2015-03-31 03:00:00',\n",
       "               '2015-03-31 04:00:00', '2015-03-31 05:00:00'],\n",
       "              dtype='datetime64[ns]', length=150, freq='H', tz=None)"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Lets make some random timeseries data\n",
    "dates = pd.date_range('2015-03-25', periods=150, freq='H')\n",
    "dates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Numeric</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2015-03-25 00:00:00</th>\n",
       "      <td>451</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-25 01:00:00</th>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-25 02:00:00</th>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-25 03:00:00</th>\n",
       "      <td>113</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-25 04:00:00</th>\n",
       "      <td>283</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     Numeric\n",
       "2015-03-25 00:00:00      451\n",
       "2015-03-25 01:00:00       10\n",
       "2015-03-25 02:00:00       18\n",
       "2015-03-25 03:00:00      113\n",
       "2015-03-25 04:00:00      283"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "time_df = pd.DataFrame(np.random.randint(0, 500, 150), index=dates, columns=[\"Numeric\"])\n",
    "time_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Numeric</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2015-03-25</th>\n",
       "      <td>211.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-26</th>\n",
       "      <td>255.041667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-27</th>\n",
       "      <td>241.625000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-28</th>\n",
       "      <td>291.791667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-29</th>\n",
       "      <td>227.625000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-30</th>\n",
       "      <td>212.125000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-31</th>\n",
       "      <td>307.500000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               Numeric\n",
       "2015-03-25  211.166667\n",
       "2015-03-26  255.041667\n",
       "2015-03-27  241.625000\n",
       "2015-03-28  291.791667\n",
       "2015-03-29  227.625000\n",
       "2015-03-30  212.125000\n",
       "2015-03-31  307.500000"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# RESAMPLE!\n",
    "time_df.resample('D', how='mean')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatetimeIndex(['2014-03-25 23:02:00', '2014-03-25 23:01:00',\n",
       "               '2015-03-25 00:00:00'],\n",
       "              dtype='datetime64[ns]', freq=None, tz=None)"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert weird date formats!\n",
    "the_worst = ['3/25/2014 23:02:00', '2014-03-25 23:01:00', 'March 25 2015']\n",
    "pd.to_datetime(the_worst)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Numeric</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2015-03-26 00:00:00</th>\n",
       "      <td>451</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-26 01:00:00</th>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-26 02:00:00</th>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-26 03:00:00</th>\n",
       "      <td>113</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-26 04:00:00</th>\n",
       "      <td>283</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     Numeric\n",
       "2015-03-26 00:00:00      451\n",
       "2015-03-26 01:00:00       10\n",
       "2015-03-26 02:00:00       18\n",
       "2015-03-26 03:00:00      113\n",
       "2015-03-26 04:00:00      283"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shift dates!\n",
    "time_df.shift(1, freq='D').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Numeric</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2015-03-25</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-30</th>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-04-05</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Numeric\n",
       "2015-03-25        1\n",
       "2015-03-30        9\n",
       "2015-04-05        4"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# What if I have missing dates?\n",
    "missing = pd.to_datetime(['2015-03-25', '2015-03-30', '2015-04-05'])\n",
    "missing_df = pd.DataFrame(np.random.randint(0, 10, 3), index=missing, columns=[\"Numeric\"])\n",
    "missing_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Numeric</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2015-03-25</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-26</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-27</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-28</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-29</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-30</th>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-31</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-04-01</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-04-02</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-04-03</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-04-04</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-04-05</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Numeric\n",
       "2015-03-25        1\n",
       "2015-03-26      NaN\n",
       "2015-03-27      NaN\n",
       "2015-03-28      NaN\n",
       "2015-03-29      NaN\n",
       "2015-03-30        9\n",
       "2015-03-31      NaN\n",
       "2015-04-01      NaN\n",
       "2015-04-02      NaN\n",
       "2015-04-03      NaN\n",
       "2015-04-04      NaN\n",
       "2015-04-05        4"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "missing_df.asfreq('D')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Numeric</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2015-03-25</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-26</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-27</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-28</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-29</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-30</th>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-03-31</th>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-04-01</th>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-04-02</th>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-04-03</th>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-04-04</th>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015-04-05</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Numeric\n",
       "2015-03-25        1\n",
       "2015-03-26        1\n",
       "2015-03-27        1\n",
       "2015-03-28        1\n",
       "2015-03-29        1\n",
       "2015-03-30        9\n",
       "2015-03-31        9\n",
       "2015-04-01        9\n",
       "2015-04-02        9\n",
       "2015-04-03        9\n",
       "2015-04-04        9\n",
       "2015-04-05        4"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "missing_df.asfreq('D', method='pad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#XRAY!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import xray"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (x: 3, y: 4)>\n",
       "array([[  1,   2,   3,   4],\n",
       "       [ 10,  20,  30,  40],\n",
       "       [100, 200, 300, 400]])\n",
       "Coordinates:\n",
       "  * x        (x) |S1 'a' 'b' 'c'\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arr = np.array([[1, 2, 3, 4], \n",
    "                [10, 20, 30, 40],\n",
    "                [100, 200, 300, 400]])\n",
    "dim0_coords = ['a', 'b', 'c']\n",
    "dim1_coords = ['foo', 'bar', 'baz', 'qux']\n",
    "da = xray.DataArray(arr, [('x', dim0_coords), ('y', dim1_coords)])\n",
    "da"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[  1,   2,   3,   4],\n",
       "       [ 10,  20,  30,  40],\n",
       "       [100, 200, 300, 400]])"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Arrays!\n",
    "da.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('x', 'y')"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "da.dims"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Coordinates:\n",
       "  * x        (x) |S1 'a' 'b' 'c'\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "da.coords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (x: 2, y: 4)>\n",
       "array([[ 1,  2,  3,  4],\n",
       "       [10, 20, 30, 40]])\n",
       "Coordinates:\n",
       "  * x        (x) |S1 'a' 'b'\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# But with some Pandas-like powers!\n",
    "\n",
    "# Index by slice\n",
    "da[0:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (y: 4)>\n",
       "array([1, 2, 3, 4])\n",
       "Coordinates:\n",
       "    x        |S1 'a'\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Indexing!\n",
    "da.loc['a']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (x: 3, y: 4)>\n",
       "array([[  1,   2,   3,   4],\n",
       "       [ 10,  20,  30,  40],\n",
       "       [100, 200, 300, 400]])\n",
       "Coordinates:\n",
       "  * x        (x) |S1 'a' 'b' 'c'\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "da.loc['a':'c']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (x: 2, y: 4)>\n",
       "array([[  1,   2,   3,   4],\n",
       "       [100, 200, 300, 400]])\n",
       "Coordinates:\n",
       "  * x        (x) |S1 'a' 'c'\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "da.sel(x=['a', 'c'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (x: 3)>\n",
       "array([  1,  10, 100])\n",
       "Coordinates:\n",
       "  * x        (x) |S1 'a' 'b' 'c'\n",
       "    y        |S3 'foo'"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "da.sel(y='foo')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (y: 4)>\n",
       "array([1, 2, 3, 4])\n",
       "Coordinates:\n",
       "    x        |S1 'a'\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "da.isel(x=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(<xray.DataArray ()>\n",
       " array(92.5), <xray.DataArray ()>\n",
       " array(1110))"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Do numpy stuff\n",
    "np.mean(da), np.sum(da)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (x: 3, y: 4)>\n",
       "array([[ 0.84147098,  0.90929743,  0.14112001, -0.7568025 ],\n",
       "       [-0.54402111,  0.91294525, -0.98803162,  0.74511316],\n",
       "       [-0.50636564, -0.8732973 , -0.99975584, -0.85091936]])\n",
       "Coordinates:\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'\n",
       "  * x        (x) |S1 'a' 'b' 'c'"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sin(da)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (x: 3, y: 4)>\n",
       "array([[101, 102, 103, 104],\n",
       "       [110, 120, 130, 140],\n",
       "       [200, 300, 400, 500]])\n",
       "Coordinates:\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'\n",
       "  * x        (x) |S1 'a' 'b' 'c'"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Broadcast!\n",
    "da + 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (y: 4)>\n",
       "array([  37.,   74.,  111.,  148.])\n",
       "Coordinates:\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Do all of the above by label\n",
    "da.mean(dim='x')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (x: 3)>\n",
       "array([  10,  100, 1000])\n",
       "Coordinates:\n",
       "  * x        (x) |S1 'a' 'b' 'c'"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "da.sum(dim='y')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (x: 3, y: 4)>\n",
       "array([[  1,   2,   3,   4],\n",
       "       [ 10,  20,  30,  40],\n",
       "       [100, 200, 300, 400]])\n",
       "Coordinates:\n",
       "  * x        (x) |S1 'a' 'b' 'c'\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "da"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (x: 3)>\n",
       "array([   2.5,   25. ,  250. ])\n",
       "Coordinates:\n",
       "  * x        (x) |S1 'a' 'b' 'c'"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Group stuff\n",
    "da.groupby('x').mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.DataArray (y: 4)>\n",
       "array([111, 222, 333, 444])\n",
       "Coordinates:\n",
       "  * y        (y) |S3 'foo' 'bar' 'baz' 'qux'"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "da.groupby('y').sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[[ 2.51076277,  1.0696244 ,  1.33092369],\n",
       "        [ 1.8807506 , -0.2231198 , -0.24000785]],\n",
       "\n",
       "       [[-0.67361403, -0.21962662,  0.11517111],\n",
       "        [-0.79717326,  0.50143901,  0.08006242]]])"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.randn(2, 2, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.Dataset>\n",
       "Dimensions:  (x: 3, y: 3, z: 2)\n",
       "Coordinates:\n",
       "  * y        (y) |S5 'row_1' 'row_2' 'row_3'\n",
       "  * x        (x) |S5 'col_1' 'col_2' 'col_3'\n",
       "  * z        (z) |S3 '10s' '20s'\n",
       "Data variables:\n",
       "    cube     (z, x, y) int64 10 10 10 10 10 10 10 10 10 20 20 20 20 20 20 20 ..."
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# DataSets\n",
    "x_y = np.array([[1, 1.1, 1.2], [2, 2.1, 2.2], [3, 3.1, 3.2]])\n",
    "z_coords = np.array([\"10s\", \"20s\"])\n",
    "data_cube = np.array([[[10, 10, 10],\n",
    "                       [10, 10, 10],\n",
    "                       [10, 10, 10]],\n",
    "                      [[20, 20, 20],\n",
    "                       [20, 20, 20],\n",
    "                       [20, 20, 20]]])\n",
    "ds = xray.Dataset({\"cube\": ([\"z\", \"x\", \"y\"], data_cube)},\n",
    "                  coords={\"z\": z_coords, \n",
    "                          \"x\": [\"col_1\", \"col_2\", \"col_3\"],\n",
    "                          \"y\": [\"row_1\", \"row_2\", \"row_3\"]})\n",
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.Dataset>\n",
       "Dimensions:  (x: 3, y: 3)\n",
       "Coordinates:\n",
       "  * y        (y) |S5 'row_1' 'row_2' 'row_3'\n",
       "  * x        (x) |S5 'col_1' 'col_2' 'col_3'\n",
       "Data variables:\n",
       "    cube     (x, y) int64 30 30 30 30 30 30 30 30 30"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds.sum(dim=\"z\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.Dataset>\n",
       "Dimensions:  (y: 3, z: 2)\n",
       "Coordinates:\n",
       "  * y        (y) |S5 'row_1' 'row_2' 'row_3'\n",
       "  * z        (z) |S3 '10s' '20s'\n",
       "Data variables:\n",
       "    cube     (z, y) float64 10.0 10.0 10.0 20.0 20.0 20.0"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds.mean(dim=\"x\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<xray.Dataset>\n",
       "Dimensions:  (x: 3)\n",
       "Coordinates:\n",
       "  * x        (x) |S5 'col_1' 'col_2' 'col_3'\n",
       "Data variables:\n",
       "    cube     (x) int64 90 90 90"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds.groupby(\"x\").sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#Blaze\n",
    "\n",
    "For this demo, we're going to focus on a couple queries we used in the Pandas demo:\n",
    "```sql\n",
    "select cut, mean(price)\n",
    "from diamonds\n",
    "groupby cut;\n",
    "\n",
    "select count(carat)\n",
    "from diamonds\n",
    "where price > 1000\n",
    "group by clarity;\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "blaze.expr.expressions.Symbol"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import blaze as bz\n",
    "\n",
    "\n",
    "bz_diamonds = bz.symbol('diamonds', bz.discover(df_diamonds))\n",
    "type(bz_diamonds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mean_price = bz.by(bz_diamonds.cut, price=bz_diamonds.price.mean())\n",
    "clarity_count = bz.by(bz_diamonds[bz_diamonds.price > 1000].clarity,\n",
    "                      count=bz_diamonds.carat.count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cut</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Fair</td>\n",
       "      <td>4358.757764</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Good</td>\n",
       "      <td>3928.864452</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Ideal</td>\n",
       "      <td>3457.541970</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Premium</td>\n",
       "      <td>4584.257704</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Very Good</td>\n",
       "      <td>3981.759891</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         cut        price\n",
       "0       Fair  4358.757764\n",
       "1       Good  3928.864452\n",
       "2      Ideal  3457.541970\n",
       "3    Premium  4584.257704\n",
       "4  Very Good  3981.759891"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We haven't actually computed anything yet!\n",
    "# Let's make Pandas compute it. \n",
    "bz.compute(mean_price, df_diamonds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>clarity</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>I1</td>\n",
       "      <td>741</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>IF</td>\n",
       "      <td>1790</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SI1</td>\n",
       "      <td>13065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>SI2</td>\n",
       "      <td>9194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>VS1</td>\n",
       "      <td>8171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>VS2</td>\n",
       "      <td>12258</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>VVS1</td>\n",
       "      <td>3655</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>VVS2</td>\n",
       "      <td>5066</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  clarity  count\n",
       "0      I1    741\n",
       "1      IF   1790\n",
       "2     SI1  13065\n",
       "3     SI2   9194\n",
       "4     VS1   8171\n",
       "5     VS2  12258\n",
       "6    VVS1   3655\n",
       "7    VVS2   5066"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bz.compute(clarity_count, df_diamonds)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ok, so what? You made Pandas do a thing we already did. \n",
    "\n",
    "### Oh Yeah, what if we want Postgres to compute *the exact same expressions*? \n",
    "\n",
    "## WAT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Blaze/Odo make it easy to move data between containers\n",
    "# Note that we have an empty table already created\n",
    "pg_datasource = bz.odo(df_diamonds, \"postgresql://postgres:postgres@localhost/pydata::diamonds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<sqlalchemy.sql.selectable.Select at 0x11c27e610; Select object>"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Now we're going to use Postgres as our computation engine\n",
    "result = bz.compute(clarity_count, pg_datasource)\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>clarity</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>IF</td>\n",
       "      <td>1790</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>I1</td>\n",
       "      <td>741</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VVS1</td>\n",
       "      <td>3655</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>VS2</td>\n",
       "      <td>12258</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>VS1</td>\n",
       "      <td>8171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>VVS2</td>\n",
       "      <td>5066</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>SI2</td>\n",
       "      <td>9194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>SI1</td>\n",
       "      <td>13065</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  clarity  count\n",
       "0      IF   1790\n",
       "1      I1    741\n",
       "2    VVS1   3655\n",
       "3     VS2  12258\n",
       "4     VS1   8171\n",
       "5    VVS2   5066\n",
       "6     SI2   9194\n",
       "7     SI1  13065"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# I don't want a selectable. I want a DataFrame\n",
    "# odo again\n",
    "bz.odo(bz.compute(clarity_count, pg_datasource), pd.DataFrame)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can use any SQL supported by SQLAlchemy as your computation. It also supports Python lists, Spark DataFrames, MongoDB, Numpy arrays..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# bcolz!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import bcolz\n",
    "\n",
    "dc = bcolz.ctable.fromdataframe(df_diamonds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "carat : carray((53940,), float64)\n",
       "  nbytes: 421.41 KB; cbytes: 468.10 KB; ratio: 0.90\n",
       "  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n",
       "[ 0.23  0.21  0.23 ...,  0.7   0.86  0.75]\n",
       "cut : carray((53940,), |S9)\n",
       "  nbytes: 474.08 KB; cbytes: 315.96 KB; ratio: 1.50\n",
       "  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n",
       "['Ideal' 'Premium' 'Good' ..., 'Very Good' 'Premium' 'Ideal']\n",
       "color : carray((53940,), |S1)\n",
       "  nbytes: 52.68 KB; cbytes: 128.00 KB; ratio: 0.41\n",
       "  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n",
       "['E' 'E' 'E' ..., 'D' 'H' 'D']\n",
       "clarity : carray((53940,), |S4)\n",
       "  nbytes: 210.70 KB; cbytes: 256.00 KB; ratio: 0.82\n",
       "  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n",
       "['SI2' 'SI1' 'VS1' ..., 'SI1' 'SI2' 'SI2']\n",
       "depth : carray((53940,), float64)\n",
       "  nbytes: 421.41 KB; cbytes: 407.65 KB; ratio: 1.03\n",
       "  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n",
       "[ 61.5  59.8  56.9 ...,  62.8  61.   62.2]\n",
       "table : carray((53940,), float64)\n",
       "  nbytes: 421.41 KB; cbytes: 317.15 KB; ratio: 1.33\n",
       "  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n",
       "[ 55.  61.  65. ...,  60.  58.  55.]\n",
       "price : carray((53940,), int64)\n",
       "  nbytes: 421.41 KB; cbytes: 279.99 KB; ratio: 1.51\n",
       "  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n",
       "[ 326  326  327 ..., 2757 2757 2757]\n",
       "x : carray((53940,), float64)\n",
       "  nbytes: 421.41 KB; cbytes: 468.10 KB; ratio: 0.90\n",
       "  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n",
       "[ 3.95  3.89  4.05 ...,  5.66  6.15  5.83]\n",
       "y : carray((53940,), float64)\n",
       "  nbytes: 421.41 KB; cbytes: 467.91 KB; ratio: 0.90\n",
       "  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n",
       "[ 3.98  3.84  4.07 ...,  5.68  6.12  5.87]\n",
       "z : carray((53940,), float64)\n",
       "  nbytes: 421.41 KB; cbytes: 468.20 KB; ratio: 0.90\n",
       "  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n",
       "[ 2.43  2.31  2.31 ...,  3.56  3.74  3.64]"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dc.cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total size for the ctable: 3.49322795868 MB\n",
      "Compression ratio the ctable: 1.03081835096\n"
     ]
    }
   ],
   "source": [
    "dsize = dc.cbytes / 2**20.\n",
    "print(\"Total size for the ctable: {} MB\".format(dsize))\n",
    "print(\"Compression ratio the ctable: {}\".format((dc.nbytes / float(dc.cbytes))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([(0.21, 'Premium', 'E', 'SI1', 59.8, 61.0, 326, 3.89, 3.84, 2.31),\n",
       "       (0.29, 'Premium', 'I', 'VS2', 62.4, 58.0, 334, 4.2, 4.23, 2.63),\n",
       "       (0.22, 'Premium', 'F', 'SI1', 60.4, 61.0, 342, 3.88, 3.84, 2.33),\n",
       "       ...,\n",
       "       (0.71, 'Premium', 'F', 'SI1', 59.8, 62.0, 2756, 5.74, 5.73, 3.43),\n",
       "       (0.72, 'Premium', 'D', 'SI1', 62.7, 59.0, 2757, 5.69, 5.73, 3.58),\n",
       "       (0.86, 'Premium', 'H', 'SI2', 61.0, 58.0, 2757, 6.15, 6.12, 3.74)], \n",
       "      dtype=[('carat', '<f8'), ('cut', 'S9'), ('color', 'S1'), ('clarity', 'S4'), ('depth', '<f8'), ('table', '<f8'), ('price', '<i8'), ('x', '<f8'), ('y', '<f8'), ('z', '<f8')])"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# You can do DataFrame-like stuff\n",
    "dc[\"cut == 'Premium'\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([(0.23, 'Ideal', 'E', 'SI2', 61.5, 55.0, 326, 3.95, 3.98, 2.43),\n",
       "       (0.23, 'Ideal', 'J', 'VS1', 62.8, 56.0, 340, 3.93, 3.9, 2.46),\n",
       "       (0.31, 'Ideal', 'J', 'SI2', 62.2, 54.0, 344, 4.35, 4.37, 2.71), ...,\n",
       "       (0.71, 'Ideal', 'G', 'VS1', 61.4, 56.0, 2756, 5.76, 5.73, 3.53),\n",
       "       (0.72, 'Ideal', 'D', 'SI1', 60.8, 57.0, 2757, 5.75, 5.76, 3.5),\n",
       "       (0.75, 'Ideal', 'D', 'SI2', 62.2, 55.0, 2757, 5.83, 5.87, 3.64)], \n",
       "      dtype=[('carat', '<f8'), ('cut', 'S9'), ('color', 'S1'), ('clarity', 'S4'), ('depth', '<f8'), ('table', '<f8'), ('price', '<i8'), ('x', '<f8'), ('y', '<f8'), ('z', '<f8')])"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dc[\"(cut == 'Ideal') & (price > 1)\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# We can do the same thing with the ctable on disk!\n",
    "diskdc = dc.copy(rootdir='diamonds')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([(0.7, 'Ideal', 'E', 'SI1', 62.5, 57.0, 2757, 5.7, 5.72, 3.57),\n",
       "       (0.7, 'Ideal', 'G', 'VS2', 61.6, 56.0, 2757, 5.7, 5.67, 3.5),\n",
       "       (0.74, 'Ideal', 'G', 'SI1', 61.6, 55.0, 2760, 5.8, 5.85, 3.59), ...,\n",
       "       (0.71, 'Ideal', 'G', 'VS1', 61.4, 56.0, 2756, 5.76, 5.73, 3.53),\n",
       "       (0.72, 'Ideal', 'D', 'SI1', 60.8, 57.0, 2757, 5.75, 5.76, 3.5),\n",
       "       (0.75, 'Ideal', 'D', 'SI2', 62.2, 55.0, 2757, 5.83, 5.87, 3.64)], \n",
       "      dtype=[('carat', '<f8'), ('cut', 'S9'), ('color', 'S1'), ('clarity', 'S4'), ('depth', '<f8'), ('table', '<f8'), ('price', '<i8'), ('x', '<f8'), ('y', '<f8'), ('z', '<f8')])"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diskdc[\"(cut == 'Ideal') & (price > 1000)\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "diamonds/\n",
      "    __attrs__\n",
      "    __rootdirs__\n",
      "    carat/\n",
      "        __attrs__\n",
      "        data/\n",
      "            __0.blp\n",
      "            __1.blp\n",
      "        meta/\n",
      "            sizes\n",
      "            storage\n",
      "    clarity/\n",
      "        __attrs__\n",
      "        data/\n",
      "            __0.blp\n",
      "        meta/\n",
      "            sizes\n",
      "            storage\n",
      "    color/\n",
      "        __attrs__\n",
      "        data/\n",
      "            __0.blp\n",
      "        meta/\n",
      "            sizes\n",
      "            storage\n",
      "    cut/\n",
      "        __attrs__\n",
      "        data/\n",
      "            __0.blp\n",
      "            __1.blp\n",
      "        meta/\n",
      "            sizes\n",
      "            storage\n",
      "    depth/\n",
      "        __attrs__\n",
      "        data/\n",
      "            __0.blp\n",
      "            __1.blp\n",
      "        meta/\n",
      "            sizes\n",
      "            storage\n",
      "    price/\n",
      "        __attrs__\n",
      "        data/\n",
      "            __0.blp\n",
      "            __1.blp\n",
      "        meta/\n",
      "            sizes\n",
      "            storage\n",
      "    table/\n",
      "        __attrs__\n",
      "        data/\n",
      "            __0.blp\n",
      "            __1.blp\n",
      "        meta/\n",
      "            sizes\n",
      "            storage\n",
      "    x/\n",
      "        __attrs__\n",
      "        data/\n",
      "            __0.blp\n",
      "            __1.blp\n",
      "        meta/\n",
      "            sizes\n",
      "            storage\n",
      "    y/\n",
      "        __attrs__\n",
      "        data/\n",
      "            __0.blp\n",
      "            __1.blp\n",
      "        meta/\n",
      "            sizes\n",
      "            storage\n",
      "    z/\n",
      "        __attrs__\n",
      "        data/\n",
      "            __0.blp\n",
      "            __1.blp\n",
      "        meta/\n",
      "            sizes\n",
      "            storage\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "for root, dirs, files in os.walk('diamonds'):\n",
    "    level = root.replace('diamonds', '').count(os.sep)\n",
    "    indent = ' ' * 4 * (level)\n",
    "    print('{}{}/'.format(indent, os.path.basename(root)))\n",
    "    subindent = ' ' * 4 * (level + 1)\n",
    "    for f in files:\n",
    "        print('{}{}'.format(subindent, f))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Dask!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dask.array<x_1, shape=(100000,), chunks=((100, 100, 100, ..., 100, 100)), dtype=float64>"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import dask.array as da\n",
    "import dask.dataframe as dd\n",
    "import dask.bag as db\n",
    "\n",
    "d_arr = da.from_array(np.random.randn(100000), chunks=100)\n",
    "d_arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dask.array<x_3, shape=(), chunks=(), dtype=float64>"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d_arr.sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Wait, what happened? Why didn't I get back an integer? \n",
    "\n",
    "Because dask is lazily evaluated- we must `compute` it!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "224.03863985322081"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d_arr.sum().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0022403863985322081"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d_arr.mean().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4.3235191229285963"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d_arr.max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "cut\n",
       "Fair         4358.757764\n",
       "Good         3928.864452\n",
       "Ideal        3457.541970\n",
       "Premium      4584.257704\n",
       "Very Good    3981.759891\n",
       "Name: price, dtype: float64"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import dask.dataframe as dd\n",
    "\n",
    "ddf = dd.read_csv('diamonds.csv')\n",
    "ddf.groupby('cut')['price'].mean().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "212135217"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ddf['price'].sum().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "clarity\n",
       "I1       675\n",
       "IF      1042\n",
       "SI1     9978\n",
       "SI2     8118\n",
       "VS1     5702\n",
       "VS2     8647\n",
       "VVS1    2108\n",
       "VVS2    3146\n",
       "Name: carat, dtype: int64"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ddf[ddf['price'] > 1000].groupby('clarity')['carat'].count().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We're not quite feature complete yet..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'DataFrame' object has no attribute 'sort'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-117-b0c21ae068eb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mddf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mddf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cut'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Ideal'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Premium'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'price'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/Users/robstory/src/pydataseattle2015/.env/lib/python2.7/site-packages/dask/dataframe/core.pyc\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m    584\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    585\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m                 \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    587\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    588\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__dir__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'sort'"
     ]
    }
   ],
   "source": [
    "ddf[ddf['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "diamonds_b = db.from_sequence(diamonds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18823"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tz.thread_last(diamonds_b, (tz.map, lambda x: x['price']), max)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'I1': 675,\n",
       " 'IF': 1042,\n",
       " 'SI1': 9978,\n",
       " 'SI2': 8118,\n",
       " 'VS1': 5702,\n",
       " 'VS2': 8647,\n",
       " 'VVS1': 2108,\n",
       " 'VVS2': 3146}"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tzc.pipe(diamonds_b, \n",
    "         tzc.filter(lambda r: r['price'] > 1000),\n",
    "         tzc.map(lambda r: (r['clarity'],)),\n",
    "         tzc.countby(lambda r: r[0]),\n",
    "         dict)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}