{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# stdlib!\n", "\n", "We're going to import the data, including our own type converter:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import csv\n", "\n", "conversion_map = {\n", " 'carat': float,\n", " 'depth': float,\n", " 'price': int,\n", " 'table': float,\n", " 'x': float,\n", " 'y': float,\n", " 'z': float\n", "}\n", "def converter(type_map, row):\n", " \"\"\"Yep, we need to roll our own type conversions.\"\"\"\n", " converted_row = {}\n", " for col, val in row.items():\n", " converter = type_map.get(col)\n", " if converter:\n", " converted_row[col] = converter(val)\n", " else:\n", " converted_row[col] = val\n", " return converted_row\n", "\n", "with open('diamonds.csv', 'r') as f:\n", " reader = csv.DictReader(f)\n", " diamonds = [converter(conversion_map, r) for r in reader]" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[{'': '1',\n", " 'carat': 0.23,\n", " 'clarity': 'SI2',\n", " 'color': 'E',\n", " 'cut': 'Ideal',\n", " 'depth': 61.5,\n", " 'price': 326,\n", " 'table': 55.0,\n", " 'x': 3.95,\n", " 'y': 3.98,\n", " 'z': 2.43},\n", " {'': '2',\n", " 'carat': 0.21,\n", " 'clarity': 'SI1',\n", " 'color': 'E',\n", " 'cut': 'Premium',\n", " 'depth': 59.8,\n", " 'price': 326,\n", " 'table': 61.0,\n", " 'x': 3.89,\n", " 'y': 3.84,\n", " 'z': 2.31}]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diamonds[:2]" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "Adding things up is easy enough..." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "43040.86999999912" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_total_carats():\n", " total_carats = 0\n", " for row in diamonds:\n", " total_carats += row['carat']\n", " return total_carats\n", "get_total_carats()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "43040.86999999912" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Faster/more compact: Generator expression!\n", "sum(row['carat'] for row in diamonds)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100 loops, best of 3: 5.29 ms per loop\n" ] } ], "source": [ "# Which is faster? \n", "%timeit get_total_carats()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100 loops, best of 3: 5.7 ms per loop\n" ] } ], "source": [ "%timeit sum(row['carat'] for row in diamonds)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "But what if we want to group, then add?" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# defaultdict is awesome. defaultdict is awesome.\n", "from collections import defaultdict\n", "\n", "def grouper(grouping_col, seq):\n", " \"\"\"People have definitely written a faster version than what I'm about to write\"\"\"\n", " groups = defaultdict(lambda: defaultdict(list))\n", " for row in seq:\n", " group = groups[row[grouping_col]]\n", " for k, v in row.items():\n", " if k != grouping_col:\n", " group[k].append(v)\n", " return groups" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "groups = grouper('cut', diamonds)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# check that the groups include all the rows from the original data\n", "assert sum(len(g['price']) for g in groups.values()) == len(diamonds)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'Fair', 'Good', 'Ideal', 'Premium', 'Very Good'}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set(groups)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What if I wanted to do something like \n", "```sql\n", "select cut, mean(price)\n", "from diamonds\n", "group by cut;\n", "```" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'Fair': 4358.757763975155,\n", " 'Good': 3928.864451691806,\n", " 'Ideal': 3457.541970210199,\n", " 'Premium': 4584.2577042999055,\n", " 'Very Good': 3981.7598907465654}" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary = {}\n", "for group, values in groups.items():\n", " summary[group] = sum(values['price']) / len(values['price'])\n", "summary" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "or how about this one:\n", "```sql\n", "select max(price)\n", "from diamonds\n", "where carat > 1;\n", "```" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "18823" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_max_price():\n", " max_price = 0\n", " for row in diamonds:\n", " if row['carat'] > 1 and row['price'] > max_price:\n", " max_price = row['price']\n", " return max_price\n", "get_max_price()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "18823" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# More compact yet again: generator expression!\n", "max(row['price'] for row in diamonds if row['carat'] > 1)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100 loops, best of 3: 6.82 ms per loop\n" ] } ], "source": [ "# Which is faster?\n", "%timeit get_max_price()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100 loops, best of 3: 7.36 ms per loop\n" ] } ], "source": [ "%timeit max(row['price'] for row in diamonds if row['carat'] > 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### itertools interlude\n", "\n", "The itertools module does lots of nice things. You should be aware of it, and use it where you would be writing your own bespoke counting thing. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Toolz!\n", "\n", "Lets see what it looks like to repeat some of this analysis using the toolz library. PSA: toolz has a pretty huge API, and it's worth reading through the docs: http://toolz.readthedocs.org/en/latest/api.html" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import toolz as tz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Some quick toolz fun things:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1, 2, 3, 4, 5, 6]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(tz.concat(([1, 2, 3], (4, 5, 6))))" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['A-foo', 'B-foo', 'c-foo', 'd-foo', 'bar-foo', 'baz-foo']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(tz.mapcat(lambda r: [x + \"-foo\" for x in r],\n", " [[\"A\", \"B\"], (\"c\", \"d\"), (\"bar\", \"baz\")]))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'D': 6775, 'E': 9797, 'F': 9542, 'G': 11292, 'H': 8304, 'I': 5422, 'J': 2808}" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tz.frequencies([r['color'] for r in diamonds])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Remember that toolz is lazy- functions will return generator-like things:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "take_2 = tz.take(2, diamonds)\n", "take_2" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unique_clarity = tz.unique(diamonds, key=lambda x: x.get('clarity'))\n", "unique_clarity" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[{'': '1',\n", " 'carat': 0.23,\n", " 'clarity': 'SI2',\n", " 'color': 'E',\n", " 'cut': 'Ideal',\n", " 'depth': 61.5,\n", " 'price': 326,\n", " 'table': 55.0,\n", " 'x': 3.95,\n", " 'y': 3.98,\n", " 'z': 2.43},\n", " {'': '2',\n", " 'carat': 0.21,\n", " 'clarity': 'SI1',\n", " 'color': 'E',\n", " 'cut': 'Premium',\n", " 'depth': 59.8,\n", " 'price': 326,\n", " 'table': 61.0,\n", " 'x': 3.89,\n", " 'y': 3.84,\n", " 'z': 2.31}]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(take_2)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[{'': '1',\n", " 'carat': 0.23,\n", " 'clarity': 'SI2',\n", " 'color': 'E',\n", " 'cut': 'Ideal',\n", " 'depth': 61.5,\n", " 'price': 326,\n", " 'table': 55.0,\n", " 'x': 3.95,\n", " 'y': 3.98,\n", " 'z': 2.43},\n", " {'': '2',\n", " 'carat': 0.21,\n", " 'clarity': 'SI1',\n", " 'color': 'E',\n", " 'cut': 'Premium',\n", " 'depth': 59.8,\n", " 'price': 326,\n", " 'table': 61.0,\n", " 'x': 3.89,\n", " 'y': 3.84,\n", " 'z': 2.31},\n", " {'': '3',\n", " 'carat': 0.23,\n", " 'clarity': 'VS1',\n", " 'color': 'E',\n", " 'cut': 'Good',\n", " 'depth': 56.9,\n", " 'price': 327,\n", " 'table': 65.0,\n", " 'x': 4.05,\n", " 'y': 4.07,\n", " 'z': 2.31},\n", " {'': '4',\n", " 'carat': 0.29,\n", " 'clarity': 'VS2',\n", " 'color': 'I',\n", " 'cut': 'Premium',\n", " 'depth': 62.4,\n", " 'price': 334,\n", " 'table': 58.0,\n", " 'x': 4.2,\n", " 'y': 4.23,\n", " 'z': 2.63},\n", " {'': '6',\n", " 'carat': 0.24,\n", " 'clarity': 'VVS2',\n", " 'color': 'J',\n", " 'cut': 'Very Good',\n", " 'depth': 62.8,\n", " 'price': 336,\n", " 'table': 57.0,\n", " 'x': 3.94,\n", " 'y': 3.96,\n", " 'z': 2.48},\n", " {'': '7',\n", " 'carat': 0.24,\n", " 'clarity': 'VVS1',\n", " 'color': 'I',\n", " 'cut': 'Very Good',\n", " 'depth': 62.3,\n", " 'price': 336,\n", " 'table': 57.0,\n", " 'x': 3.95,\n", " 'y': 3.98,\n", " 'z': 2.47},\n", " {'': '16',\n", " 'carat': 0.32,\n", " 'clarity': 'I1',\n", " 'color': 'E',\n", " 'cut': 'Premium',\n", " 'depth': 60.9,\n", " 'price': 345,\n", " 'table': 58.0,\n", " 'x': 4.38,\n", " 'y': 4.42,\n", " 'z': 2.68},\n", " {'': '230',\n", " 'carat': 0.52,\n", " 'clarity': 'IF',\n", " 'color': 'F',\n", " 'cut': 'Ideal',\n", " 'depth': 62.2,\n", " 'price': 2783,\n", " 'table': 55.0,\n", " 'x': 5.14,\n", " 'y': 5.18,\n", " 'z': 3.21}]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Note that this returns the entire object\n", "list(unique_clarity)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'I1': 741,\n", " 'IF': 1790,\n", " 'SI1': 13065,\n", " 'SI2': 9194,\n", " 'VS1': 8171,\n", " 'VS2': 12258,\n", " 'VVS1': 3655,\n", " 'VVS2': 5066}" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# What are our clarity counts?\n", "tz.countby(lambda x: x['clarity'], diamonds)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "18823" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# What about our max price from above? Reduction!\n", "def comparo(accum, row):\n", " price = row['price']\n", " if price > accum:\n", " return price\n", " else:\n", " return accum\n", " \n", "tz.reduce(comparo, diamonds, 0)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "18823" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We could have also threaded here\n", "tz.thread_last(diamonds, \n", " (tz.map, lambda x: x['price']), \n", " max)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100 loops, best of 3: 8.6 ms per loop\n" ] } ], "source": [ "# Which is faster?\n", "%timeit tz.reduce(comparo, diamonds, 0)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100 loops, best of 3: 11.2 ms per loop\n" ] } ], "source": [ "%timeit tz.thread_last(diamonds, (tz.map, lambda x: x['price']), max)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look at another SQL query- say we want clarity and carats where price > 1000:\n", "```sql\n", "select count(1)\n", "from diamonds\n", "where price > 1000\n", "group by clarity;\n", "```" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'I1': 675,\n", " 'IF': 1042,\n", " 'SI1': 9978,\n", " 'SI2': 8118,\n", " 'VS1': 5702,\n", " 'VS2': 8647,\n", " 'VVS1': 2108,\n", " 'VVS2': 3146}" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Toolz has currying!\n", "import toolz.curried as tzc\n", "tzc.pipe(diamonds, \n", " tzc.filter(lambda r: r['price'] > 1000),\n", " tzc.map(lambda r: (r['clarity'],)),\n", " tzc.countby(lambda r: r[0]),\n", " dict)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'I1': 675,\n", " 'IF': 1042,\n", " 'SI1': 9978,\n", " 'SI2': 8118,\n", " 'VS1': 5702,\n", " 'VS2': 8647,\n", " 'VVS1': 2108,\n", " 'VVS2': 3146}" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We can go about this another way as well:\n", "def filter_and_count(kv):\n", " f_and_c = tz.thread_last(kv[1],\n", " (tz.filter, lambda r: r['price'] > 1000),\n", " tz.count)\n", " \n", " return kv[0], f_and_c\n", "\n", "tz.thread_last(diamonds,\n", " (tz.groupby, 'clarity'),\n", " (tz.itemmap, filter_and_count))\n" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'I1': 675,\n", " 'IF': 1042,\n", " 'SI1': 9978,\n", " 'SI2': 8118,\n", " 'VS1': 5702,\n", " 'VS2': 8647,\n", " 'VVS1': 2108,\n", " 'VVS2': 3146}" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cleanest/best way: reduceby: Groupby + reduce\n", "def increment(accum, row):\n", " if row['price'] > 1000:\n", " return accum + 1\n", " else:\n", " return accum\n", "\n", "tz.reduceby('clarity', \n", " increment,\n", " diamonds, 0) " ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Pandas!" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "# We don't need this to use Pandas, FYI\n", "import numpy as np\n", "\n", "# CSV reader is fast!\n", "df = pd.read_csv('diamonds.csv', index_col=0)\n", "# Keep this for later, we're going to overwrite df\n", "df_diamonds = df" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
caratcutcolorclaritydepthtablepricexyz
10.23IdealESI261.5553263.953.982.43
20.21PremiumESI159.8613263.893.842.31
30.23GoodEVS156.9653274.054.072.31
40.29PremiumIVS262.4583344.204.232.63
50.31GoodJSI263.3583354.344.352.75
\n", "
" ], "text/plain": [ " carat cut color clarity depth table price x y z\n", "1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43\n", "2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31\n", "3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31\n", "4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63\n", "5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
caratdepthtablepricexyz
count53940.00000053940.00000053940.00000053940.00000053940.00000053940.00000053940.000000
mean0.79794061.74940557.4571843932.7997225.7311575.7345263.538734
std0.4740111.4326212.2344913989.4397381.1217611.1421350.705699
min0.20000043.00000043.000000326.0000000.0000000.0000000.000000
25%0.40000061.00000056.000000950.0000004.7100004.7200002.910000
50%0.70000061.80000057.0000002401.0000005.7000005.7100003.530000
75%1.04000062.50000059.0000005324.2500006.5400006.5400004.040000
max5.01000079.00000095.00000018823.00000010.74000058.90000031.800000
\n", "
" ], "text/plain": [ " carat depth table price x \\\n", "count 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 \n", "mean 0.797940 61.749405 57.457184 3932.799722 5.731157 \n", "std 0.474011 1.432621 2.234491 3989.439738 1.121761 \n", "min 0.200000 43.000000 43.000000 326.000000 0.000000 \n", "25% 0.400000 61.000000 56.000000 950.000000 4.710000 \n", "50% 0.700000 61.800000 57.000000 2401.000000 5.700000 \n", "75% 1.040000 62.500000 59.000000 5324.250000 6.540000 \n", "max 5.010000 79.000000 95.000000 18823.000000 10.740000 \n", "\n", " y z \n", "count 53940.000000 53940.000000 \n", "mean 5.734526 3.538734 \n", "std 1.142135 0.705699 \n", "min 0.000000 0.000000 \n", "25% 4.720000 2.910000 \n", "50% 5.710000 3.530000 \n", "75% 6.540000 4.040000 \n", "max 58.900000 31.800000 " ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
caratdepthtablepricexyz
clarity
I11.28384662.73427858.3037793924.1686916.7610936.7093794.207908
IF0.50512361.51061556.5072072864.8391064.9684024.9898273.061659
SI10.85048261.85304257.6625413996.0011485.8883835.8882563.639845
SI21.07764861.77216757.9271815063.0286066.4013706.3978263.948478
VS10.72715861.66745857.3151513839.4553915.5721785.5818283.441007
VS20.76393561.72441757.4174013924.9893955.6577095.6588593.491478
VVS10.50332161.62465156.8844602523.1146374.9603644.9750753.061294
VVS20.59620261.66377857.0249903283.7370715.2184545.2321183.221465
\n", "
" ], "text/plain": [ " carat depth table price x y \\\n", "clarity \n", "I1 1.283846 62.734278 58.303779 3924.168691 6.761093 6.709379 \n", "IF 0.505123 61.510615 56.507207 2864.839106 4.968402 4.989827 \n", "SI1 0.850482 61.853042 57.662541 3996.001148 5.888383 5.888256 \n", "SI2 1.077648 61.772167 57.927181 5063.028606 6.401370 6.397826 \n", "VS1 0.727158 61.667458 57.315151 3839.455391 5.572178 5.581828 \n", "VS2 0.763935 61.724417 57.417401 3924.989395 5.657709 5.658859 \n", "VVS1 0.503321 61.624651 56.884460 2523.114637 4.960364 4.975075 \n", "VVS2 0.596202 61.663778 57.024990 3283.737071 5.218454 5.232118 \n", "\n", " z \n", "clarity \n", "I1 4.207908 \n", "IF 3.061659 \n", "SI1 3.639845 \n", "SI2 3.948478 \n", "VS1 3.441007 \n", "VS2 3.491478 \n", "VVS1 3.061294 \n", "VVS2 3.221465 " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('clarity').mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our previous queries:\n", "```sql\n", "select cut, mean(price)\n", "from diamonds\n", "group by cut;\n", "\n", "select count(carat)\n", "from diamonds\n", "where price > 1000\n", "group by clarity;\n", "\n", "select max(price)\n", "from diamonds\n", "where carat > 1;\n", "\n", "select cut, price\n", "from diamonds\n", "where cut in ('Ideal', 'Premium')\n", "order by price desc\n", "limit 10;\n", "```\n", "\n", "Are pretty trivial operations in Pandas:" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "cut\n", "Fair 4358.757764\n", "Good 3928.864452\n", "Ideal 3457.541970\n", "Premium 4584.257704\n", "Very Good 3981.759891\n", "Name: price, dtype: float64" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('cut')['price'].mean()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "clarity\n", "I1 675\n", "IF 1042\n", "SI1 9978\n", "SI2 8118\n", "VS1 5702\n", "VS2 8647\n", "VVS1 2108\n", "VVS2 3146\n", "Name: carat, dtype: int64" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['price'] > 1000].groupby('clarity')['carat'].count()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "18823" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['carat'] > 1]['price'].max()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
caratcutcolorclaritydepthtablepricexyz
277502.29PremiumIVS260.860188238.508.475.16
277481.51IdealGIF61.755188067.377.414.56
277472.07IdealGSI262.555188048.208.135.11
277452.29PremiumISI161.859187978.528.455.24
277442.00PremiumIVS160.859187958.138.024.91
277432.04PremiumHSI158.160187958.378.284.84
277422.15IdealGSI262.654187918.298.355.21
277411.71PremiumFVS262.359187917.577.534.70
277392.05IdealGSI161.957187878.108.165.03
277382.05PremiumFSI260.259187848.288.335.00
\n", "
" ], "text/plain": [ " carat cut color clarity depth table price x y z\n", "27750 2.29 Premium I VS2 60.8 60 18823 8.50 8.47 5.16\n", "27748 1.51 Ideal G IF 61.7 55 18806 7.37 7.41 4.56\n", "27747 2.07 Ideal G SI2 62.5 55 18804 8.20 8.13 5.11\n", "27745 2.29 Premium I SI1 61.8 59 18797 8.52 8.45 5.24\n", "27744 2.00 Premium I VS1 60.8 59 18795 8.13 8.02 4.91\n", "27743 2.04 Premium H SI1 58.1 60 18795 8.37 8.28 4.84\n", "27742 2.15 Ideal G SI2 62.6 54 18791 8.29 8.35 5.21\n", "27741 1.71 Premium F VS2 62.3 59 18791 7.57 7.53 4.70\n", "27739 2.05 Ideal G SI1 61.9 57 18787 8.10 8.16 5.03\n", "27738 2.05 Premium F SI2 60.2 59 18784 8.28 8.33 5.00" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In which I do a bunch of cool Pandas things without a real goal" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
12345678910...53931539325393353934539355393653937539385393953940
carat0.230.210.230.290.310.240.240.260.220.23...0.710.710.70.70.720.720.720.70.860.75
cutIdealPremiumGoodPremiumGoodVery GoodVery GoodVery GoodFairVery Good...PremiumPremiumVery GoodVery GoodPremiumIdealGoodVery GoodPremiumIdeal
colorEEEIJJIHEH...EFEEDDDDHD
claritySI2SI1VS1VS2SI2VVS2VVS1SI1VS2VS1...SI1SI1VS2VS2SI1SI1SI1SI1SI2SI2
depth61.559.856.962.463.362.862.361.965.159.4...60.559.860.561.262.760.863.162.86162.2
table55616558585757556161...55625959595755605855
price326326327334335336336337337338...2756275627572757275727572757275727572757
x3.953.894.054.24.343.943.954.073.874...5.795.745.715.695.695.755.695.666.155.83
y3.983.844.074.234.353.963.984.113.784.05...5.745.735.765.725.735.765.755.686.125.87
z2.432.312.312.632.752.482.472.532.492.39...3.493.433.473.493.583.53.613.563.743.64
\n", "

10 rows × 53940 columns

\n", "
" ], "text/plain": [ " 1 2 3 4 5 6 7 8 \\\n", "carat 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 \n", "cut Ideal Premium Good Premium Good Very Good Very Good Very Good \n", "color E E E I J J I H \n", "clarity SI2 SI1 VS1 VS2 SI2 VVS2 VVS1 SI1 \n", "depth 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 \n", "table 55 61 65 58 58 57 57 55 \n", "price 326 326 327 334 335 336 336 337 \n", "x 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 \n", "y 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 \n", "z 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 \n", "\n", " 9 10 ... 53931 53932 53933 53934 \\\n", "carat 0.22 0.23 ... 0.71 0.71 0.7 0.7 \n", "cut Fair Very Good ... Premium Premium Very Good Very Good \n", "color E H ... E F E E \n", "clarity VS2 VS1 ... SI1 SI1 VS2 VS2 \n", "depth 65.1 59.4 ... 60.5 59.8 60.5 61.2 \n", "table 61 61 ... 55 62 59 59 \n", "price 337 338 ... 2756 2756 2757 2757 \n", "x 3.87 4 ... 5.79 5.74 5.71 5.69 \n", "y 3.78 4.05 ... 5.74 5.73 5.76 5.72 \n", "z 2.49 2.39 ... 3.49 3.43 3.47 3.49 \n", "\n", " 53935 53936 53937 53938 53939 53940 \n", "carat 0.72 0.72 0.72 0.7 0.86 0.75 \n", "cut Premium Ideal Good Very Good Premium Ideal \n", "color D D D D H D \n", "clarity SI1 SI1 SI1 SI1 SI2 SI2 \n", "depth 62.7 60.8 63.1 62.8 61 62.2 \n", "table 59 57 55 60 58 55 \n", "price 2757 2757 2757 2757 2757 2757 \n", "x 5.69 5.75 5.69 5.66 6.15 5.83 \n", "y 5.73 5.76 5.75 5.68 6.12 5.87 \n", "z 3.58 3.5 3.61 3.56 3.74 3.64 \n", "\n", "[10 rows x 53940 columns]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# I can Transpose things!\n", "df.T" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
caratcutcolorclaritydepthtablepricexyz
277502.29PremiumIVS260.860188238.508.475.16
277492.00Very GoodGSI163.556188187.907.975.04
277481.51IdealGIF61.755188067.377.414.56
277472.07IdealGSI262.555188048.208.135.11
277462.00Very GoodHSI162.857188037.958.005.01
\n", "
" ], "text/plain": [ " carat cut color clarity depth table price x y z\n", "27750 2.29 Premium I VS2 60.8 60 18823 8.50 8.47 5.16\n", "27749 2.00 Very Good G SI1 63.5 56 18818 7.90 7.97 5.04\n", "27748 1.51 Ideal G IF 61.7 55 18806 7.37 7.41 4.56\n", "27747 2.07 Ideal G SI2 62.5 55 18804 8.20 8.13 5.11\n", "27746 2.00 Very Good H SI1 62.8 57 18803 7.95 8.00 5.01" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# SORT ALL THE THINGS!\n", "df.sort(['price', 'carat'], ascending=False).head()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
float_colint_colstr_coltime_col
a1.51a2015-01-01
b2.52b2015-01-02
c3.53c2015-01-03
a4.54d2015-01-04
b5.55e2015-01-05
c6.56f2015-01-06
a7.57g2015-01-07
b8.58h2015-01-08
c9.59i2015-01-09
a10.510j2015-01-10
\n", "
" ], "text/plain": [ " float_col int_col str_col time_col\n", "a 1.5 1 a 2015-01-01\n", "b 2.5 2 b 2015-01-02\n", "c 3.5 3 c 2015-01-03\n", "a 4.5 4 d 2015-01-04\n", "b 5.5 5 e 2015-01-05\n", "c 6.5 6 f 2015-01-06\n", "a 7.5 7 g 2015-01-07\n", "b 8.5 8 h 2015-01-08\n", "c 9.5 9 i 2015-01-09\n", "a 10.5 10 j 2015-01-10" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Lets use some fake data to show off some stuff:\n", "simple_data_1 = {\"int_col\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n", " \"str_col\": [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\"],\n", " \"float_col\": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5],\n", " \"time_col\": [\"2015-01-01\", \"2015-01-02\", \"2015-01-03\", \"2015-01-04\", \"2015-01-05\",\n", " \"2015-01-06\", \"2015-01-07\", \"2015-01-08\", \"2015-01-09\", \"2015-01-10\"]}\n", "my_index = [\"a\", \"b\", \"c\", \"a\", \"b\", \"c\", \"a\", \"b\", \"c\", \"a\"]\n", "df = pd.DataFrame(simple_data_1, index=my_index)\n", "df" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
float_colint_colstr_coltime_col
a1.51a2015-01-01
a4.54d2015-01-04
a7.57g2015-01-07
a10.510j2015-01-10
\n", "
" ], "text/plain": [ " float_col int_col str_col time_col\n", "a 1.5 1 a 2015-01-01\n", "a 4.5 4 d 2015-01-04\n", "a 7.5 7 g 2015-01-07\n", "a 10.5 10 j 2015-01-10" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# DataFrames have indices that can be gotten by label or position\n", "df.loc['a']" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "float_col 6.5\n", "int_col 6\n", "str_col f\n", "time_col 2015-01-06\n", "Name: c, dtype: object" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[5]" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
float_colint_colstr_coltime_col
c3.53c2015-01-03
a4.54d2015-01-04
\n", "
" ], "text/plain": [ " float_col int_col str_col time_col\n", "c 3.5 3 c 2015-01-03\n", "a 4.5 4 d 2015-01-04" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[2:4]" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
float_colint_colstr_coltime_colnew_col
a1.51a2015-01-01NaN
b2.52b2015-01-02NaN
c3.53c2015-01-031
a4.54d2015-01-042
b5.55e2015-01-05NaN
c6.56f2015-01-064
a7.57g2015-01-075
b8.58h2015-01-089
c9.59i2015-01-09NaN
a10.510j2015-01-1010
\n", "
" ], "text/plain": [ " float_col int_col str_col time_col new_col\n", "a 1.5 1 a 2015-01-01 NaN\n", "b 2.5 2 b 2015-01-02 NaN\n", "c 3.5 3 c 2015-01-03 1\n", "a 4.5 4 d 2015-01-04 2\n", "b 5.5 5 e 2015-01-05 NaN\n", "c 6.5 6 f 2015-01-06 4\n", "a 7.5 7 g 2015-01-07 5\n", "b 8.5 8 h 2015-01-08 9\n", "c 9.5 9 i 2015-01-09 NaN\n", "a 10.5 10 j 2015-01-10 10" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# New Column! With Missing Data!\n", "df['new_col'] = [np.nan, np.nan, 1.0, 2.0, np.nan, 4.0, 5.0, 9.0, np.nan, 10.0]\n", "df" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
float_colint_colstr_coltime_colnew_col
c3.53c2015-01-031
a4.54d2015-01-042
c6.56f2015-01-064
a7.57g2015-01-075
b8.58h2015-01-089
a10.510j2015-01-1010
\n", "
" ], "text/plain": [ " float_col int_col str_col time_col new_col\n", "c 3.5 3 c 2015-01-03 1\n", "a 4.5 4 d 2015-01-04 2\n", "c 6.5 6 f 2015-01-06 4\n", "a 7.5 7 g 2015-01-07 5\n", "b 8.5 8 h 2015-01-08 9\n", "a 10.5 10 j 2015-01-10 10" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Removing missing data!\n", "df.dropna()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
float_colint_colstr_coltime_colnew_col
a1.51a2015-01-01FOO!
b2.52b2015-01-02FOO!
c3.53c2015-01-031
a4.54d2015-01-042
b5.55e2015-01-05FOO!
c6.56f2015-01-064
a7.57g2015-01-075
b8.58h2015-01-089
c9.59i2015-01-09FOO!
a10.510j2015-01-1010
\n", "
" ], "text/plain": [ " float_col int_col str_col time_col new_col\n", "a 1.5 1 a 2015-01-01 FOO!\n", "b 2.5 2 b 2015-01-02 FOO!\n", "c 3.5 3 c 2015-01-03 1\n", "a 4.5 4 d 2015-01-04 2\n", "b 5.5 5 e 2015-01-05 FOO!\n", "c 6.5 6 f 2015-01-06 4\n", "a 7.5 7 g 2015-01-07 5\n", "b 8.5 8 h 2015-01-08 9\n", "c 9.5 9 i 2015-01-09 FOO!\n", "a 10.5 10 j 2015-01-10 10" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fill missing data!\n", "df.fillna(\"FOO!\")" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
float_colint_colstr_coltime_colnew_col
a1.51a2015-01-011
b2.52b2015-01-021
c3.53c2015-01-031
a4.54d2015-01-042
b5.55e2015-01-054
c6.56f2015-01-064
a7.57g2015-01-075
b8.58h2015-01-089
c9.59i2015-01-0910
a10.510j2015-01-1010
\n", "
" ], "text/plain": [ " float_col int_col str_col time_col new_col\n", "a 1.5 1 a 2015-01-01 1\n", "b 2.5 2 b 2015-01-02 1\n", "c 3.5 3 c 2015-01-03 1\n", "a 4.5 4 d 2015-01-04 2\n", "b 5.5 5 e 2015-01-05 4\n", "c 6.5 6 f 2015-01-06 4\n", "a 7.5 7 g 2015-01-07 5\n", "b 8.5 8 h 2015-01-08 9\n", "c 9.5 9 i 2015-01-09 10\n", "a 10.5 10 j 2015-01-10 10" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Backfill missing data!\n", "df.fillna(method='bfill')" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
float_colint_colstr_coltime_colnew_col
a1.51FOO2015-01-01NaN
b2.52FoO2015-01-02NaN
c3.53Foo2015-01-031
a4.54Fo o2015-01-042
b5.55FOO2015-01-05NaN
c6.56fOO2015-01-064
a7.57fOo2015-01-075
b8.58FoO2015-01-089
c9.59foO2015-01-09NaN
a10.510Foo2015-01-1010
\n", "
" ], "text/plain": [ " float_col int_col str_col time_col new_col\n", "a 1.5 1 FOO 2015-01-01 NaN\n", "b 2.5 2 FoO 2015-01-02 NaN\n", "c 3.5 3 Foo 2015-01-03 1\n", "a 4.5 4 Fo o 2015-01-04 2\n", "b 5.5 5 FOO 2015-01-05 NaN\n", "c 6.5 6 fOO 2015-01-06 4\n", "a 7.5 7 fOo 2015-01-07 5\n", "b 8.5 8 FoO 2015-01-08 9\n", "c 9.5 9 foO 2015-01-09 NaN\n", "a 10.5 10 Foo 2015-01-10 10" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Vectorized string methods!\n", "df['str_col'] = [\"FOO\", \"FoO\", \"Foo\", \"Fo o\", \"FOO\", \"fOO\", \"fOo\", \"FoO\", \"foO\", \"Foo \"]\n", "df" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a foo\n", "b foo\n", "c foo\n", "a foo\n", "b foo\n", "c foo\n", "a foo\n", "b foo\n", "c foo\n", "a foo\n", "Name: str_col, dtype: object" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['str_col'].str.lower().str.replace(' ', '')" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keylvalrval
0foo14
1bar25
\n", "
" ], "text/plain": [ " key lval rval\n", "0 foo 1 4\n", "1 bar 2 5" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Database style joins!!\n", "left = pd.DataFrame({'key': ['foo', 'bar', 'fizz'], 'lval': [1, 2, 3]})\n", "right = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'qux'], \n", " 'rval': [4, 5, 6, 7]})\n", "left.merge(right, how='inner')" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keyrvallval
0foo41
1bar52
2baz6NaN
3qux7NaN
4fizzNaN3
\n", "
" ], "text/plain": [ " key rval lval\n", "0 foo 4 1\n", "1 bar 5 2\n", "2 baz 6 NaN\n", "3 qux 7 NaN\n", "4 fizz NaN 3" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "right.merge(left, how='outer')" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keylvalrval
0foo14
1bar25
2fizz3NaN
\n", "
" ], "text/plain": [ " key lval rval\n", "0 foo 1 4\n", "1 bar 2 5\n", "2 fizz 3 NaN" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left.merge(right, how='left')" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2015-03-25 00:00:00', '2015-03-25 01:00:00',\n", " '2015-03-25 02:00:00', '2015-03-25 03:00:00',\n", " '2015-03-25 04:00:00', '2015-03-25 05:00:00',\n", " '2015-03-25 06:00:00', '2015-03-25 07:00:00',\n", " '2015-03-25 08:00:00', '2015-03-25 09:00:00', \n", " ...\n", " '2015-03-30 20:00:00', '2015-03-30 21:00:00',\n", " '2015-03-30 22:00:00', '2015-03-30 23:00:00',\n", " '2015-03-31 00:00:00', '2015-03-31 01:00:00',\n", " '2015-03-31 02:00:00', '2015-03-31 03:00:00',\n", " '2015-03-31 04:00:00', '2015-03-31 05:00:00'],\n", " dtype='datetime64[ns]', length=150, freq='H', tz=None)" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Lets make some random timeseries data\n", "dates = pd.date_range('2015-03-25', periods=150, freq='H')\n", "dates" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Numeric
2015-03-25 00:00:00451
2015-03-25 01:00:0010
2015-03-25 02:00:0018
2015-03-25 03:00:00113
2015-03-25 04:00:00283
\n", "
" ], "text/plain": [ " Numeric\n", "2015-03-25 00:00:00 451\n", "2015-03-25 01:00:00 10\n", "2015-03-25 02:00:00 18\n", "2015-03-25 03:00:00 113\n", "2015-03-25 04:00:00 283" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "time_df = pd.DataFrame(np.random.randint(0, 500, 150), index=dates, columns=[\"Numeric\"])\n", "time_df.head()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Numeric
2015-03-25211.166667
2015-03-26255.041667
2015-03-27241.625000
2015-03-28291.791667
2015-03-29227.625000
2015-03-30212.125000
2015-03-31307.500000
\n", "
" ], "text/plain": [ " Numeric\n", "2015-03-25 211.166667\n", "2015-03-26 255.041667\n", "2015-03-27 241.625000\n", "2015-03-28 291.791667\n", "2015-03-29 227.625000\n", "2015-03-30 212.125000\n", "2015-03-31 307.500000" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# RESAMPLE!\n", "time_df.resample('D', how='mean')" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2014-03-25 23:02:00', '2014-03-25 23:01:00',\n", " '2015-03-25 00:00:00'],\n", " dtype='datetime64[ns]', freq=None, tz=None)" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert weird date formats!\n", "the_worst = ['3/25/2014 23:02:00', '2014-03-25 23:01:00', 'March 25 2015']\n", "pd.to_datetime(the_worst)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Numeric
2015-03-26 00:00:00451
2015-03-26 01:00:0010
2015-03-26 02:00:0018
2015-03-26 03:00:00113
2015-03-26 04:00:00283
\n", "
" ], "text/plain": [ " Numeric\n", "2015-03-26 00:00:00 451\n", "2015-03-26 01:00:00 10\n", "2015-03-26 02:00:00 18\n", "2015-03-26 03:00:00 113\n", "2015-03-26 04:00:00 283" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Shift dates!\n", "time_df.shift(1, freq='D').head()" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Numeric
2015-03-251
2015-03-309
2015-04-054
\n", "
" ], "text/plain": [ " Numeric\n", "2015-03-25 1\n", "2015-03-30 9\n", "2015-04-05 4" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# What if I have missing dates?\n", "missing = pd.to_datetime(['2015-03-25', '2015-03-30', '2015-04-05'])\n", "missing_df = pd.DataFrame(np.random.randint(0, 10, 3), index=missing, columns=[\"Numeric\"])\n", "missing_df" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Numeric
2015-03-251
2015-03-26NaN
2015-03-27NaN
2015-03-28NaN
2015-03-29NaN
2015-03-309
2015-03-31NaN
2015-04-01NaN
2015-04-02NaN
2015-04-03NaN
2015-04-04NaN
2015-04-054
\n", "
" ], "text/plain": [ " Numeric\n", "2015-03-25 1\n", "2015-03-26 NaN\n", "2015-03-27 NaN\n", "2015-03-28 NaN\n", "2015-03-29 NaN\n", "2015-03-30 9\n", "2015-03-31 NaN\n", "2015-04-01 NaN\n", "2015-04-02 NaN\n", "2015-04-03 NaN\n", "2015-04-04 NaN\n", "2015-04-05 4" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "missing_df.asfreq('D')" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Numeric
2015-03-251
2015-03-261
2015-03-271
2015-03-281
2015-03-291
2015-03-309
2015-03-319
2015-04-019
2015-04-029
2015-04-039
2015-04-049
2015-04-054
\n", "
" ], "text/plain": [ " Numeric\n", "2015-03-25 1\n", "2015-03-26 1\n", "2015-03-27 1\n", "2015-03-28 1\n", "2015-03-29 1\n", "2015-03-30 9\n", "2015-03-31 9\n", "2015-04-01 9\n", "2015-04-02 9\n", "2015-04-03 9\n", "2015-04-04 9\n", "2015-04-05 4" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "missing_df.asfreq('D', method='pad')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#XRAY!" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import xray" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([[ 1, 2, 3, 4],\n", " [ 10, 20, 30, 40],\n", " [100, 200, 300, 400]])\n", "Coordinates:\n", " * x (x) |S1 'a' 'b' 'c'\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.array([[1, 2, 3, 4], \n", " [10, 20, 30, 40],\n", " [100, 200, 300, 400]])\n", "dim0_coords = ['a', 'b', 'c']\n", "dim1_coords = ['foo', 'bar', 'baz', 'qux']\n", "da = xray.DataArray(arr, [('x', dim0_coords), ('y', dim1_coords)])\n", "da" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 1, 2, 3, 4],\n", " [ 10, 20, 30, 40],\n", " [100, 200, 300, 400]])" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Arrays!\n", "da.values" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('x', 'y')" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "da.dims" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Coordinates:\n", " * x (x) |S1 'a' 'b' 'c'\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "da.coords" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([[ 1, 2, 3, 4],\n", " [10, 20, 30, 40]])\n", "Coordinates:\n", " * x (x) |S1 'a' 'b'\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# But with some Pandas-like powers!\n", "\n", "# Index by slice\n", "da[0:2]" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([1, 2, 3, 4])\n", "Coordinates:\n", " x |S1 'a'\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Indexing!\n", "da.loc['a']" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([[ 1, 2, 3, 4],\n", " [ 10, 20, 30, 40],\n", " [100, 200, 300, 400]])\n", "Coordinates:\n", " * x (x) |S1 'a' 'b' 'c'\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "da.loc['a':'c']" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([[ 1, 2, 3, 4],\n", " [100, 200, 300, 400]])\n", "Coordinates:\n", " * x (x) |S1 'a' 'c'\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "da.sel(x=['a', 'c'])" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([ 1, 10, 100])\n", "Coordinates:\n", " * x (x) |S1 'a' 'b' 'c'\n", " y |S3 'foo'" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "da.sel(y='foo')" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([1, 2, 3, 4])\n", "Coordinates:\n", " x |S1 'a'\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "da.isel(x=0)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(\n", " array(92.5), \n", " array(1110))" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Do numpy stuff\n", "np.mean(da), np.sum(da)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([[ 0.84147098, 0.90929743, 0.14112001, -0.7568025 ],\n", " [-0.54402111, 0.91294525, -0.98803162, 0.74511316],\n", " [-0.50636564, -0.8732973 , -0.99975584, -0.85091936]])\n", "Coordinates:\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'\n", " * x (x) |S1 'a' 'b' 'c'" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.sin(da)" ] }, { "cell_type": "code", "execution_count": 75, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([[101, 102, 103, 104],\n", " [110, 120, 130, 140],\n", " [200, 300, 400, 500]])\n", "Coordinates:\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'\n", " * x (x) |S1 'a' 'b' 'c'" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Broadcast!\n", "da + 100" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([ 37., 74., 111., 148.])\n", "Coordinates:\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Do all of the above by label\n", "da.mean(dim='x')" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([ 10, 100, 1000])\n", "Coordinates:\n", " * x (x) |S1 'a' 'b' 'c'" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "da.sum(dim='y')" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([[ 1, 2, 3, 4],\n", " [ 10, 20, 30, 40],\n", " [100, 200, 300, 400]])\n", "Coordinates:\n", " * x (x) |S1 'a' 'b' 'c'\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "da" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([ 2.5, 25. , 250. ])\n", "Coordinates:\n", " * x (x) |S1 'a' 'b' 'c'" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Group stuff\n", "da.groupby('x').mean()" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "array([111, 222, 333, 444])\n", "Coordinates:\n", " * y (y) |S3 'foo' 'bar' 'baz' 'qux'" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "da.groupby('y').sum()" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[[ 2.51076277, 1.0696244 , 1.33092369],\n", " [ 1.8807506 , -0.2231198 , -0.24000785]],\n", "\n", " [[-0.67361403, -0.21962662, 0.11517111],\n", " [-0.79717326, 0.50143901, 0.08006242]]])" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.random.randn(2, 2, 3)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "Dimensions: (x: 3, y: 3, z: 2)\n", "Coordinates:\n", " * y (y) |S5 'row_1' 'row_2' 'row_3'\n", " * x (x) |S5 'col_1' 'col_2' 'col_3'\n", " * z (z) |S3 '10s' '20s'\n", "Data variables:\n", " cube (z, x, y) int64 10 10 10 10 10 10 10 10 10 20 20 20 20 20 20 20 ..." ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# DataSets\n", "x_y = np.array([[1, 1.1, 1.2], [2, 2.1, 2.2], [3, 3.1, 3.2]])\n", "z_coords = np.array([\"10s\", \"20s\"])\n", "data_cube = np.array([[[10, 10, 10],\n", " [10, 10, 10],\n", " [10, 10, 10]],\n", " [[20, 20, 20],\n", " [20, 20, 20],\n", " [20, 20, 20]]])\n", "ds = xray.Dataset({\"cube\": ([\"z\", \"x\", \"y\"], data_cube)},\n", " coords={\"z\": z_coords, \n", " \"x\": [\"col_1\", \"col_2\", \"col_3\"],\n", " \"y\": [\"row_1\", \"row_2\", \"row_3\"]})\n", "ds" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "Dimensions: (x: 3, y: 3)\n", "Coordinates:\n", " * y (y) |S5 'row_1' 'row_2' 'row_3'\n", " * x (x) |S5 'col_1' 'col_2' 'col_3'\n", "Data variables:\n", " cube (x, y) int64 30 30 30 30 30 30 30 30 30" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ds.sum(dim=\"z\")" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "Dimensions: (y: 3, z: 2)\n", "Coordinates:\n", " * y (y) |S5 'row_1' 'row_2' 'row_3'\n", " * z (z) |S3 '10s' '20s'\n", "Data variables:\n", " cube (z, y) float64 10.0 10.0 10.0 20.0 20.0 20.0" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ds.mean(dim=\"x\")" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "Dimensions: (x: 3)\n", "Coordinates:\n", " * x (x) |S5 'col_1' 'col_2' 'col_3'\n", "Data variables:\n", " cube (x) int64 90 90 90" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ds.groupby(\"x\").sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#Blaze\n", "\n", "For this demo, we're going to focus on a couple queries we used in the Pandas demo:\n", "```sql\n", "select cut, mean(price)\n", "from diamonds\n", "groupby cut;\n", "\n", "select count(carat)\n", "from diamonds\n", "where price > 1000\n", "group by clarity;\n", "```" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "blaze.expr.expressions.Symbol" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import blaze as bz\n", "\n", "\n", "bz_diamonds = bz.symbol('diamonds', bz.discover(df_diamonds))\n", "type(bz_diamonds)" ] }, { "cell_type": "code", "execution_count": 93, "metadata": { "collapsed": false }, "outputs": [], "source": [ "mean_price = bz.by(bz_diamonds.cut, price=bz_diamonds.price.mean())\n", "clarity_count = bz.by(bz_diamonds[bz_diamonds.price > 1000].clarity,\n", " count=bz_diamonds.carat.count())" ] }, { "cell_type": "code", "execution_count": 88, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cutprice
0Fair4358.757764
1Good3928.864452
2Ideal3457.541970
3Premium4584.257704
4Very Good3981.759891
\n", "
" ], "text/plain": [ " cut price\n", "0 Fair 4358.757764\n", "1 Good 3928.864452\n", "2 Ideal 3457.541970\n", "3 Premium 4584.257704\n", "4 Very Good 3981.759891" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We haven't actually computed anything yet!\n", "# Let's make Pandas compute it. \n", "bz.compute(mean_price, df_diamonds)" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
claritycount
0I1741
1IF1790
2SI113065
3SI29194
4VS18171
5VS212258
6VVS13655
7VVS25066
\n", "
" ], "text/plain": [ " clarity count\n", "0 I1 741\n", "1 IF 1790\n", "2 SI1 13065\n", "3 SI2 9194\n", "4 VS1 8171\n", "5 VS2 12258\n", "6 VVS1 3655\n", "7 VVS2 5066" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bz.compute(clarity_count, df_diamonds)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ok, so what? You made Pandas do a thing we already did. \n", "\n", "### Oh Yeah, what if we want Postgres to compute *the exact same expressions*? \n", "\n", "## WAT" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Blaze/Odo make it easy to move data between containers\n", "# Note that we have an empty table already created\n", "pg_datasource = bz.odo(df_diamonds, \"postgresql://postgres:postgres@localhost/pydata::diamonds\")" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now we're going to use Postgres as our computation engine\n", "result = bz.compute(clarity_count, pg_datasource)\n", "result" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
claritycount
0IF1790
1I1741
2VVS13655
3VS212258
4VS18171
5VVS25066
6SI29194
7SI113065
\n", "
" ], "text/plain": [ " clarity count\n", "0 IF 1790\n", "1 I1 741\n", "2 VVS1 3655\n", "3 VS2 12258\n", "4 VS1 8171\n", "5 VVS2 5066\n", "6 SI2 9194\n", "7 SI1 13065" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# I don't want a selectable. I want a DataFrame\n", "# odo again\n", "bz.odo(bz.compute(clarity_count, pg_datasource), pd.DataFrame)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can use any SQL supported by SQLAlchemy as your computation. It also supports Python lists, Spark DataFrames, MongoDB, Numpy arrays..." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# bcolz!" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import bcolz\n", "\n", "dc = bcolz.ctable.fromdataframe(df_diamonds)" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "carat : carray((53940,), float64)\n", " nbytes: 421.41 KB; cbytes: 468.10 KB; ratio: 0.90\n", " cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n", "[ 0.23 0.21 0.23 ..., 0.7 0.86 0.75]\n", "cut : carray((53940,), |S9)\n", " nbytes: 474.08 KB; cbytes: 315.96 KB; ratio: 1.50\n", " cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n", "['Ideal' 'Premium' 'Good' ..., 'Very Good' 'Premium' 'Ideal']\n", "color : carray((53940,), |S1)\n", " nbytes: 52.68 KB; cbytes: 128.00 KB; ratio: 0.41\n", " cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n", "['E' 'E' 'E' ..., 'D' 'H' 'D']\n", "clarity : carray((53940,), |S4)\n", " nbytes: 210.70 KB; cbytes: 256.00 KB; ratio: 0.82\n", " cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n", "['SI2' 'SI1' 'VS1' ..., 'SI1' 'SI2' 'SI2']\n", "depth : carray((53940,), float64)\n", " nbytes: 421.41 KB; cbytes: 407.65 KB; ratio: 1.03\n", " cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n", "[ 61.5 59.8 56.9 ..., 62.8 61. 62.2]\n", "table : carray((53940,), float64)\n", " nbytes: 421.41 KB; cbytes: 317.15 KB; ratio: 1.33\n", " cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n", "[ 55. 61. 65. ..., 60. 58. 55.]\n", "price : carray((53940,), int64)\n", " nbytes: 421.41 KB; cbytes: 279.99 KB; ratio: 1.51\n", " cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n", "[ 326 326 327 ..., 2757 2757 2757]\n", "x : carray((53940,), float64)\n", " nbytes: 421.41 KB; cbytes: 468.10 KB; ratio: 0.90\n", " cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n", "[ 3.95 3.89 4.05 ..., 5.66 6.15 5.83]\n", "y : carray((53940,), float64)\n", " nbytes: 421.41 KB; cbytes: 467.91 KB; ratio: 0.90\n", " cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n", "[ 3.98 3.84 4.07 ..., 5.68 6.12 5.87]\n", "z : carray((53940,), float64)\n", " nbytes: 421.41 KB; cbytes: 468.20 KB; ratio: 0.90\n", " cparams := cparams(clevel=5, shuffle=True, cname='blosclz')\n", "[ 2.43 2.31 2.31 ..., 3.56 3.74 3.64]" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dc.cols" ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total size for the ctable: 3.49322795868 MB\n", "Compression ratio the ctable: 1.03081835096\n" ] } ], "source": [ "dsize = dc.cbytes / 2**20.\n", "print(\"Total size for the ctable: {} MB\".format(dsize))\n", "print(\"Compression ratio the ctable: {}\".format((dc.nbytes / float(dc.cbytes))))" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([(0.21, 'Premium', 'E', 'SI1', 59.8, 61.0, 326, 3.89, 3.84, 2.31),\n", " (0.29, 'Premium', 'I', 'VS2', 62.4, 58.0, 334, 4.2, 4.23, 2.63),\n", " (0.22, 'Premium', 'F', 'SI1', 60.4, 61.0, 342, 3.88, 3.84, 2.33),\n", " ...,\n", " (0.71, 'Premium', 'F', 'SI1', 59.8, 62.0, 2756, 5.74, 5.73, 3.43),\n", " (0.72, 'Premium', 'D', 'SI1', 62.7, 59.0, 2757, 5.69, 5.73, 3.58),\n", " (0.86, 'Premium', 'H', 'SI2', 61.0, 58.0, 2757, 6.15, 6.12, 3.74)], \n", " dtype=[('carat', ' 1)\"]" ] }, { "cell_type": "code", "execution_count": 106, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# We can do the same thing with the ctable on disk!\n", "diskdc = dc.copy(rootdir='diamonds')" ] }, { "cell_type": "code", "execution_count": 107, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([(0.7, 'Ideal', 'E', 'SI1', 62.5, 57.0, 2757, 5.7, 5.72, 3.57),\n", " (0.7, 'Ideal', 'G', 'VS2', 61.6, 56.0, 2757, 5.7, 5.67, 3.5),\n", " (0.74, 'Ideal', 'G', 'SI1', 61.6, 55.0, 2760, 5.8, 5.85, 3.59), ...,\n", " (0.71, 'Ideal', 'G', 'VS1', 61.4, 56.0, 2756, 5.76, 5.73, 3.53),\n", " (0.72, 'Ideal', 'D', 'SI1', 60.8, 57.0, 2757, 5.75, 5.76, 3.5),\n", " (0.75, 'Ideal', 'D', 'SI2', 62.2, 55.0, 2757, 5.83, 5.87, 3.64)], \n", " dtype=[('carat', ' 1000)\"]" ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "diamonds/\n", " __attrs__\n", " __rootdirs__\n", " carat/\n", " __attrs__\n", " data/\n", " __0.blp\n", " __1.blp\n", " meta/\n", " sizes\n", " storage\n", " clarity/\n", " __attrs__\n", " data/\n", " __0.blp\n", " meta/\n", " sizes\n", " storage\n", " color/\n", " __attrs__\n", " data/\n", " __0.blp\n", " meta/\n", " sizes\n", " storage\n", " cut/\n", " __attrs__\n", " data/\n", " __0.blp\n", " __1.blp\n", " meta/\n", " sizes\n", " storage\n", " depth/\n", " __attrs__\n", " data/\n", " __0.blp\n", " __1.blp\n", " meta/\n", " sizes\n", " storage\n", " price/\n", " __attrs__\n", " data/\n", " __0.blp\n", " __1.blp\n", " meta/\n", " sizes\n", " storage\n", " table/\n", " __attrs__\n", " data/\n", " __0.blp\n", " __1.blp\n", " meta/\n", " sizes\n", " storage\n", " x/\n", " __attrs__\n", " data/\n", " __0.blp\n", " __1.blp\n", " meta/\n", " sizes\n", " storage\n", " y/\n", " __attrs__\n", " data/\n", " __0.blp\n", " __1.blp\n", " meta/\n", " sizes\n", " storage\n", " z/\n", " __attrs__\n", " data/\n", " __0.blp\n", " __1.blp\n", " meta/\n", " sizes\n", " storage\n" ] } ], "source": [ "import os\n", "\n", "for root, dirs, files in os.walk('diamonds'):\n", " level = root.replace('diamonds', '').count(os.sep)\n", " indent = ' ' * 4 * (level)\n", " print('{}{}/'.format(indent, os.path.basename(root)))\n", " subindent = ' ' * 4 * (level + 1)\n", " for f in files:\n", " print('{}{}'.format(subindent, f))" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Dask!" ] }, { "cell_type": "code", "execution_count": 109, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dask.array" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import dask.array as da\n", "import dask.dataframe as dd\n", "import dask.bag as db\n", "\n", "d_arr = da.from_array(np.random.randn(100000), chunks=100)\n", "d_arr" ] }, { "cell_type": "code", "execution_count": 110, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dask.array" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d_arr.sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Wait, what happened? Why didn't I get back an integer? \n", "\n", "Because dask is lazily evaluated- we must `compute` it!" ] }, { "cell_type": "code", "execution_count": 111, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "224.03863985322081" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d_arr.sum().compute()" ] }, { "cell_type": "code", "execution_count": 112, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.0022403863985322081" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d_arr.mean().compute()" ] }, { "cell_type": "code", "execution_count": 113, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "4.3235191229285963" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d_arr.max().compute()" ] }, { "cell_type": "code", "execution_count": 114, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "cut\n", "Fair 4358.757764\n", "Good 3928.864452\n", "Ideal 3457.541970\n", "Premium 4584.257704\n", "Very Good 3981.759891\n", "Name: price, dtype: float64" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import dask.dataframe as dd\n", "\n", "ddf = dd.read_csv('diamonds.csv')\n", "ddf.groupby('cut')['price'].mean().compute()" ] }, { "cell_type": "code", "execution_count": 115, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "212135217" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ddf['price'].sum().compute()" ] }, { "cell_type": "code", "execution_count": 116, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "clarity\n", "I1 675\n", "IF 1042\n", "SI1 9978\n", "SI2 8118\n", "VS1 5702\n", "VS2 8647\n", "VVS1 2108\n", "VVS2 3146\n", "Name: carat, dtype: int64" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ddf[ddf['price'] > 1000].groupby('clarity')['carat'].count().compute()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We're not quite feature complete yet..." ] }, { "cell_type": "code", "execution_count": 117, "metadata": { "collapsed": false }, "outputs": [ { "ename": "AttributeError", "evalue": "'DataFrame' object has no attribute 'sort'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mddf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mddf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cut'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Ideal'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Premium'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'price'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/Users/robstory/src/pydataseattle2015/.env/lib/python2.7/site-packages/dask/dataframe/core.pyc\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 585\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 587\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__dir__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'sort'" ] } ], "source": [ "ddf[ddf['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]" ] }, { "cell_type": "code", "execution_count": 118, "metadata": { "collapsed": true }, "outputs": [], "source": [ "diamonds_b = db.from_sequence(diamonds)" ] }, { "cell_type": "code", "execution_count": 119, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "18823" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tz.thread_last(diamonds_b, (tz.map, lambda x: x['price']), max)" ] }, { "cell_type": "code", "execution_count": 120, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'I1': 675,\n", " 'IF': 1042,\n", " 'SI1': 9978,\n", " 'SI2': 8118,\n", " 'VS1': 5702,\n", " 'VS2': 8647,\n", " 'VVS1': 2108,\n", " 'VVS2': 3146}" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tzc.pipe(diamonds_b, \n", " tzc.filter(lambda r: r['price'] > 1000),\n", " tzc.map(lambda r: (r['clarity'],)),\n", " tzc.countby(lambda r: r[0]),\n", " dict)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 }