{ "metadata": { "name": "", "signature": "sha256:a9496818306c490d4f6055f124bcc84256c26db21a6b9c0ed646d259db719db3" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Back: [Whirl-torial](Whirl-torial.ipynb)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from __future__ import division, print_function" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from IPython.display import HTML, display as disp, Audio\n", "with open(\"../css/css.css\", \"r\") as f:\n", " styles = f.read()\n", "HTML(styles)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "import os\n", "import operator\n", "import json\n", "import tokenize\n", "import re\n", "\n", "from itertools import imap, ifilter, islice, cycle\n", "from functools import partial\n", "\n", "import sh\n", "import requests\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "import numpy as np\n", "from numpy.random import randn, randint, rand, choice\n", "\n", "import pandas as pd\n", "from pandas import DataFrame, Series, Index\n", "from pandas.compat import map, StringIO\n", "import pandas.util.testing as tm" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pd.options.display.max_rows = 10\n", "pd.options.display.max_columns = 7\n", "\n", "try:\n", " from mpltools import style\n", " style.use('ggplot')\n", "except ImportError:\n", " pass\n", "\n", "# because of our bg color\n", "plt.rc('text', color='white')\n", "plt.rc('axes', labelcolor='white')\n", "plt.rc('xtick', color='white')\n", "plt.rc('ytick', color='white')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "%matplotlib inline" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "def insert_page(url):\n", " \"\"\"Embed a webpage in the notebook\"\"\"\n", " disp(HTML('' % url))\n", "\n", "\n", "def read_text(*args, **kwargs):\n", " \"\"\"Simple text reader because I don't like typing ``with`` every time\"\"\"\n", " \n", " with open(*args, **kwargs) as f:\n", " return f.read()\n", " \n", " \n", "def highlight(filename, style='fruity'):\n", " \"\"\"Syntax highlight a file based on its extension\"\"\"\n", " \n", " from pygments import highlight as h\n", " from pygments.lexers import guess_lexer_for_filename\n", " from pygments.formatters import HtmlFormatter\n", "\n", " code = read_text(filename, mode='rt')\n", "\n", " formatter = HtmlFormatter(style=style)\n", " lexer = guess_lexer_for_filename(filename, code)\n", " disp(HTML('{1}'.format(\n", " formatter.get_style_defs('.highlight'),\n", " h(code, lexer, formatter))))\n", " \n", " \n", "def gen_frames(n, size, f=randn):\n", " \"\"\"Generate `n` frames of size `size` using the function `f`.\"\"\"\n", " return (DataFrame(f(*sz)) for sz in [size] * n)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "New Features since v0.11.0" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "(or Interactive Release Notes)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# v0.11\n", "\n", "* indexers ``loc``/``at``, ``iloc``/``iat``\n", "* all dtypes allowed\n", "* now we use [``numexpr``](https://github.com/pydata/numexpr) to evaluate arithmetic expressions where possible (with objects whose ``len`` is `>` 10k elements)" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "``numexpr`` speedups" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "More apparent for long(ish) expressions with large(ish) arrays" ] }, { "cell_type": "code", "collapsed": false, "input": [ "x, y, z, w = gen_frames(4, size=(1e6, 20))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "def show_faster(num, denom):\n", " ratio = num / denom\n", " disp(HTML('numexpr is %.2g× as fast' % ratio))\n", " \n", " \n", "def biggish():\n", " disp(HTML('biggish'))\n", " with tm.use_numexpr(True):\n", " Y = %timeit -r 1 -n 1 -o x + y + z + w ** 3\n", "\n", " with tm.use_numexpr(False):\n", " N = %timeit -r 1 -n 1 -o x + y + z + w ** 3\n", "\n", " show_faster(N.best, Y.best)\n", " \n", " \n", "def smallish():\n", " disp(HTML('smallish'))\n", " with tm.use_numexpr(False):\n", " Y = %timeit -r 1 -n 1 -o x + y\n", "\n", " with tm.use_numexpr(False):\n", " N = %timeit -r 1 -n 1 -o x + y\n", "\n", " show_faster(N.best, Y.best)\n", "\n", "\n", "biggish()\n", "smallish()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# v0.12\n", "\n", "* [``read_html``](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_html.html#pandas.read_html)\n", "* [``read_json``](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html#pandas.read_json)\n", "* [``read_csv``](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html#pandas.read_csv) accepts S3 URLs\n", "* ``DataFrame.replace()`` with regular expressions\n", "* ``Series.str`` iteration\n", "* ``MultiIndex`` column reading and writing in ``read_csv``\n", "* ``GroupBy.filter()``" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "[``read_html``](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.html.read_html.html#pandas.io.html.read_html)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "insert_page(\"http://www.fdic.gov/bank/individual/failed/banklist.html\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "url = '../data/banklist.html'\n", "dfs = pd.read_html(url) # returns a list of all tables found on the page" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "assert len(dfs) == 1, \"you're wrong about me\"\n", "df = dfs.pop()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# not sure where those extra columns are from ...\n", "df" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Select tables based on class" ] }, { "cell_type": "code", "collapsed": false, "input": [ "dat_url = 'tmp.html'\n", "with open(dat_url, 'w') as f:\n", " DataFrame(randn(2, 2)).to_html(f, classes=['first'])\n", " f.write('\\n\\n')\n", " DataFrame(randn(2, 2)).to_html(f, classes=['second'])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "highlight(dat_url)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df, = pd.read_html(dat_url, attrs={'class': 'first'}, index_col=0)\n", "df" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "dfs = pd.read_html(dat_url, index_col=0)\n", "\n", "for df in dfs:\n", " disp(df)\n", "# not really a way to tell which table is which; ordered by appearance in HTML" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "top_url = 'http://www.tylervigen.com'\n", "url = 'http://www.tylervigen.com/view_correlation?id=1703'" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "insert_page(top_url)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "insert_page(url)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "raw = requests.get(url).text\n", "match = r'Divorce rate in Maine'\n", "dfs = pd.read_html(raw, match=match, header=0, index_col=0)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": true, "input": [ "dfs[-1]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# get rid of junk columns\n", "df = dfs[-1].dropna(how='all', axis=(0, 1)).T\n", "\n", "# better names\n", "df.columns = ['mn_divorce_rate', 'per_capita_marg']\n", "\n", "# rename generic index name to year\n", "df = df.reset_index().rename(columns={'index': 'year'})\n", "\n", "# make years integers\n", "df = df.convert_objects(convert_numeric=True)\n", "df" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "def blacken_legend_text(leg):\n", " for t in leg.get_texts():\n", " t.set_color('k')\n", " \n", " \n", "fig, (ax, ax2) = plt.subplots(2, 1, figsize=(8, 6))\n", "\n", "# maine divorces\n", "ln = ax.plot(df.mn_divorce_rate.values, r'ro-', label='Divorce Rate / 1000 People')\n", "ax.set_xticklabels(df.year)\n", "ax.set_xlabel('Year')\n", "ax.set_ylabel(ln[0].get_label())\n", "\n", "# butter eating\n", "axt = ax.twinx()\n", "lt = axt.plot(df.per_capita_marg.values, r'bo-', label='Per Capita Lbs of Margarine')\n", "axt.set_ylabel(lt[0].get_label())\n", "\n", "# scatter plot\n", "ax2.scatter(df.mn_divorce_rate.values, df.per_capita_marg.values, s=100)\n", "ax2.set_xlabel('MN Divorce Rate')\n", "ax2.set_ylabel('Margarine')\n", "ax2.set_title(r'Divorce vs. Margarine, $r = %.2g$' % df.mn_divorce_rate.corr(df.per_capita_marg))\n", "ax2.axis('tight')\n", "\n", "# legend madness\n", "lns = ln + lt\n", "leg = ax.legend(lns, [l.get_label() for l in lns], loc=0)\n", "blacken_legend_text(leg)\n", "\n", "fig.tight_layout()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "``DataFrame.replace()`` with regular expressions" ] }, { "cell_type": "code", "collapsed": false, "input": [ "tips = pd.read_csv('s3://nyqpug/tips.csv')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# add some random lower cased versions of yes and no\n", "nrows = len(tips)\n", "tips.loc[(rand(nrows) > 0.5) & (tips.smoker == 'Yes'), 'smoker'] = 'yes'\n", "tips.loc[(rand(nrows) > 0.5) & (tips.smoker == 'No'), 'smoker'] = 'no'" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": true, "input": [ "tips.smoker.value_counts().plot(kind='bar')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# sanity check\n", "tips.smoker.value_counts()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": true, "input": [ "repd = tips.replace(regex={'smoker': {'[yY]es': True, '[nN]o': False}})\n", "repd" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": true, "input": [ "repd_all = tips.replace(regex={'[yY]es': True, '[nN]o': False})\n", "repd_all" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "[``read_json``](http://pandas.pydata.org/pandas-docs/stable/io.html#io-json-reader) (simple)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "jsfile = 'data.json'" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "%%writefile $jsfile\n", "{\n", " \"name\": [\"Bob Jones\", \"Karen Smith\"],\n", " \"age\": [28, 26],\n", " \"gender\": [\"M\", \"F\"]\n", "}" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_json(jsfile) # no problemo" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# can also use keys as the rows instead of columns\n", "pd.read_json(jsfile, orient='index')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "%%writefile $jsfile\n", "{\n", " \"region\": {\n", " \"Canada\": {\n", " \"name\": \"Bob Jones\",\n", " \"age\": 28,\n", " \"gender\": \"M\"\n", " },\n", " \"USA\": {\n", " \"name\": \"Karen Smith\",\n", " \"age\": 26,\n", " \"gender\": \"F\"\n", " }\n", " }\n", "}" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "disp(pd.read_json(jsfile, orient='records'))\n", "disp(Audio(os.path.join(os.pardir, 'mp3', 'w.mp3'), autoplay=True))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# disp(Audio(os.path.join(os.pardir, 'mp3', 'c.mp3'), autoplay=True))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "``read_json`` (not so simple)" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "pandas plays nicely with other libraries" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data = read_text(jsfile)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": true, "input": [ "# avoid read_json entirely :)\n", "# get transposed\n", "df = DataFrame(json.loads(data)[\"region\"])\n", "df = df.T.convert_objects(convert_numeric=True)\n", "df" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df.dtypes" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "jq = sh.jq.bake('-M') # -M disables colorizing" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rule = \"(.region)\" # this rule is essentially data[\"region\"]\n", "out = jq(rule, _in=data).stdout\n", "res = pd.read_json(out, orient='index')\n", "res" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "res.dtypes" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Let's try something a bit hairier..." ] }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "adapted from [this StackOverflow question](http://stackoverflow.com/q/21494030/564538)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%%writefile $jsfile\n", "{\n", " \"intervals\": [\n", " {\n", " \"pivots\": \"Jane Smith\",\n", " \"series\": [\n", " {\n", " \"interval_id\": 0,\n", " \"p_value\": 1\n", " },\n", " {\n", " \"interval_id\": 1,\n", " \"p_value\": 1.1162791357932633e-8\n", " },\n", " {\n", " \"interval_id\": 2,\n", " \"p_value\": 0.0000028675012051504467\n", " }\n", " ]\n", " },\n", " {\n", " \"pivots\": \"Bob Smith\",\n", " \"series\": [\n", " {\n", " \"interval_id\": 0,\n", " \"p_value\": 1\n", " },\n", " {\n", " \"interval_id\": 1,\n", " \"p_value\": 1.1162791357932633e-8\n", " },\n", " {\n", " \"interval_id\": 2,\n", " \"p_value\": 0.0000028675012051504467\n", " }\n", " ]\n", " }\n", " ]\n", "}" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "%%writefile rule.txt\n", "[{pivots: .intervals[].pivots, \n", " interval_id: .intervals[].series[].interval_id,\n", " p_value: .intervals[].series[].p_value}] | unique" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "data = read_text(jsfile)\n", "\n", "# check out http://stedolan.github.io/jq/manual for more details on these rules\n", "rule = read_text('rule.txt')\n", "out = jq(rule, _in=data).stdout\n", "js = json.loads(out)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "js[:2]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "res = pd.read_json(out)\n", "res" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "res.dtypes" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# v0.13\n", "* ``DataFrame.isin()``\n", "* [``str.extract()``](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.strings.StringMethods.extract.html#pandas.core.strings.StringMethods.extract)\n", "* Experimental Features\n", " * ``query``/``eval``\n", " * ``msgpack`` IO\n", " * ``Google BigQuery`` IO" ] }, { "cell_type": "code", "collapsed": false, "input": [ "names = list(filter(None, read_text('names.txt').split('\\n')))\n", "names" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df = DataFrame(dict(zip(['math', 'physics'], \n", " [names[:5], names[-5:]])))\n", "df" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df.isin(['Brook', 'Bradley', 'Richie', 'Sarah'])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "``str.extract()``" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!grep -P '^[a-zA-Z_]\\w*$' /usr/share/dict/cracklib-small | head -10" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "def gen_filenames(n, pattern='%d_%s', dict_file='/usr/share/dict/words'):\n", " matches_id = partial(re.match, '^%s$' % tokenize.Name)\n", " interpolator = partial(operator.mod, pattern)\n", " \n", " with open(dict_file, 'rt') as f:\n", " only_valid_names = ifilter(matches_id, cycle(f))\n", " n_matches = islice(only_valid_names, 0, n)\n", " \n", " for el in imap(interpolator, enumerate(imap(str.strip, n_matches))):\n", " yield el" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "vids = Series(list(gen_filenames(30, pattern='%d_%s.mp4')))\n", "vids" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "ext = vids.str.extract('(?P\\d+)_(?P.+)')\n", "ext" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "ext = ext.convert_objects(convert_numeric=True)\n", "disp(ext.dtypes)\n", "ext" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "v0.13 Experimental Features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* ``query``/``eval``\n", "* msgpack IO\n", "* Google BigQuery IO" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "[``query``](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.query.html#pandas.DataFrame.query)/[``eval``](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.eval.html#pandas.DataFrame.eval)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "n = 1e6\n", "df = DataFrame({'a': randint(10, size=n),\n", " 'b': rand(n),\n", " 'c': rand(n)})\n", "df.head()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "sub = df.query('1 <= a <= 5 and 0.1 < b < 0.4 and 0.5 <= c <= 0.95')\n", "sub" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "qtime = %timeit -o df.query('1 <= a <= 5 and 0.1 < b < 0.4 and 0.5 <= c <= 0.95')\n", "pytime = %timeit -o df.loc[(1 <= df.a) & (df.a <= 5) & (0.1 <= df.b) & (df.b <= 0.4) & (0.5 <= df.c) & (df.c <= 0.9)]\n", "\n", "print('query is %.2gx faster than pure Python' % (pytime.best / qtime.best))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "A, B, C, D = (DataFrame(randn(n, 40)) for _ in range(4))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "qtime = %timeit -r 1 -n 1 -o pd.eval('A + B * 2 + C / D ** 3 * B / C + A ** 10 < A ** 5')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pytime = %timeit -r 1 -n 1 -o A + B * 2 + C / D ** 3 * B / C + A ** 10 < A ** 5\n", "print('query is %.2gx faster than pure Python' % (pytime.best / qtime.best))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Local variables" ] }, { "cell_type": "code", "collapsed": false, "input": [ "a = rand()\n", "df.query('a <= @a <= b')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "[``MessagePack``](http://msgpack.org/) IO (``to_msgpack``/``read_msgpack``)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### MessagePack is like JSON but smaller and it's a binary format.\n", "\n", "#### Support for this is experimental.\n", "#### Serialization of large objects is not recommended as they are converted to Python ``list``s before writing" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df.head(2).to_msgpack()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "s = pd.to_msgpack(None, # we want the raw bytes output so pass None\n", " Series(randn(2)), \n", " ['yep', 'a', 'list'], \n", " randn(2), \n", " {'a': 2, 'b': 3})\n", "sio = StringIO(s)\n", "pd.read_msgpack(sio)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Google BigQuery IO" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You need to\n", "\n", "```sh\n", "pip install bigquery\n", "```\n", "\n", "as well as set up a Google BigQuery account before this will work\n", "\n", "#### Data can be found [here](https://bigquery.cloud.google.com/table/publicdata:samples.github_timeline)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "highlight('query.sql')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "query = read_text('query.sql')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_gbq(query, project_id='metal-lantern-572')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notice the `NaT`s and `NaN`s. Those are where other repositories have valid pull request dates" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = df.rename(columns=lambda x: x.replace('payload_pull_request_', ''))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df.dtypes\n", "df" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df['created_at'] = pd.to_datetime(df.created_at)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": true, "input": [ "# set the index to the datetime column just created\n", "df = df.set_index('created_at').sort_index()\n", "df" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": true, "input": [ "s = df.additions" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "def remove_time(ax):\n", " replacer = lambda x: x.get_text().replace(' 00:00:00', '')\n", " ax.set_xticklabels(list(map(replacer, ax.get_xticklabels())))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "r = s.resample('B', how='sum')\n", "r.index.name = 'Pull Request Day'\n", "ax = r.plot(kind='bar', figsize=(18, 5))\n", "\n", "remove_time(ax)\n", "ax.set_ylabel('Pull Request Additions per Business Day')\n", "ax.get_figure().autofmt_xdate()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Non user facing but worth mentioning:" ] }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Jeff Reback's refactor of ``Series`` to use composition instead of inheriting from ``numpy.ndarray``. Bravo!" ] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "v0.14 (soon to be released)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* ``MultiIndex`` slicing\n", "* ``nlargest``/``nsmallest``\n", "* hexbin, pie, and table plotting" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Prelude to `MultiIndex` slicing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "``MultiIndex`` Slicing" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def channel_index(name, nchannels):\n", " return list(zip(np.repeat(name, nchannels), range(nchannels)))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# simulate our EEG data set\n", "\n", "fs = 256 # sampling rate\n", "neeg = 8 # number of EEG channels\n", "nex = 4 # number of auxiliary channels\n", "nsensors = 2 # number of gsr channels\n", "\n", "eeg_chan = channel_index('eeg', neeg)\n", "ex_chan = channel_index('ex', nex)\n", "sens_chan = channel_index('gsr', nsensors)\n", "\n", "disp(eeg_chan)\n", "disp(ex_chan)\n", "disp(sens_chan)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "columns = pd.MultiIndex.from_tuples(eeg_chan + ex_chan + sens_chan,\n", " names=['signal', 'channel'])\n", "\n", "# 10 seconds of fake data\n", "df = pd.DataFrame(np.random.randn(fs * 10, columns.labels[0].size), columns=columns)\n", "\n", "# add in some nans (e.g., a person moved around during these samples)\n", "df.loc[rand(len(df)) < 0.20, 'eeg'] = np.nan\n", "df.head()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# simulate a stimulus marker\n", "df['stim'] = np.sort(randint(10, size=len(df)))\n", "df" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df.loc[:, np.s_[('ex', 'stim'), :]].head()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# the EX and STIM channels where EEG channels 0 and 1 are not null\n", "row_idx = df.eeg[[0, 1]].notnull().all(axis=1)\n", "col_idx = np.s_[('ex', 'stim'), :]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "col_idx" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "res = df.loc[row_idx, col_idx]\n", "res" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# use np.s_ to construct slices\n", "assert slice(None, 1000, 2) == np.s_[:1000:2] # Which would you prefer?\n", "assert slice(900, None, -1) == np.s_[900::-1]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The expression\n", "\n", "```python\n", "np.s_[('ex', 'stim'), :]\n", "```\n", "\n", "says select `'ex'` and `'stim'` from the first level of the `MultiIndex` and the `:` as saying give me all columns from the second level" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "``nlargest``/``nsmallest``" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "If $n << $ ``len(s)`` you'll see a performance improvement. " ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Idea: Don't sort the whole array when you only need 5 values because" ] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "" ] }, { "cell_type": "code", "collapsed": false, "input": [ "s = Series(randn(1000000), name='a')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "a = %timeit -o s.nlargest(5) # 5 << 1,000,000\n", "b = %timeit -o s.order(ascending=False).head()\n", "print('nlargest is %.2gx faster than order + head' % (b.best / a.best))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "a = %timeit -o s.nsmallest(5)\n", "b = %timeit -o s.order().head()\n", "print('nsmallest is %.2gx faster than order + head' % (b.best / a.best))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Thanks!" ] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Questions?" ] } ], "metadata": {} } ] }