{
 "metadata": {
  "name": "Distributed Aggregate and Join"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%pylab inline"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline].\n",
        "For more information, type 'help(pylab)'.\n"
       ]
      }
     ],
     "prompt_number": 42
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import pandas as pd\n",
      "import pylab as pl"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 43
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.utils import murmurhash3_32"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 24
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "n_groups = 100\n",
      "n_samples = 100000\n",
      "\n",
      "# Seeded generator so the synthetic data is identical on every fresh run;\n",
      "# re-run the notebook to refresh the saved outputs.\n",
      "rng = np.random.RandomState(0)\n",
      "\n",
      "# Integer group ids drawn from [0, n_groups), stored as int32.\n",
      "group_id = np.asarray(rng.randint(n_groups, size=n_samples), dtype=np.int32)\n",
      "# One standard-normal value per sample.\n",
      "data = rng.normal(size=n_samples)\n",
      "\n",
      "df = pd.DataFrame({'group_id': group_id, 'data': data})"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 25
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Time the groupby and the sum as two separate steps.\n",
      "%time grouped = df.groupby('group_id')\n",
      "%time aggregate = grouped.sum()\n",
      "\n",
      "# Show the first 10 rows of the per-group sums.\n",
      "aggregate[:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 253 \u00b5s, sys: 16 \u00b5s, total: 269 \u00b5s\n",
        "Wall time: 271 \u00b5s\n",
        "CPU times: user 4.9 ms, sys: 474 \u00b5s, total: 5.37 ms\n",
        "Wall time: 4.86 ms\n"
       ]
      },
      { "html": [ "
\n", " | data | \n", "
---|---|
group_id | \n", "\n", " |
0 | \n", "4.974845 | \n", "
1 | \n", "-50.376857 | \n", "
2 | \n", "5.168091 | \n", "
3 | \n", "6.355739 | \n", "
4 | \n", "22.481601 | \n", "
5 | \n", "-0.748311 | \n", "
6 | \n", "-73.982681 | \n", "
7 | \n", "4.263864 | \n", "
8 | \n", "43.841003 | \n", "
9 | \n", "14.636269 | \n", "