{
 "metadata": {
  "name": "",
  "signature": "sha256:750f8695699a0452431af2f4d9c13fabab5677008befb250f824dfba617f0834"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Timing basic pandas data-frame operations\n",
      "\n",
      "Times filter, sort, new-column, aggregate and join operations with `%time` on a random frame `d` of `n` rows, whose integer key `x` is drawn from `[0, m)` and whose value `y` is a random float.\n",
      "\n",
      "The `%time` outputs stored with each cell come from an earlier run; re-execute the notebook (Restart Kernel, Run All) to obtain timings for your own machine and library versions."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import numpy as np"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Sizes are converted to int: current NumPy raises TypeError for a float `size`,\n",
      "# and np.arange(<float>) would yield float keys that do not match the int column d.x.\n",
      "n = int(10e6)  # number of rows in the main frame d\n",
      "m = int(1e6)   # keys x are drawn from [0, m); the lookup frame dm has m rows\n",
      "\n",
      "# Fixed seed so that every run benchmarks the same random data.\n",
      "np.random.seed(123)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%time d = pd.DataFrame({\"x\": np.random.randint(0, m, n), \"y\": np.random.random(n)})"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 336 ms, sys: 144 ms, total: 480 ms\n",
        "Wall time: 482 ms\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### Filter"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%time dd = d[(d.x>=10) & (d.x<20)]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 96 ms, sys: 20 ms, total: 116 ms\n",
        "Wall time: 115 ms\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### Sort"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# DataFrame.sort() was removed from pandas; sort_values() is its replacement.\n",
      "%time dd = d.sort_values(\"x\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 3.49 s, sys: 264 ms, total: 3.76 s\n",
        "Wall time: 3.77 s\n"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### New column"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Work on a copy so that the base frame d is left untouched for the later sections.\n",
      "%time dd = d.copy()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 24 ms, sys: 72 ms, total: 96 ms\n",
        "Wall time: 96.9 ms\n"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%time dd[\"y2\"] = 2*dd[\"y\"]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 60 ms, sys: 88 ms, total: 148 ms\n",
        "Wall time: 64.9 ms\n"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### Aggregate"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%time dd = d.groupby(\"x\")[\"y\"].mean()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 1.3 s, sys: 156 ms, total: 1.46 s\n",
        "Wall time: 1.46 s\n"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "type(dd)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 9,
       "text": [
        "pandas.core.series.Series"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%time dd = d.groupby(\"x\", as_index = False)[\"y\"].mean()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 1.39 s, sys: 140 ms, total: 1.53 s\n",
        "Wall time: 1.53 s\n"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "type(dd)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 11,
       "text": [
        "pandas.core.frame.DataFrame"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The same two aggregations, repeated on `dk`, a copy of `d` sorted by the grouping key `x`."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# sort_index(by=...) was removed from pandas; sort_values() is its replacement.\n",
      "%time dk = d.sort_values(\"x\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 3.48 s, sys: 240 ms, total: 3.72 s\n",
        "Wall time: 3.72 s\n"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%time dd = dk.groupby(\"x\")[\"y\"].mean()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 284 ms, sys: 120 ms, total: 404 ms\n",
        "Wall time: 402 ms\n"
       ]
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%time dd = dk.groupby(\"x\", as_index = False)[\"y\"].mean()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 352 ms, sys: 132 ms, total: 484 ms\n",
        "Wall time: 485 ms\n"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### Join"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Lookup frame: each key 0..m-1 exactly once, in random order.\n",
      "%time dm = pd.DataFrame({\"x\": np.random.permutation(np.arange(m))})"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 176 ms, sys: 4 ms, total: 180 ms\n",
        "Wall time: 179 ms\n"
       ]
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# No `on=` given: pd.merge joins on the column the two frames share, x.\n",
      "%time dd = pd.merge(d, dm)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 5.38 s, sys: 504 ms, total: 5.88 s\n",
        "Wall time: 5.89 s\n"
       ]
      }
     ],
     "prompt_number": 16
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The same join, repeated with both inputs sorted by `x` (`dk` and `dmk`)."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%time dmk = dm.sort_values(\"x\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 212 ms, sys: 4 ms, total: 216 ms\n",
        "Wall time: 217 ms\n"
       ]
      }
     ],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%time dd = pd.merge(dk, dmk)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 1.78 s, sys: 380 ms, total: 2.16 s\n",
        "Wall time: 2.16 s\n"
       ]
      }
     ],
     "prompt_number": 18
    }
   ],
   "metadata": {}
  }
 ]
}