{ "metadata": { "name": "", "signature": "sha256:0f74a27bb1c2a1e34928aa2e82af83e7a6b486b9a514117b1bc5d7b5eb626d4b" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Playing with Blaze" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from blaze import *\n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Let's play with Kaggle data" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "This is taken from the Click-Through Rate Prediction\n", "(http://www.kaggle.com/c/avazu-ctr-prediction)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "! head train.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21\r", "\r\n", "1000009418151094273,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,ddd2926e,44956a24,1,2,15706,320,50,1722,0,35,-1,79\r", "\r\n", "10000169349117863715,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,96809ac8,711ee120,1,0,15704,320,50,1722,0,35,100084,79\r", "\r\n", "10000371904215119486,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,b3cf8def,8a4875bd,1,0,15704,320,50,1722,0,35,100084,79\r", "\r\n", "10000640724480838376,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,e8275b8f,6332421a,1,0,15706,320,50,1722,0,35,100084,79\r", "\r\n", "10000679056417042096,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,07d7df22,a99f214a,9644d0bf,779d90c2,1,0,18993,320,50,2161,0,35,-1,157\r", "\r\n", "10000720757801103869,0,14102100,1005,0,d6137915," ] }, { "output_type": "stream", "stream": "stdout", "text": [ "bb1ef334,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,05241af0,8a4875bd,1,0,16920,320,50,1899,0,431,100077,117\r", "\r\n", "10000724729988544911,0,14102100,1005,0,8fda644b,25d4cfcd,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,b264c159,be6db1d7,1,0,20362,320,50,2333,0,39,-1,157\r", "\r\n", "10000918755742328737,0,14102100,1005,1,e151e245,7e091613,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,e6f67278,be74e6fe,1,0,20632,320,50,2374,3,39,-1,23\r", "\r\n", "10000949271186029916,1,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,37e8da74,5db079b5,1,2,15707,320,50,1722,0,35,-1,79\r", "\r\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "%time train_pandas = pd.read_csv('train.csv',error_bad_lines=False, index_col=False, dtype=unicode)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CPU times: user 2min 15s, sys: 2min 23s, total: 4min 38s\n", "Wall time: 4min 46s\n" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "train_pandas.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idclickhourC1banner_possite_idsite_domainsite_categoryapp_idapp_domain...device_typedevice_conn_typeC14C15C16C17C18C19C20C21
0 1000009418151094273 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9... 1 2 15706 320 50 1722 0 35 -1 79
1 10000169349117863715 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9... 1 0 15704 320 50 1722 0 35 100084 79
2 10000371904215119486 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9... 1 0 15704 320 50 1722 0 35 100084 79
3 10000640724480838376 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9... 1 0 15706 320 50 1722 0 35 100084 79
4 10000679056417042096 0 14102100 1005 1 fe8cc448 9166c161 0569f928 ecad2386 7801e8d9... 1 0 18993 320 50 2161 0 35 -1 157
\n", "

5 rows \u00d7 24 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 5, "text": [ " id click hour C1 banner_pos site_id \\\n", "0 1000009418151094273 0 14102100 1005 0 1fbe01fe \n", "1 10000169349117863715 0 14102100 1005 0 1fbe01fe \n", "2 10000371904215119486 0 14102100 1005 0 1fbe01fe \n", "3 10000640724480838376 0 14102100 1005 0 1fbe01fe \n", "4 10000679056417042096 0 14102100 1005 1 fe8cc448 \n", "\n", " site_domain site_category app_id app_domain ... device_type \\\n", "0 f3845767 28905ebd ecad2386 7801e8d9 ... 1 \n", "1 f3845767 28905ebd ecad2386 7801e8d9 ... 1 \n", "2 f3845767 28905ebd ecad2386 7801e8d9 ... 1 \n", "3 f3845767 28905ebd ecad2386 7801e8d9 ... 1 \n", "4 9166c161 0569f928 ecad2386 7801e8d9 ... 1 \n", "\n", " device_conn_type C14 C15 C16 C17 C18 C19 C20 C21 \n", "0 2 15706 320 50 1722 0 35 -1 79 \n", "1 0 15704 320 50 1722 0 35 100084 79 \n", "2 0 15704 320 50 1722 0 35 100084 79 \n", "3 0 15706 320 50 1722 0 35 100084 79 \n", "4 0 18993 320 50 2161 0 35 -1 157 \n", "\n", "[5 rows x 24 columns]" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "train_pandas.columns" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ "Index([u'id', u'click', u'hour', u'C1', u'banner_pos', u'site_id', u'site_domain', u'site_category', u'app_id', u'app_domain', u'app_category', u'device_id', u'device_ip', u'device_model', u'device_type', u'device_conn_type', u'C14', u'C15', u'C16', u'C17', u'C18', u'C19', u'C20', u'C21'], dtype='object')" ] } ], "prompt_number": 9 }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Let's find the mean for the column C14. Let's also find the number of 0 and 1 in the click column" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%time train_pandas[['click','C14']].groupby('click').mean()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CPU times: user 1.22 s, sys: 1.7 s, total: 2.92 s\n", "Wall time: 3.31 s\n" ] }, { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
C14
click
0 18976.761383
1 18181.994233
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ " C14\n", "click \n", "0 18976.761383\n", "1 18181.994233" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "%time pd.value_counts(train_pandas[['click']].values.ravel())" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CPU times: user 748 ms, sys: 350 ms, total: 1.1 s\n", "Wall time: 1.1 s\n" ] }, { "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "0 33563901\n", "1 6865066\n", "dtype: int64" ] } ], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#Now use Blaze to read the data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Data - function used to point to the data source\n", "%time ctr_data_csv = Data('train.csv')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CPU times: user 19.5 ms, sys: 11.3 ms, total: 30.8 ms\n", "Wall time: 35.5 ms\n" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# Identify the shape and column types of the dataset\n", "ds_train = discover(ctr_data_csv)\n", "ds_train" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ "dshape(\"\"\"var * {\n", " id: string,\n", " click: int64,\n", " hour: int64,\n", " C1: int64,\n", " banner_pos: int64,\n", " site_id: string,\n", " site_domain: string,\n", " site_category: string,\n", " app_id: string,\n", " app_domain: string,\n", " app_category: string,\n", " device_id: string,\n", " device_ip: string,\n", " device_model: string,\n", " device_type: int64,\n", " device_conn_type: int64,\n", " C14: int64,\n", " C15: int64,\n", " C16: int64,\n", " C17: int64,\n", " C18: int64,\n", " C19: int64,\n", " C20: int64,\n", " C21: int64\n", " }\"\"\")" ] } ], "prompt_number": 10 }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Separating Computation from Data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Construct expressions\n", "expr = by(ctr_data_csv.click,avg_c14=ctr_data_csv.C14.mean(),count_click=ctr_data_csv.click.count())" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "# Compute the expression now\n", "%time compute_01 = compute(expr)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CPU times: user 43.2 s, sys: 9.04 s, total: 52.3 s\n", "Wall time: 52.5 s\n" ] } ], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "compute_01" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clickavg_c14count_click
0 0 18976.761383 33563901
1 1 18181.994233 6865066
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 13, "text": [ " click avg_c14 count_click\n", "0 0 18976.761383 33563901\n", "1 1 18181.994233 6865066" ] } ], "prompt_number": 13 }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Convert the data to BColz and check how the performance improves.\n", "\n", "One thing to note here: It does take a while to convert the data to BColz. \n", "This particular instance took 58 minutes." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Convert data into BColz\n", "\n", "%time into('ctr_train.bcolz','train.csv', dshape=ds_train)\n", "\n", "# This will take an hour" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Running the code on BColz now" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%time ds_train_bcolz = Data('ctr_train.bcolz')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CPU times: user 7.99 ms, sys: 20.3 ms, total: 28.3 ms\n", "Wall time: 69.3 ms\n" ] } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "expr = by(ds_train_bcolz.click,avg_c14=ds_train_bcolz.C14.mean(),count_click=ds_train_bcolz.click.count())" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "%time compute_02 = compute(expr)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CPU times: user 6.86 s, sys: 5.71 s, total: 12.6 s\n", "Wall time: 20.4 s\n" ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "compute_02" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clickavg_c14count_click
0 0 18976.761383 33563901
1 1 18181.994233 6865066
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 17, "text": [ " click avg_c14 count_click\n", "0 0 18976.761383 33563901\n", "1 1 18181.994233 6865066" ] } ], "prompt_number": 17 }, { "cell_type": "markdown", "metadata": {}, "source": [ "###What happens when we introduce multiprocessing into this mix?" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import multiprocessing\n", "pool = multiprocessing.Pool(8)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "%time compute_03 = compute(expr,map=pool.map)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CPU times: user 186 ms, sys: 44.2 ms, total: 230 ms\n", "Wall time: 5.9 s\n" ] } ], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "compute_03" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##Tab completion on both column names and kinds of functions available for a column" ] }, { "cell_type": "code", "collapsed": false, "input": [ "ds_train_bcolz.C14." ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pandas-like operations" ] }, { "cell_type": "code", "collapsed": false, "input": [ "max(ctr_data_csv[ctr_data_csv.click>0].C14)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## We can specify the chunks to read in from the file and fo the processing. Very powerful when handling extremely large datasets (and very useful for out-of-core computations)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from glob import glob" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "# Specifying active columns at parse time greatly improves performance\n", "active_columns = ['click', 'C14']\n", "intermediates = []" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "# Do a split-apply-combine operation on each chunk of each CSV file\n", "for fn in sorted(glob('train.csv')):\n", " for df in pd.read_csv(fn, usecols=active_columns, \n", " chunksize=1000000, skipinitialspace=True):\n", " chunk = df.groupby('click').agg({'click': ['count'],\n", " 'C14': ['sum', 'mean']})\n", " intermediates.append(chunk)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "# What's there in each chunk?\n", "chunk" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clickC14
countsummean
click
0 351604 7380262923 20990.270085
1 77363 1614597749 20870.412846
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 23, "text": [ " click C14 \n", " count sum mean\n", "click \n", "0 351604 7380262923 20990.270085\n", "1 77363 1614597749 20870.412846" ] } ], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "# Bring those results together. These are much smaller and so likely fit in memory\n", "df = pd.concat(intermediates, axis=0)\n", "df.columns = ['click_count', 'c14_sum', 'c14_mean'] # Flatten multi-index" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "# How does the data frame looks after concatenating all the chunks?\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
click_countc14_sumc14_mean
click
0 839781 15485235097 18439.611157
1 160219 2776968054 17332.326715
0 837010 15704841688 18763.000000
1 162990 2885584400 17704.000000
0 809601 15315350620 18917.158724
1 190399 3431793636 18024.220905
0 818277 14933318159 18249.710256
1 181723 3084832029 16975.462814
0 826325 15537485195 18803.116443
1 173675 3128568228 18013.923869
0 843004 16242455011 19267.352244
1 156996 2869792215 18279.397023
0 861439 17307156868 20090.983654
1 138561 2617169629 18888.212621
0 857272 16675203706 19451.473635
1 142728 2572199256 18021.686397
0 825936 13952152313 16892.534425
1 174064 2806406042 16122.840116
0 819998 14432652499 17600.838659
1 180002 3069368429 17051.857363
0 826223 15897438092 19241.098459
1 173777 3290585448 18935.678761
0 800866 13501155227 16858.195038
1 199134 3358807679 16867.072820
0 824410 12882590472 15626.436448
1 175590 2771596619 15784.478723
0 826295 13980965879 16920.065932
1 173705 2915364308 16783.421939
0 823124 14756484543 17927.413783
1 176876 3174768304 17949.118614
............
0 829161 15065875099 18170.023794
1 170839 2960857595 17331.274446
0 826675 16561823475 20034.261923
1 173325 3370328925 19445.140199
0 860241 18336728906 21315.804415
1 139759 2884343460 20637.980094
0 850870 18181877915 21368.573243
1 149130 3109406311 20850.307188
0 845280 17888362685 21162.647507
1 154720 3119763717 20163.933021
0 851055 17543236640 20613.516917
1 148945 2954959976 19839.269368
0 842777 17901540358 21241.000000
1 157223 3270726603 20803.000000
0 852260 17922574250 21029.467827
1 147740 2981194374 20178.654217
0 844891 16641698024 19696.857966
1 155109 2818825075 18173.188371
0 841249 16362477086 19450.218765
1 158751 2969096121 18702.849878
0 848139 17979268744 21198.493105
1 151861 3141363709 20685.783111
0 832139 17520063181 21054.250769
1 167861 3460206354 20613.521628
0 831248 17727410244 21326.259124
1 168752 3471709753 20572.851006
0 815621 17007405385 20852.093540
1 184379 3789951606 20555.223784
0 351604 7380262923 20990.270085
1 77363 1614597749 20870.412846
\n", "

82 rows \u00d7 3 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 25, "text": [ " click_count c14_sum c14_mean\n", "click \n", "0 839781 15485235097 18439.611157\n", "1 160219 2776968054 17332.326715\n", "0 837010 15704841688 18763.000000\n", "1 162990 2885584400 17704.000000\n", "0 809601 15315350620 18917.158724\n", "1 190399 3431793636 18024.220905\n", "0 818277 14933318159 18249.710256\n", "1 181723 3084832029 16975.462814\n", "0 826325 15537485195 18803.116443\n", "1 173675 3128568228 18013.923869\n", "0 843004 16242455011 19267.352244\n", "1 156996 2869792215 18279.397023\n", "0 861439 17307156868 20090.983654\n", "1 138561 2617169629 18888.212621\n", "0 857272 16675203706 19451.473635\n", "1 142728 2572199256 18021.686397\n", "0 825936 13952152313 16892.534425\n", "1 174064 2806406042 16122.840116\n", "0 819998 14432652499 17600.838659\n", "1 180002 3069368429 17051.857363\n", "0 826223 15897438092 19241.098459\n", "1 173777 3290585448 18935.678761\n", "0 800866 13501155227 16858.195038\n", "1 199134 3358807679 16867.072820\n", "0 824410 12882590472 15626.436448\n", "1 175590 2771596619 15784.478723\n", "0 826295 13980965879 16920.065932\n", "1 173705 2915364308 16783.421939\n", "0 823124 14756484543 17927.413783\n", "1 176876 3174768304 17949.118614\n", "... ... ... ...\n", "0 829161 15065875099 18170.023794\n", "1 170839 2960857595 17331.274446\n", "0 826675 16561823475 20034.261923\n", "1 173325 3370328925 19445.140199\n", "0 860241 18336728906 21315.804415\n", "1 139759 2884343460 20637.980094\n", "0 850870 18181877915 21368.573243\n", "1 149130 3109406311 20850.307188\n", "0 845280 17888362685 21162.647507\n", "1 154720 3119763717 20163.933021\n", "0 851055 17543236640 20613.516917\n", "1 148945 2954959976 19839.269368\n", "0 842777 17901540358 21241.000000\n", "1 157223 3270726603 20803.000000\n", "0 852260 17922574250 21029.467827\n", "1 147740 2981194374 20178.654217\n", "0 844891 16641698024 19696.857966\n", "1 155109 2818825075 18173.188371\n", "0 841249 16362477086 19450.218765\n", "1 158751 2969096121 18702.849878\n", "0 848139 17979268744 21198.493105\n", "1 151861 3141363709 20685.783111\n", "0 832139 17520063181 21054.250769\n", "1 167861 3460206354 20613.521628\n", "0 831248 17727410244 21326.259124\n", "1 168752 3471709753 20572.851006\n", "0 815621 17007405385 20852.093540\n", "1 184379 3789951606 20555.223784\n", "0 351604 7380262923 20990.270085\n", "1 77363 1614597749 20870.412846\n", "\n", "[82 rows x 3 columns]" ] } ], "prompt_number": 25 }, { "cell_type": "code", "collapsed": false, "input": [ "%%time\n", "\n", "# Perform second split-apply-combine operation on those intermediate results\n", "groups = df.groupby(df.index) # group once for many of the following applies\n", "df2 = pd.concat([groups.click_count.sum(),\n", " groups.c14_sum.sum(),\n", " groups.c14_mean.sum()],\n", " axis=1)\n", "\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CPU times: user 3.8 ms, sys: 880 \u00b5s, total: 4.68 ms\n", "Wall time: 4.09 ms\n" ] } ], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "df2['avg_c14_metric'] = df2.c14_sum / df2.c14_mean\n", "df2['count'] = df2.click_count\n", "df2" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "result = df2[['avg_c14_metric', 'click_count']]\n", "result" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "%%time\n", "\n", "# Timing the entire run\n", "\n", "# Specifying active columns at parse time greatly improves performance\n", "active_columns = ['click', 'C14']\n", "intermediates = []\n", "\n", "# Do a split-apply-combine operation on each chunk of each CSV file\n", "for fn in sorted(glob('train.csv')):\n", " for df in pd.read_csv(fn, usecols=active_columns, \n", " chunksize=1000000, skipinitialspace=True):\n", " chunk = df.groupby('click').agg({'click': ['count'],\n", " 'C14': ['sum', 'mean']})\n", " intermediates.append(chunk)\n", " \n", "# Bring those results together. These are much smaller and so likely fit in memory\n", "df = pd.concat(intermediates, axis=0)\n", "df.columns = ['click_count', 'c14_sum', 'c14_mean'] # Flatten multi-index\n", "\n", "# Perform second split-apply-combine operation on those intermediate results\n", "groups = df.groupby(df.index) # group once for many of the following applies\n", "df2 = pd.concat([groups.click_count.sum(),\n", " groups.c14_sum.sum(),\n", " groups.c14_mean.sum()],\n", " axis=1)\n", "\n", "df2['avg_c14_metric'] = df2.c14_sum / df2.c14_mean\n", "df2['count'] = df2.click_count\n", "\n", "# Select out the columns we want\n", "result = df2[['avg_c14_metric', 'click_count']]\n", "result" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CPU times: user 39.3 s, sys: 5.64 s, total: 45 s\n", "Wall time: 45.5 s\n" ] } ], "prompt_number": 10 } ], "metadata": {} } ] }