{ "metadata": { "name": "data_munging_basics" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Data munging basics" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy as np" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "
Camera data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "cameraData = pd.read_csv('../data/cameras.csv')\n", "cameraData.columns\n", "\n", "# somehow the column is named 'Location 1' instead of 'Location.1'" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 2, "text": [ "Index([address, direction, street, crossStreet, intersection, Location 1], dtype=object)" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "# transform column names to lowercase\n", "# equivalent to R's tolower()\n", "cameraData.columns = cameraData.columns.map(lambda x: x.lower())\n", "cameraData.columns" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 3, "text": [ "Index([address, direction, street, crossstreet, intersection, location 1], dtype=object)" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "# string split; split a string into list of strings\n", "# equivalent to R's strsplit()\n", "splitNames = cameraData.columns.map(lambda x: x.split(' '))\n", "splitNames[4]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 4, "text": [ "['intersection']" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "splitNames[5]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 5, "text": [ "['location', '1']" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "splitNames[5][0]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 6, "text": [ "'location'" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "# apply function to names\n", "cameraData.columns.map(lambda x: x.split(' ')[0])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 7, "text": [ "array(['address', 'direction', 'street', 'crossstreet', 'intersection',\n", " 'location'], dtype=object)" ] } ], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "
Peer review experiment data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'\n", "fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'\n", "\n", "reviews = pd.read_csv(fileUrl1)\n", "solutions = pd.read_csv(fileUrl2)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "reviews.head(2)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsolution_idreviewer_idstartstoptime_leftaccept
0 1 3 27 1304095698 1304095758 1754 1
1 2 4 22 1304095188 1304095206 2306 1
\n", "
" ], "output_type": "pyout", "prompt_number": 9, "text": [ " id solution_id reviewer_id start stop time_left accept\n", "0 1 3 27 1304095698 1304095758 1754 1\n", "1 2 4 22 1304095188 1304095206 2306 1" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "solutions.head(2)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idproblem_idsubject_idstartstoptime_leftanswer
0 1 156 29 1304095119 1304095169 2343 B
1 2 269 25 1304095119 1304095183 2329 C
\n", "
" ], "output_type": "pyout", "prompt_number": 10, "text": [ " id problem_id subject_id start stop time_left answer\n", "0 1 156 29 1304095119 1304095169 2343 B\n", "1 2 269 25 1304095119 1304095183 2329 C" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "reviews.columns" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 11, "text": [ "Index([id, solution_id, reviewer_id, start, stop, time_left, accept], dtype=object)" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "# remove underscores\n", "# equivalent to R's sub()\n", "reviews.columns = reviews.columns.map(lambda x: x.replace('_', ''))\n", "solutions.columns = solutions.columns.map(lambda x: x.replace('_', ''))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "# test; every occurence is replaced\n", "# in contrast to R's sub(), where only one occurence is replaced\n", "testName = 'this_is_a_test'\n", "testName.replace('_', '')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 13, "text": [ "'thisisatest'" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "reviews['timeleft'][:10]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 14, "text": [ "0 1754\n", "1 2306\n", "2 2192\n", "3 2089\n", "4 2043\n", "5 1999\n", "6 2130\n", "7 NaN\n", "8 1899\n", "9 2024\n", "Name: timeleft" ] } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "# cut into ranges\n", "# this is equivalent to R's cut()\n", "timeRanges = pd.cut(reviews['timeleft'], range(0, 4000, 600)) # note it needs to go up to 4000 in order to include 3600\n", "timeRanges[:10]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 15, "text": [ "Categorical: \n", "array(['(1200, 1800]', '(1800, 2400]', '(1800, 2400]', '(1800, 2400]',\n", " '(1800, 2400]', '(1800, 2400]', '(1800, 2400]', nan, '(1800, 2400]',\n", " '(1800, 2400]'], dtype=object)\n", "Levels (6): Index(['(0, 600]', '(600, 1200]', '(1200, 1800]',\n", " '(1800, 2400]', '(2400, 3000]', '(3000, 3600]'], dtype=object)" ] } ], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "type(timeRanges)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 16, "text": [ "pandas.core.categorical.Categorical" ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.value_counts(timeRanges)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 17, "text": [ "(600, 1200] 32\n", "(0, 600] 30\n", "(1800, 2400] 28\n", "(1200, 1800] 25" ] } ], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "# equivalent to R's cut2()\n", "timeRanges = pd.cut(reviews['timeleft'], 6)\n", "pd.value_counts(timeRanges) # note that NaN values are excluded; the resulting ranges are thus different from video" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 18, "text": [ "(19.716, 402.667] 22\n", "(1164, 1544.667] 20\n", "(783.333, 1164] 19\n", "(1544.667, 1925.333] 19\n", "(402.667, 783.333] 19\n", "(1925.333, 2306] 16" ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "# adding an extra variable\n", "# similarly to R, simply assign to a new column\n", "reviews['timeRanges'] = timeRanges\n", "reviews.head(2)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsolutionidrevieweridstartstoptimeleftaccepttimeRanges
0 1 3 27 1304095698 1304095758 1754 1 (1544.667, 1925.333]
1 2 4 22 1304095188 1304095206 2306 1 (1925.333, 2306]
\n", "
" ], "output_type": "pyout", "prompt_number": 19, "text": [ " id solutionid reviewerid start stop timeleft accept \\\n", "0 1 3 27 1304095698 1304095758 1754 1 \n", "1 2 4 22 1304095188 1304095206 2306 1 \n", "\n", " timeRanges \n", "0 (1544.667, 1925.333] \n", "1 (1925.333, 2306] " ] } ], "prompt_number": 19 }, { "cell_type": "markdown", "metadata": {}, "source": [ "
Merging data\n", "\n", "First let's have a look at the columns:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "print reviews.columns\n", "print solutions.columns" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Index([id, solutionid, reviewerid, start, stop, timeleft, accept, timeRanges], dtype=object)\n", "Index([id, problemid, subjectid, start, stop, timeleft, answer], dtype=object)\n" ] } ], "prompt_number": 20 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then merge; this is equivalent to R's `merge()`" ] }, { "cell_type": "code", "collapsed": false, "input": [ "mergedData2 = pd.merge(reviews, solutions, left_on='solutionid', right_on='id', sort=True)\n", "mergedData2.ix[:,0:6].head(3)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_xsolutionidrevieweridstart_xstop_xtimeleft_x
0 4 1 26 1304095267 1304095423 2089
1 6 2 29 1304095471 1304095513 1999
2 1 3 27 1304095698 1304095758 1754
\n", "
" ], "output_type": "pyout", "prompt_number": 21, "text": [ " id_x solutionid reviewerid start_x stop_x timeleft_x\n", "0 4 1 26 1304095267 1304095423 2089\n", "1 6 2 29 1304095471 1304095513 1999\n", "2 1 3 27 1304095698 1304095758 1754" ] } ], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "reviews.ix[0,0:6]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 22, "text": [ "id 1\n", "solutionid 3\n", "reviewerid 27\n", "start 1.304096e+09\n", "stop 1.304096e+09\n", "timeleft 1754\n", "Name: 0" ] } ], "prompt_number": 22 }, { "cell_type": "markdown", "metadata": {}, "source": [ "
Sorting values\n", "\n", "First let's have a look at the unsorted values:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "mergedData2['reviewerid'][:10]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 24, "text": [ "0 26\n", "1 29\n", "2 27\n", "3 22\n", "4 28\n", "5 22\n", "6 29\n", "7 23\n", "8 25\n", "9 29\n", "Name: reviewerid" ] } ], "prompt_number": 24 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then sort; this is equivalent to R's `sort()`" ] }, { "cell_type": "code", "collapsed": false, "input": [ "mergedData2['reviewerid'].order()[:10]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 25, "text": [ "3 22\n", "5 22\n", "13 22\n", "21 22\n", "22 22\n", "23 22\n", "26 22\n", "31 22\n", "36 22\n", "38 22\n", "Name: reviewerid" ] } ], "prompt_number": 25 }, { "cell_type": "markdown", "metadata": {}, "source": [ "R's `order` can be implemented by getting the index of the sorted Series:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "mergedData2['reviewerid'][mergedData2['reviewerid'].order().index][:10]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 30, "text": [ "3 22\n", "5 22\n", "13 22\n", "21 22\n", "22 22\n", "23 22\n", "26 22\n", "31 22\n", "36 22\n", "38 22\n", "Name: reviewerid" ] } ], "prompt_number": 30 }, { "cell_type": "markdown", "metadata": {}, "source": [ "
Reordering a DataFrame\n", "\n", "The original DataFrame ordering:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "mergedData2.ix[:,0:6].head(3)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_xsolutionidrevieweridstart_xstop_xtimeleft_x
0 4 1 26 1304095267 1304095423 2089
1 6 2 29 1304095471 1304095513 1999
2 1 3 27 1304095698 1304095758 1754
\n", "
" ], "output_type": "pyout", "prompt_number": 33, "text": [ " id_x solutionid reviewerid start_x stop_x timeleft_x\n", "0 4 1 26 1304095267 1304095423 2089\n", "1 6 2 29 1304095471 1304095513 1999\n", "2 1 3 27 1304095698 1304095758 1754" ] } ], "prompt_number": 33 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Order by `reviewerid`:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "sortedData = mergedData2.sort(['reviewerid'])\n", "sortedData.ix[:,0:6].head(3)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_xsolutionidrevieweridstart_xstop_xtimeleft_x
3 2 4 22 1304095188 1304095206 2306
5 16 6 22 1304095303 1304095471 2041
13 12 14 22 1304095280 1304095301 2211
\n", "
" ], "output_type": "pyout", "prompt_number": 34, "text": [ " id_x solutionid reviewerid start_x stop_x timeleft_x\n", "3 2 4 22 1304095188 1304095206 2306\n", "5 16 6 22 1304095303 1304095471 2041\n", "13 12 14 22 1304095280 1304095301 2211" ] } ], "prompt_number": 34 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reordering by multiple columns:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "sortedData = mergedData2.sort(['reviewerid', 'id_x'])\n", "sortedData.ix[:,0:6].head(3)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_xsolutionidrevieweridstart_xstop_xtimeleft_x
3 2 4 22 1304095188 1304095206 2306
13 12 14 22 1304095280 1304095301 2211
5 16 6 22 1304095303 1304095471 2041
\n", "
" ], "output_type": "pyout", "prompt_number": 33, "text": [ " id_x solutionid reviewerid start_x stop_x timeleft_x\n", "3 2 4 22 1304095188 1304095206 2306\n", "13 12 14 22 1304095280 1304095301 2211\n", "5 16 6 22 1304095303 1304095471 2041" ] } ], "prompt_number": 33 }, { "cell_type": "markdown", "metadata": {}, "source": [ "
Reshaping data - example" ] }, { "cell_type": "code", "collapsed": false, "input": [ "misShaped = pd.DataFrame({'treatmentA' : [NaN, 1, 2], 'treatmentB' : [5, 4, 3]})\n", "misShaped['people'] = ['John', 'Jane', 'Mary']\n", "misShaped" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
treatmentAtreatmentBpeople
0NaN 5 John
1 1 4 Jane
2 2 3 Mary
\n", "
" ], "output_type": "pyout", "prompt_number": 35, "text": [ " treatmentA treatmentB people\n", "0 NaN 5 John\n", "1 1 4 Jane\n", "2 2 3 Mary" ] } ], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "# equivalent to R's melt()\n", "pd.melt(misShaped, id_vars='people')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
peoplevariablevalue
0 John treatmentANaN
1 Jane treatmentA 1
2 Mary treatmentA 2
3 John treatmentB 5
4 Jane treatmentB 4
5 Mary treatmentB 3
\n", "
" ], "output_type": "pyout", "prompt_number": 36, "text": [ " people variable value\n", "0 John treatmentA NaN\n", "1 Jane treatmentA 1\n", "2 Mary treatmentA 2\n", "3 John treatmentB 5\n", "4 Jane treatmentB 4\n", "5 Mary treatmentB 3" ] } ], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }