{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "from io import StringIO\n", "import timeit\n", "#This is a series of flight itinerary information with leg by leg pairs. \n", "#For example the first line is MEL to GUM then GUM to SEL then SEL to MEL\n", "#That is Melbourne to Guam then Guam to Seoul then Seoul to Melbourne.\n", "base_data = '''MultiCityItinerary\n", "MEL, GUM, GUM, SEL, SEL, MEL\n", "MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, SHA, SHA, MEL\n", "MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, MEL\n", "ADL, LON, LON, LOS, LOS, ADL\n", "MEL, CAN, CAN, TPE, TPE, SHA, SHA, DEL, DEL, MEL\n", "WEH, WEH, SZX, CAN, SGN, SGN\n", "BNE, NYC, LAX, SAT, SAT, BNE\n", "BNE, DXB, DXB, DUB, DUB, DXB, DXB, BNE\n", "SYD, LAX, LAX, SFO, SFO, STL, STL, SYD\n", "PER, MEL, MEL, OOL, OOL, PER\n", "MEL, LON, LON, HKG, HKG, MEL\n", "MEL, HKG, HKG, LON, LON, HKG, HKG, MEL\n", "DRW, DRW, AKL, AKL, LON\n", "ROM, DXB, DXB, SYD, SYD, DXB, DXB, ROM\n", "SYD, HNL, HNL, NYC, NYC, HNL, HNL, SYD\n", "PER, BKK, BKK, BNE, BNE, PER\n", "BNE, LAX, LAX, ATL, ATL, OAX, OAX, LAX, LAX, BNE\n", "AKL, NYC, NYC, LAS, LAS, SFO, SFO, AKL\n", "SYD, HKG, HKG, SEL, SEL, HKG, HKG, SYD\n", "AKL, NYC, NYC, SJO, SJO, AKL\n", "ZQN, SYD, SYD, TYO, TYO, ZQN\n", "RAR, LAX, LAX, YTO, NYC, LAS, LAS, LAX, LAX, RAR\n", "PER, MEL, MEL, OOL, OOL, PER\n", "SYD, DAD, HAN, SYD\n", "ADL, BOD, BOD, BER, BER, SIN, SIN, ADL'''\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As you will see below For the 25600 case we can see that the naive case takes 14.7s, ie 14700ms and the vectorized version takes 73.1ms which is approximately 200 times faster. So although both are O(n), the co-efficient is drastically different, such that it makes far more sense to mutate the data (by adding spaces) so that it can be processed more efficientl rather than trying to go through row by row." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df = \"\" #The dataframe we'll be reusing to test on\n", "runs = 10 #how long you want to wait to see how different the co-effecient is, wouldn't recommend going higher than 15\n", "maxLegs = 6 #maximum number of legs\n", "\n", "# Creating the dataframe with the columns for each leg that we want to fill\n", "def create_df():\n", " df = pd.read_csv(StringIO(raw_data), sep=\"|\") #nonsense pipe seperator as I want the source data to be in a single column\n", " print('Total rows '+str(len(df)))\n", " #adding in the extra leg columns\n", " for x in range(6):\n", " col = ('Leg'+str(x+1))\n", " df[col] = ''\n", " return df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total rows 50\n", "Wall time: 35 ms\n", "Total rows 100\n", "Wall time: 60 ms\n", "Total rows 200\n", "Wall time: 130 ms\n", "Total rows 400\n", "Wall time: 240 ms\n", "Total rows 800\n", "Wall time: 468 ms\n", "Total rows 1600\n", "Wall time: 952 ms\n", "Total rows 3200\n", "Wall time: 1.89 s\n", "Total rows 6400\n", "Wall time: 3.67 s\n", "Total rows 12800\n", "Wall time: 7.36 s\n", "Total rows 25600\n", "Wall time: 14.7 s\n" ] } ], "source": [ "#Add leg data in. 
{ "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total rows 50\n", "Wall time: 35 ms\n", "Total rows 100\n", "Wall time: 60 ms\n", "Total rows 200\n", "Wall time: 130 ms\n", "Total rows 400\n", "Wall time: 240 ms\n", "Total rows 800\n", "Wall time: 468 ms\n", "Total rows 1600\n", "Wall time: 952 ms\n", "Total rows 3200\n", "Wall time: 1.89 s\n", "Total rows 6400\n", "Wall time: 3.67 s\n", "Total rows 12800\n", "Wall time: 7.36 s\n", "Total rows 25600\n", "Wall time: 14.7 s\n" ] } ], "source": [ "#Add the leg data in. WAY slow\n", "def naive_row():\n", "    for idx, row in enumerate(df['MultiCityItinerary']):\n", "        legs = (len(row)+2) // 10 #the number of legs in this particular string\n", "        for l in range(legs):\n", "            legstr = row[l*10:l*10+8] #each individual city pair\n", "            df.loc[idx, 'Leg'+str(l+1)] = legstr #place that city pair into its dedicated leg column\n", "\n", "#reset the data and start our profiling run\n", "raw_data = base_data\n", "for x in range(runs):\n", "    raw_data = raw_data + raw_data #double the size of the data we're processing on each run\n", "    df = create_df()\n", "    %time naive_row()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total rows 50\n", "Wall time: 4.02 ms\n", "Total rows 100\n", "Wall time: 3 ms\n", "Total rows 200\n", "Wall time: 6.02 ms\n", "Total rows 400\n", "Wall time: 4 ms\n", "Total rows 800\n", "Wall time: 6 ms\n", "Total rows 1600\n", "Wall time: 7 ms\n", "Total rows 3200\n", "Wall time: 11 ms\n", "Total rows 6400\n", "Wall time: 18 ms\n", "Total rows 12800\n", "Wall time: 34 ms\n", "Total rows 25600\n", "Wall time: 73.1 ms\n" ] } ], "source": [ "def smarter_row():\n", "    #we can't just filter, because of the chained-assignment ambiguity (is it a view or a copy?)\n", "    #the trick is to pad every string out to the same length and then brute-force the slicing\n", "    df['MultiCityItinerary'] = df['MultiCityItinerary'].str.ljust(maxLegs*10-2) #the longest a string can be, i.e. 6 legs\n", "    \n", "    #now that every string is the same length, one vectorized slice per column handles all rows at once\n", "    for leg in range(1,maxLegs+1):\n", "        df['Leg'+str(leg)] = df['MultiCityItinerary'].str[(leg-1)*10:(leg-1)*10+8]\n", "    \n", "    df['MultiCityItinerary'] = df['MultiCityItinerary'].str.strip() #return it to normal\n", "\n", "\n", "#reset the data and start our profiling run\n", "raw_data = base_data\n", "for x in range(runs):\n", "    raw_data = raw_data + raw_data\n", "    df = create_df()\n", "    %time smarter_row()" ] },
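{ "cell_type": "markdown", "metadata": {}, "source": [ "Why the padding step matters: below is a minimal standalone sketch on a toy two-row Series (not the benchmark data). After `str.ljust`, every row is exactly `maxLegs*10-2` (58) characters, so a single vectorized `.str[start:stop]` slice per leg column replaces the per-row Python loop. Slices that fall entirely inside the padding come back as spaces, which is why the unused `Leg` columns of shorter itineraries render as blank (the naive version leaves true empty strings instead, but the two display identically in `df.head()`)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Minimal sketch of the padding trick on a toy Series; not the benchmark data\n", "toy = pd.Series(['MEL, GUM, GUM, SEL, SEL, MEL', 'SYD, DAD, HAN, SYD'])\n", "padded = toy.str.ljust(maxLegs*10-2) #both rows are now exactly 58 characters\n", "print(padded.str[20:28]) #leg 3 slice: 'SEL, MEL' for row 0, all spaces for row 1\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "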
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MultiCityItineraryLeg1Leg2Leg3Leg4Leg5Leg6
0MEL, GUM, GUM, SEL, SEL, MELMEL, GUMGUM, SELSEL, MEL
1MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, S...MEL, SHASHA, TPETPE, SHASHA, DELDEL, SHASHA, MEL
2MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, MELMEL, SHASHA, TPETPE, SHASHA, DELDEL, MEL
3ADL, LON, LON, LOS, LOS, ADLADL, LONLON, LOSLOS, ADL
4MEL, CAN, CAN, TPE, TPE, SHA, SHA, DEL, DEL, MELMEL, CANCAN, TPETPE, SHASHA, DELDEL, MEL
\n", "
" ], "text/plain": [ " MultiCityItinerary Leg1 Leg2 \\\n", "0 MEL, GUM, GUM, SEL, SEL, MEL MEL, GUM GUM, SEL \n", "1 MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, S... MEL, SHA SHA, TPE \n", "2 MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, MEL MEL, SHA SHA, TPE \n", "3 ADL, LON, LON, LOS, LOS, ADL ADL, LON LON, LOS \n", "4 MEL, CAN, CAN, TPE, TPE, SHA, SHA, DEL, DEL, MEL MEL, CAN CAN, TPE \n", "\n", " Leg3 Leg4 Leg5 Leg6 \n", "0 SEL, MEL \n", "1 TPE, SHA SHA, DEL DEL, SHA SHA, MEL \n", "2 TPE, SHA SHA, DEL DEL, MEL \n", "3 LOS, ADL \n", "4 TPE, SHA SHA, DEL DEL, MEL " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#What the dataframe actually ends up looking like\n", "df.head()" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda env:py35]", "language": "python", "name": "conda-env-py35-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }