In [1]:
import ast

import pandas as pd

import datetime

from keras.layers import Input, Dense, Embedding, merge, Flatten, Merge, BatchNormalization
from keras.models import Model, load_model
from keras.regularizers import l2
import keras.backend as K
from keras.optimizers import SGD
import numpy as np

from sklearn.cluster import MeanShift, estimate_bandwidth

import utils

import data

from sklearn.model_selection import train_test_split

from bcolz_array_iterator import BcolzArrayIterator

import bcolz

from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import ModelCheckpoint

Using Theano backend.
Using gpu device 1: GeForce GTX TITAN X (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5110)


Below path is a shared directory, swap to own

In [2]:
data_path = "/data/datasets/taxi/"

## Replication of 'csv_to_hdf5.py'

Original repo used some bizarre tuple method of reading in data to save in a hdf5 file using fuel. The following does the same approach in that module, only using pandas and saving in a bcolz format (w/ training data as example)

In [3]:
meta = pd.read_csv(data_path+'metaData_taxistandsID_name_GPSlocation.csv', header=0)

In [66]:
meta.head()

Unnamed: 0,ID,Descricao,Latitude,Longitude
0,1,Agra,41.177146,-8.60967
1,2,Alameda,41.15619,-8.591064
2,3,Aldoar,41.170525,-8.665876
3,4,Alfândega,41.143764,-8.621803
4,5,Amial,41.18351,-8.612726


In [85]:
train = pd.read_csv(data_path+'train/train.csv', header=0)

In [5]:
train.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [6]:
train['ORIGIN_CALL'] = pd.Series(pd.factorize(train['ORIGIN_CALL'])[0]) + 1

In [7]:
train['ORIGIN_STAND']=pd.Series([0 if pd.isnull(x) or x=='' else int(x) for x in train["ORIGIN_STAND"]])

In [8]:
train['TAXI_ID'] = pd.Series(pd.factorize(train['TAXI_ID'])[0]) + 1

In [9]:
train['DAY_TYPE'] = pd.Series([ord(x[0]) - ord('A') for x in train['DAY_TYPE']])

The array of long/lat coordinates per trip (row) is read in as a string. The function `ast.literal_eval(x)` evaluates the string into the expression it represents (safely). This happens below

In [138]:
polyline = pd.Series([ast.literal_eval(x) for x in train['POLYLINE']])

Split into latitude/longitude

In [148]:
train['LATITUDE'] = pd.Series([np.array([point[1] for point in poly],dtype=np.float32) for poly in polyline])

In [150]:
train['LONGITUDE'] = pd.Series([np.array([point[0] for point in poly],dtype=np.float32) for poly in polyline])

In [157]:
utils.save_array(data_path+'train/train.bc', train.as_matrix())

In [158]:
utils.save_array(data_path+'train/meta_train.bc', meta.as_matrix())

## Further Feature Engineering

After converting 'csv_to_hdf5.py' functionality to pandas, I saved that array and then simply constructed the rest of the features as specified in the paper using pandas. I didn't bother seeing how the author did it as it was extremely obtuse and involved the fuel module.

In [424]:
train = pd.DataFrame(utils.load_array(data_path+'train/train.bc'), columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE'])

In [425]:
train.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,LATITUDE,LONGITUDE
0,1372636858620000589,C,0,0,1,1372636858,0,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...","[41.1414, 41.1414, 41.1425, 41.1438, 41.1444, ...","[-8.61864, -8.6185, -8.62033, -8.62215, -8.623..."
1,1372637303620000596,B,0,7,2,1372637303,0,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...","[41.1598, 41.1599, 41.1601, 41.1605, 41.1609, ...","[-8.63985, -8.64035, -8.6422, -8.64445, -8.646..."
2,1372636951620000320,C,0,0,3,1372636951,0,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...","[41.1404, 41.1404, 41.1403, 41.1404, 41.1404, ...","[-8.61296, -8.61338, -8.61421, -8.61477, -8.61..."
3,1372636854620000520,C,0,0,4,1372636854,0,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...","[41.152, 41.1519, 41.1519, 41.152, 41.1519, 41...","[-8.57468, -8.57471, -8.5747, -8.57466, -8.574..."
4,1372637091620000337,C,0,0,5,1372637091,0,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...","[41.1805, 41.1805, 41.18, 41.1789, 41.1785, 41...","[-8.64599, -8.64595, -8.64605, -8.6468, -8.649..."


The paper discusses how many categorical variables there are per category. The following all check out

In [426]:
train['ORIGIN_CALL'].max()

57105

In [427]:
train['ORIGIN_STAND'].max()

63

In [428]:
train['TAXI_ID'].max()

448

Self-explanatory

In [429]:
train['DAY_OF_WEEK'] = pd.Series([datetime.datetime.fromtimestamp(t).weekday() for t in train['TIMESTAMP']])

Quarter hour of the day, i.e. 1 of the `4*24 = 96` quarter hours of the day

In [430]:
train['QUARTER_HOUR'] = pd.Series([int((datetime.datetime.fromtimestamp(t).hour*60 + datetime.datetime.fromtimestamp(t).minute)/15)
                                   for t in train['TIMESTAMP']])

Self-explanatory

In [431]:
train['WEEK_OF_YEAR'] = pd.Series([datetime.datetime.fromtimestamp(t).isocalendar()[1] for t in train['TIMESTAMP']])

Target coords are the last in the sequence (final position). If there are no positions, or only 1, then mark as invalid w/ nan in order to drop later

In [433]:
# Destination of each trip = its last GPS fix, stored as [longitude, latitude].
# Trips with fewer than two fixes get NaN so the later dropna() discards them.
# (`np` is the only numpy alias imported in this notebook; `numpy.nan` raised NameError.)
train['TARGET'] = pd.Series(
    [[lon[-1], lat[-1]] if len(lon) > 1 else np.nan
     for lon, lat in zip(train['LONGITUDE'], train['LATITUDE'])],
    index=train.index)

This function creates the continuous inputs, which are the concatenated k first and k last coords in a sequence, as discussed in the paper. 

If there aren't at least 2* k coords excluding the target, then the k first and k last overlap. In this case the sequence (excluding target) is padded at the end with the last coord in the sequence. The paper mentioned they padded front and back but didn't specify in what manner.

Also marks any invalid w/ na's

In [437]:
def start_stop_inputs(k, data=None):
    """Build the 4*k continuous coordinate features for every training trip.

    For each row the feature vector is the concatenation of
    [first k longitudes, last k longitudes, first k latitudes, last k latitudes],
    where "last" excludes the final point of the trip (that point is the target).

    Parameters
    ----------
    k : int
        Number of leading and trailing points to keep (k >= 1).
    data : DataFrame, optional
        Frame with 'LONGITUDE' and 'LATITUDE' columns holding one array per
        row. Defaults to the notebook-level `train` frame.

    Returns
    -------
    pandas.Series
        One entry per row of `data` (same index): a flat array of length 4*k,
        or NaN when the trip has fewer than two points.
    """
    if data is None:
        data = train

    # Object array filled element by element so each row keeps its own ndarray (or NaN).
    features = np.empty(len(data), dtype=object)
    for pos, (lon, lat) in enumerate(zip(data['LONGITUDE'], data['LATITUDE'])):
        if len(lon) < 2 or len(lat) < 2:
            # Nothing left once the target point is removed: mark as invalid.
            features[pos] = np.nan
        elif len(lon[:-1]) >= 2*k:
            features[pos] = np.concatenate(
                [lon[0:k], lon[-(k+1):-1], lat[0:k], lat[-(k+1):-1]]).flatten()
        else:
            # Fewer than 2*k usable points: repeat the last usable point until the
            # sequence is 4*k long, so the "last k" slots are all copies of it.
            # (The pad target was hard-coded to 20, which is 4*k only for k = 5.)
            lon_pad = np.pad(lon[:-1], (0, 4*k - len(lon[:-1])), mode='edge')
            lat_pad = np.pad(lat[:-1], (0, 4*k - len(lat[:-1])), mode='edge')
            features[pos] = np.concatenate(
                [lon_pad[0:k], lon_pad[-k:], lat_pad[0:k], lat_pad[-k:]]).flatten()
    return pd.Series(features, index=data.index)

In [438]:
train['COORD_FEATURES'] = start_stop_inputs(5)

In [442]:
train.shape

(1710670, 16)

In [441]:
train.dropna().shape

(1674160, 16)

Drop na's

In [443]:
train = train.dropna()

In [446]:
utils.save_array(data_path+'train/train_features.bc', train.as_matrix())

## End to end feature transformation

In [155]:
train = pd.read_csv(data_path+'train/train.csv', header=0)

In [None]:
test = pd.read_csv(data_path+'test/test.csv', header=0)

In [139]:
def _head_tail(seq, k, long_enough):
    """Return (first k, last k) entries of seq, edge-padding it to 4*k when it is short."""
    if long_enough:
        return seq[:k], seq[len(seq) - k:]
    # Repeat the final value until the sequence is 4*k long; since the input has
    # fewer than 2*k entries, the "last k" slots are then all copies of that value.
    padded = np.pad(seq, (0, 4*k - len(seq)), mode='edge')
    return padded[0:k], padded[-k:]


def start_stop_inputs(k, data, test):
    """Build the 4*k continuous coordinate features for every trip in `data`.

    Each feature vector is the concatenation of
    [first k longitudes, last k longitudes, first k latitudes, last k latitudes].

    Parameters
    ----------
    k : int
        Number of leading and trailing points to keep (k >= 1).
    data : DataFrame
        Frame with 'LONGITUDE' and 'LATITUDE' columns holding one array per row.
    test : bool
        False: the final point of each trip is treated as the target and left
        out of the features; trips with fewer than two points yield NaN.
        True: every point is used; only empty trips yield NaN.

    Returns
    -------
    pandas.Series
        One entry per row of `data` (same index): a flat array of length 4*k, or NaN.
    """
    min_points = 1 if test else 2
    # Object array filled element by element so each row keeps its own ndarray (or NaN).
    features = np.empty(len(data), dtype=object)
    # Columns are read by name; the original indexed each row positionally
    # (row[0], row[1]), which pandas deprecates for label-indexed Series.
    for pos, (lon, lat) in enumerate(zip(data['LONGITUDE'], data['LATITUDE'])):
        if len(lon) < min_points or len(lat) < min_points:
            features[pos] = np.nan
            continue
        if not test:
            # Drop the destination so it cannot leak into the inputs.
            lon, lat = lon[:-1], lat[:-1]
        # As in the original, the longitude length decides the branch for both coordinates.
        long_enough = len(lon) >= 2*k
        lon_head, lon_tail = _head_tail(lon, k, long_enough)
        lat_head, lat_tail = _head_tail(lat, k, long_enough)
        features[pos] = np.concatenate([lon_head, lon_tail, lat_head, lat_tail]).flatten()
    return pd.Series(features, index=data.index)

Pre-calculated below on train set

In [143]:
lat_mean = 41.15731
lat_std = 0.074120656
long_mean = -8.6161413
long_std = 0.057200309

In [None]:
def feature_ext(data, test=False):
    """Turn a raw trip frame (as read from train.csv / test.csv) into model features.

    Adds, in this order, the columns LATITUDE, LONGITUDE, TARGET (train only),
    COORD_FEATURES, DAY_OF_WEEK, QUARTER_HOUR and WEEK_OF_YEAR, and re-encodes
    ORIGIN_CALL, ORIGIN_STAND, TAXI_ID and DAY_TYPE as integers. The column
    order matters: the frame is later saved as a bare matrix and reloaded with
    a hard-coded list of column names.

    Parameters
    ----------
    data : DataFrame
        Raw trips. It is not modified; the function works on a copy.
    test : bool, default False
        When True no TARGET column is built and the whole trajectory is used
        for COORD_FEATURES.

    Returns
    -------
    DataFrame
        The transformed copy with every row containing a NaN removed.

    Notes
    -----
    * ORIGIN_CALL and TAXI_ID codes come from `pd.factorize` on this frame
      alone, so codes produced for the train and test frames are not comparable.
    * Calendar features use `datetime.fromtimestamp`, i.e. the local time zone
      of the machine running the notebook.
      NOTE(review): confirm whether UTC or local Porto time is intended.
    """
    def _object_column(items):
        # One Python object per row; filled element by element so equal-length
        # arrays are never collapsed into a 2-D block.
        column = np.empty(len(items), dtype=object)
        for pos, item in enumerate(items):
            column[pos] = item
        return column

    # Work on a copy so the caller's frame survives and the cell can be re-run.
    data = data.copy()

    # New columns are assigned as plain arrays/lists (positional), so the result
    # does not depend on `data` having a default 0..n-1 index.
    # Missing values factorize to -1, hence code 0 after the +1 shift.
    data['ORIGIN_CALL'] = pd.factorize(data['ORIGIN_CALL'])[0] + 1
    data['ORIGIN_STAND'] = [0 if pd.isnull(x) or x == '' else int(x) for x in data['ORIGIN_STAND']]
    data['TAXI_ID'] = pd.factorize(data['TAXI_ID'])[0] + 1
    data['DAY_TYPE'] = [ord(x[0]) - ord('A') for x in data['DAY_TYPE']]

    # POLYLINE is a string such as "[[lon, lat], [lon, lat], ...]".
    polylines = [ast.literal_eval(x) for x in data['POLYLINE']]
    raw_lat = [np.array([point[1] for point in poly], dtype=np.float32) for poly in polylines]
    raw_lon = [np.array([point[0] for point in poly], dtype=np.float32) for poly in polylines]

    # Inputs are standardised with the pre-computed train statistics; the target
    # (below) stays in raw degrees.
    data['LATITUDE'] = _object_column([(t - lat_mean) / lat_std for t in raw_lat])
    data['LONGITUDE'] = _object_column([(t - long_mean) / long_std for t in raw_lon])

    if not test:
        # Destination = last fix as [longitude, latitude]; NaN when there are
        # fewer than two fixes so the row is dropped at the end.
        data['TARGET'] = _object_column(
            [[lon[-1], lat[-1]] if len(lon) > 1 else np.nan
             for lon, lat in zip(raw_lon, raw_lat)])

    data['COORD_FEATURES'] = start_stop_inputs(5, data, test).values

    # Convert every timestamp once and derive the three calendar features from it.
    stamps = [datetime.datetime.fromtimestamp(t) for t in data['TIMESTAMP']]
    data['DAY_OF_WEEK'] = [s.weekday() for s in stamps]
    data['QUARTER_HOUR'] = [(s.hour * 60 + s.minute) // 15 for s in stamps]
    data['WEEK_OF_YEAR'] = [s.isocalendar()[1] for s in stamps]

    return data.dropna()

In [None]:
train = feature_ext(train)

In [None]:
test = feature_ext(test, test=True)

In [161]:
test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,LATITUDE,LONGITUDE,COORD_FEATURES,DAY_OF_WEEK,QUARTER_HOUR,WEEK_OF_YEAR
0,T1,B,0,15,1,1408039037,0,False,"[[-8.585676,41.148522],[-8.585712,41.148639],[...","[-0.118578, -0.116982, -0.1141, -0.113122, -0....","[0.532604, 0.531971, 0.532454, 0.531671, 0.527...","[0.532604, 0.531971, 0.532454, 0.531671, 0.527...",3,43,33
1,T2,B,0,57,2,1408038611,0,False,"[[-8.610876,41.14557],[-8.610858,41.145579],[-...","[-0.158413, -0.158258, -0.155736, -0.150024, -...","[0.0920491, 0.0923659, 0.0915823, 0.0996017, 0...","[0.0920491, 0.0923659, 0.0915823, 0.0996017, 0...",3,43,33
2,T3,B,0,15,3,1408038568,0,False,"[[-8.585739,41.148558],[-8.58573,41.148828],[-...","[-0.118063, -0.11446, -0.112505, -0.111887, -0...","[0.531504, 0.531671, 0.531821, 0.5219, 0.52490...","[0.531504, 0.531671, 0.531821, 0.5219, 0.52490...",3,43,33
3,T4,B,0,53,4,1408039090,0,False,"[[-8.613963,41.141169],[-8.614125,41.141124],[...","[-0.217753, -0.21837, -0.221047, -0.222488, -0...","[0.0380801, 0.0352457, 0.0184065, 0.0151053, 0...","[0.0380801, 0.0352457, 0.0184065, 0.0151053, 0...",3,43,33
4,T5,B,0,18,5,1408039177,0,False,"[[-8.619903,41.148036],[-8.619894,41.148036]]","[-0.125114, -0.125114]","[-0.0657565, -0.0656064]","[-0.0657565, -0.0656064, -0.0656064, -0.065606...",3,43,33


In [162]:
utils.save_array(data_path+'train/train_features.bc', train.as_matrix())

In [163]:
utils.save_array(data_path+'test/test_features.bc', test.as_matrix())

In [164]:
train.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,LATITUDE,LONGITUDE,TARGET,COORD_FEATURES,DAY_OF_WEEK,QUARTER_HOUR,WEEK_OF_YEAR
0,1372636858620000589,C,0,0,1,1372636858,0,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...","[-0.21451, -0.214974, -0.199688, -0.182087, -0...","[-0.0437321, -0.0412145, -0.0731591, -0.105104...","[-8.63084, 41.1545]","[-0.0437321, -0.0412145, -0.0731591, -0.105104...",6,68,26
1,1372637303620000596,B,0,7,2,1372637303,0,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...","[0.0339161, 0.0345337, 0.0378275, 0.0429227, 0...","[-0.414429, -0.423249, -0.455494, -0.494991, -...","[-8.66574, 41.1707]","[-0.414429, -0.423249, -0.455494, -0.494991, -...",6,68,26
2,1372636951620000320,C,0,0,3,1372636951,0,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...","[-0.228715, -0.228818, -0.229796, -0.228561, -...","[0.0555529, 0.048317, 0.0336785, 0.0239251, 0....","[-8.61597, 41.1405]","[0.0555529, 0.048317, 0.0336785, 0.0239251, 0....",6,68,26
3,1372636854620000520,C,0,0,4,1372636854,0,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...","[-0.0723098, -0.0724127, -0.0725671, -0.072206...","[0.724872, 0.724405, 0.724572, 0.725189, 0.724...","[-8.608, 41.1429]","[0.724872, 0.724405, 0.724572, 0.725189, 0.724...",6,68,26
4,1372637091620000337,C,0,0,5,1372637091,0,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...","[0.312708, 0.313068, 0.306789, 0.291092, 0.285...","[-0.5219, -0.521117, -0.522834, -0.536055, -0....","[-8.68727, 41.1781]","[-0.5219, -0.521117, -0.522834, -0.536055, -0....",6,68,26


## MEANSHIFT

Meanshift clustering as performed in the paper

In [None]:
# The saved matrix carries no column names, so they must be listed in exactly the
# order produced by feature_ext (TARGET and COORD_FEATURES precede the calendar
# features). With the previous order, train["TARGET"] pointed at the wrong column.
train = pd.DataFrame(utils.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

Clustering performed on the targets

In [532]:
y_targ = np.vstack(train["TARGET"].as_matrix())

In [524]:
from sklearn.cluster import MeanShift, estimate_bandwidth

The commented-out code can be used to obtain an estimate of the bandwidth, which makes clustering converge much more quickly.

This is not mentioned in the paper but is included in the code. In order to get results similar to the paper's,
they manually chose the uncommented bandwidth

In [533]:
#bw = estimate_bandwidth(y_targ, quantile=.1, n_samples=1000)
bw = 0.001

This takes some time

In [545]:
ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
ms.fit(y_targ)

MeanShift(bandwidth=0.001, bin_seeding=True, cluster_all=True, min_bin_freq=5,
     n_jobs=1, seeds=None)

In [546]:
cluster_centers = ms.cluster_centers_

This is very close to the number of clusters mentioned in the paper

In [547]:
cluster_centers.shape

(3421, 2)

In [548]:
utils.save_array(data_path+"cluster_centers_bw_001.bc", cluster_centers)

## Formatting Features for Bcolz iterator / garbage

In [None]:
train = pd.DataFrame(utils.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [None]:
cluster_centers = utils.load_array(data_path+"cluster_centers_bw_001.bc")

In [50]:
long = np.array([c[0] for c in cluster_centers])
lat = np.array([c[1] for c in cluster_centers])

In [None]:
X_train, X_val = train_test_split(train, test_size=0.2, random_state=42)

In [11]:
def get_features(data):
    """Assemble the list of input arrays expected by the model built by `taxi_mlp`.

    The list order is: coordinate features, ORIGIN_CALL, TAXI_ID, ORIGIN_STAND,
    QUARTER_HOUR, DAY_OF_WEEK, WEEK_OF_YEAR, then the cluster-centre longitudes
    and latitudes repeated once per row. The centres come from the notebook-level
    `long` and `lat` arrays.

    Parameters
    ----------
    data : DataFrame
        Feature frame containing the seven columns named above.

    Returns
    -------
    list of numpy.ndarray
        Nine arrays, each with one row per row of `data`.
    """
    n_rows = data.shape[0]
    columns = ['COORD_FEATURES', 'ORIGIN_CALL', 'TAXI_ID', 'ORIGIN_STAND',
               'QUARTER_HOUR', 'DAY_OF_WEEK', 'WEEK_OF_YEAR']
    # np.vstack turns each (possibly object-dtype) column into a 2-D numeric array;
    # `.values` replaces the deprecated `.as_matrix()`.
    features = [np.vstack(data[name].values) for name in columns]
    # Every sample receives the same copy of the centre coordinates.
    features.append(np.tile(long, (n_rows, 1)))
    features.append(np.tile(lat, (n_rows, 1)))
    return features

In [7]:
def get_target(data):
    """Stack the per-row TARGET entries of `data` into one 2-D array.

    Parameters
    ----------
    data : DataFrame
        Frame whose 'TARGET' column holds one [longitude, latitude] pair per row.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of `data`.
    """
    # `.tolist()` replaces the deprecated `.as_matrix()`; vstack does the stacking.
    return np.vstack(data["TARGET"].tolist())

In [None]:
X_train_features = get_features(X_train)

In [14]:
X_train_target = get_target(X_train)

In [13]:
utils.save_array(data_path+'train/X_train_features.bc', get_features(X_train))

(1339328, 20)

## MODEL

Load training data and cluster centers

In [16]:
train = pd.DataFrame(utils.load_array(data_path+'train/train_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

Validation cuts 

In [17]:
cuts = [
    1376503200, # 2013-08-14 18:00
    1380616200, # 2013-10-01 08:30
    1381167900, # 2013-10-07 17:45
    1383364800, # 2013-11-02 04:00
    1387722600  # 2013-12-22 14:30
]

In [41]:
print(datetime.datetime.fromtimestamp(1376503200))

2013-08-14 11:00:00


In [22]:
train.shape

(1674160, 16)

In [24]:
# A trip goes to the validation set when it was under way at one of the cut
# times: it started at or before the cut and, at 15 units of TIMESTAMP per
# recorded point, had not finished yet.
# Row positions (not index labels) are collected because the result is used
# with `.iloc` / `train.index[...]` below.
val_indices = [
    pos
    for pos, (start, latitude) in enumerate(zip(train['TIMESTAMP'], train['LATITUDE']))
    if any(start <= ts <= start + 15 * (len(latitude) - 1) for ts in cuts)
]

In [60]:
X_valid = train.iloc[val_indices]

In [53]:
# The validation frame built above is named X_valid; `valid` is never defined.
X_valid.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,LATITUDE,LONGITUDE,TARGET,COORD_FEATURES,DAY_OF_WEEK,QUARTER_HOUR,WEEK_OF_YEAR
200153,1376502576620000126,B,0,36,247,1376502576,0,False,"[[-8.649504,41.15421],[-8.649684,41.154201],[-...","[-0.0418419, -0.0419448, -0.0449813, -0.046422...","[-0.583255, -0.586407, -0.59711, -0.589074, -0...","[-8.61122, 41.1463]","[-0.583255, -0.586407, -0.59711, -0.589074, -0...",2,43,33
200186,1376503146620000161,B,0,35,19,1376503146,0,False,"[[-8.649621,41.167323],[-8.64963,41.167251],[-...","[0.135098, 0.134121, 0.126709, 0.125371, 0.124...","[-0.585306, -0.585456, -0.589241, -0.588774, -...","[-8.64504, 41.1586]","[-0.585306, -0.585456, -0.589241, -0.588774, -...",2,43,33
200200,1376502942620000500,B,0,15,428,1376502942,0,False,"[[-8.585694,41.148522],[-8.585712,41.148801],[...","[-0.118578, -0.114821, -0.112402, -0.116982, -...","[0.532287, 0.531971, 0.523018, 0.524735, 0.524...","[-8.61524, 41.1418]","[0.532287, 0.531971, 0.523018, 0.524735, 0.524...",2,43,33
200202,1376502604620000105,C,0,0,87,1376502604,0,False,"[[-8.61093,41.145498],[-8.610939,41.145516],[-...","[-0.15939, -0.159133, -0.153883, -0.145392, -0...","[0.0910987, 0.0909487, 0.093783, 0.108572, 0.1...","[-8.64832, 41.1648]","[0.0910987, 0.0909487, 0.093783, 0.108572, 0.1...",2,43,33
200227,1376502611620000022,C,0,0,304,1376502611,0,False,"[[-8.591301,41.162715],[-8.591004,41.162562],[...","[0.0729274, 0.0708687, 0.0587228, 0.0539879, 0...","[0.43427, 0.439455, 0.42735, 0.423566, 0.41539...","[-8.60977, 41.1512]","[0.43427, 0.439455, 0.42735, 0.423566, 0.41539...",2,43,33


In [35]:
# Print the (local-time) start of every validation trip; the frame is X_valid,
# `valid` is never defined in this notebook.
for d in X_valid['TIMESTAMP']:
    print(datetime.datetime.fromtimestamp(d))

2013-08-14 10:49:36
2013-08-14 10:59:06
2013-08-14 10:55:42
2013-08-14 10:50:04
2013-08-14 10:50:11
2013-08-14 10:56:57
2013-08-14 10:36:51
2013-08-14 10:44:15
2013-08-14 10:55:50
2013-08-14 10:50:35
2013-08-14 10:50:27
2013-08-14 10:43:57
2013-08-14 10:16:48
2013-08-14 10:40:47
2013-08-14 10:45:55
2013-08-14 10:43:00
2013-08-14 10:53:22
2013-08-14 10:50:03
2013-08-14 10:26:22
2013-08-14 10:59:15
2013-08-14 10:50:17
2013-08-14 10:56:34
2013-08-14 10:53:42
2013-08-14 10:47:46
2013-08-14 10:58:46
2013-08-14 10:24:23
2013-08-14 10:55:19
2013-08-14 10:57:03
2013-08-14 10:56:11
2013-08-14 10:56:52
2013-08-14 10:57:57
2013-08-14 10:08:15
2013-08-14 10:51:14
2013-08-14 10:58:31
2013-08-14 10:47:31
2013-08-14 10:30:36
2013-08-14 10:17:59
2013-08-14 10:48:03
2013-08-14 10:55:52
2013-08-14 10:49:06
2013-08-14 10:58:55
2013-08-14 10:51:24
2013-08-14 10:54:12
2013-08-14 10:54:26
2013-08-14 10:51:18
2013-08-14 10:59:56
2013-08-14 10:48:31
2013-08-14 10:51:56
2013-08-14 10:39:22
2013-08-14 10:57:25


In [58]:
# Training set = every row that was not selected for validation.
# Index with the flat list of positions; the extra pair of brackets made it a nested list.
X_train = train.drop(train.index[val_indices])

In [5]:
# Same location the centres were saved to in the MEANSHIFT section
# (data_path already ends with a slash, and there is no extra "data/" level).
cluster_centers = utils.load_array(data_path+"cluster_centers_bw_001.bc")

In [6]:
long = np.array([c[0] for c in cluster_centers])
lat = np.array([c[1] for c in cluster_centers])

In [62]:
utils.save_array(data_path+'train/X_train.bc', X_train.as_matrix())

In [64]:
utils.save_array(data_path+'valid/X_val.bc', X_valid.as_matrix())

In [24]:
X_train = pd.DataFrame(utils.load_array(data_path+'train/X_train.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [25]:
X_val = pd.DataFrame(utils.load_array(data_path+'valid/X_val.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE', 'TARGET',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

The equirectangular loss function mentioned in the paper.

Note: Very important that y[0] is longitude and y[1] is latitude.

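The radius of the earth constant "R" does not affect minimization and its units were not given in the paper; the function below multiplies by 6371 (the Earth radius in km), so the loss reads as a distance in kilometres.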

In [7]:
def equirectangular_loss(y_true, y_pred):
    """Per-sample equirectangular distance between true and predicted points.

    Both tensors are read as column 0 = longitude and column 1 = latitude, in
    degrees (they are converted to radians here). The result is scaled by
    6371, presumably the mean Earth radius in km.
    """
    to_radians = 3.141592653589793 / 180

    true_long, pred_long = y_true[:,0]*to_radians, y_pred[:,0]*to_radians
    true_lat, pred_lat = y_true[:,1]*to_radians, y_pred[:,1]*to_radians

    # East-west separation, shrunk by the cosine of the mean latitude.
    east_west = (true_long - pred_long)*K.cos((true_lat + pred_lat)/2.)
    north_south = true_lat - pred_lat

    return 6371*K.sqrt(K.square(east_west) + K.square(north_south))

In [9]:
def embedding_input(name, n_in, n_out, reg):
    """Create a single-integer input and its embedding.

    Returns a pair: the named int64 `Input` of shape (1,), and the output of an
    `Embedding` with `n_in` rows and `n_out` columns (L2 weight `reg`) applied to it.
    """
    category_in = Input(shape=(1,), dtype='int64', name=name)
    embedding_layer = Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg))
    return category_in, embedding_layer(category_in)

The following returns a fully-connected model as mentioned in the paper. Takes as input k as defined before, and the cluster centers.

Inputs: Embeddings for each category, concatenated w/ the 4*k continuous variable representing the first/last k coords as mentioned above.

Embeddings have no regularization, as it was not mentioned in paper, though are easily equipped to include.

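Paper mentions global normalization. Didn't specify exactly how they did that, whether they did it sequentially or whatnot. I intended to include a batchnorm layer for the continuous inputs; note that `taxi_mlp` as written below contains no BatchNormalization layer — the coordinates are instead standardized with the pre-computed train mean/std in `feature_ext`.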

After concatenation, 1 hidden layer of 500 neurons as called for in paper.

Finally, output layer has as many outputs as there are cluster centers, w/ a softmax activation. Call this output P.

The prediction is the weighted sum of each cluster center c_i w/ corresponding predicted prob P_i.

To facilitate this, dotted output w/ cluster latitudes and longitudes separately. (this happens at variable y), then concatenated 
    into single tensor.
    
NOTE!!: You will see that I have the cluster center coords as inputs. Ideally, this function should store the cluster longs/lats as a constant to be used in the model, but I could not figure out how. As a consequence, I pass them in as a repeated input.

In [67]:
def taxi_mlp(k, cluster_centers):
    """Build the fully-connected destination model.

    Inputs, in order: the 4*k coordinate features, six single-integer
    categorical inputs (each embedded in 10 dimensions), and the cluster-centre
    longitudes and latitudes (one copy per sample). A 500-unit ReLU layer feeds
    a softmax over the clusters; the output is the softmax-weighted sum of the
    centre longitudes and latitudes, concatenated as (longitude, latitude).
    """
    n_clusters = cluster_centers.shape[0]

    coords_in = Input(shape=(4*k,))
    center_longs = Input(shape=(n_clusters,))
    center_lats = Input(shape=(n_clusters,))

    # (input name, size) per categorical feature; each embedding gets size + 1 rows.
    # NOTE(review): week_of_year ends up with 53 rows (indices 0..52); an ISO week
    # number of 53 would not fit — confirm it cannot occur in the data.
    categorical = [('client_ID', 57106), ('taxi_ID', 448), ('stand_ID', 64),
                   ('quarter_hour', 96), ('day_of_week', 7), ('week_of_year', 52)]
    embedding_dim = 10
    embedding_reg = 0  # no regularisation on the embeddings

    category_inputs = []
    embedded = []
    for feature_name, size in categorical:
        inp, emb = embedding_input(feature_name, size + 1, embedding_dim, embedding_reg)
        category_inputs.append(inp)
        embedded.append(emb)

    flattened = [Flatten()(emb) for emb in embedded]
    hidden = merge([coords_in] + flattened, mode='concat')
    hidden = Dense(500, activation='relu')(hidden)

    cluster_probs = Dense(n_clusters, activation='softmax')(hidden)

    # Expected destination = probability-weighted sum of the centre coordinates.
    pred_long = merge([cluster_probs, center_longs], mode='dot')
    pred_lat = merge([cluster_probs, center_lats], mode='dot')
    prediction = merge([pred_long, pred_lat], mode='concat')

    return Model(input = [coords_in] + category_inputs + [center_longs, center_lats], output = prediction)

As mentioned, construction of repeated cluster longs/lats for input

Iterator for in memory `train` pandas dataframe. I did this as opposed to bcolz iterator due to the pre-processing

In [43]:
def data_iter(data, batch_size, cluster_centers):
    """Endlessly yield (inputs, targets) batches from an in-memory feature frame.

    Parameters
    ----------
    data : DataFrame
        Feature frame with the columns COORD_FEATURES, ORIGIN_CALL, TAXI_ID,
        ORIGIN_STAND, QUARTER_HOUR, DAY_OF_WEEK, WEEK_OF_YEAR and TARGET.
    batch_size : int
        Maximum number of rows per batch; the last batch of a pass may be smaller.
    cluster_centers : array-like of (longitude, latitude) pairs
        Cluster centres; repeated once per row of every batch.

    Yields
    ------
    (list of numpy.ndarray, numpy.ndarray)
        The nine model inputs (same order as `get_features`) and the stacked targets.

    Raises
    ------
    ValueError
        If `data` is empty or `batch_size` is smaller than 1 (raised on the
        first `next()` call, since this is a generator).
    """
    n_rows = data.shape[0]
    if n_rows == 0:
        raise ValueError("data_iter needs at least one row of data")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    centers = np.asarray(cluster_centers)
    center_longs = centers[:, 0]
    center_lats = centers[:, 1]
    feature_cols = ['COORD_FEATURES', 'ORIGIN_CALL', 'TAXI_ID', 'ORIGIN_STAND',
                    'QUARTER_HOUR', 'DAY_OF_WEEK', 'WEEK_OF_YEAR']

    start = 0
    while True:
        batch = data.iloc[start:start + batch_size]
        # Size the repeated centres by the rows actually in the batch, so a short
        # final batch still has matching first dimensions.
        n_batch = batch.shape[0]
        inputs = [np.vstack(batch[name].tolist()) for name in feature_cols]
        inputs.append(np.tile(center_longs, (n_batch, 1)))
        inputs.append(np.tile(center_lats, (n_batch, 1)))
        yield inputs, np.vstack(batch['TARGET'].tolist())

        # Start over after the last row; previously the position kept growing and
        # every later batch was empty.
        start += batch_size
        if start >= n_rows:
            start = 0


In [None]:
# Unfinished scratch line, disabled: `Lambda` is not imported and `thing` is not
# defined anywhere in this notebook, so running it raises NameError.
# x=Lambda(thing)([x,long,lat])

Of course, k in the model needs to match k from feature construction. We again use 5 as they did in the paper

In [68]:
model = taxi_mlp(5, cluster_centers)

Paper used SGD opt w/ following parameters

In [69]:
model.compile(optimizer=SGD(0.01, momentum=0.9), loss=equirectangular_loss, metrics=['mse'])

In [73]:
X_train_feat = get_features(X_train)

In [74]:
X_train_target = get_target(X_train)

In [76]:
X_val_feat = get_features(X_valid)

In [77]:
X_val_target = get_target(X_valid)

In [78]:
tqdm = TQDMNotebookCallback()

In [79]:
checkpoint = ModelCheckpoint(filepath=data_path+'models/tmp/weights.{epoch:03d}.{val_loss:.8f}.hdf5', save_best_only=True)

In [80]:
batch_size=256

### original

In [84]:
model.fit(X_train_feat, X_train_target, nb_epoch=1, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)

          5272/|/[loss: 0.469, mean_squared_error: 0.000] 100%|| 5272/5273 [01:54<00:00, 47.14it/s]

<keras.callbacks.History at 0x7fb2bb8a19e8>

In [None]:
model.fit(X_train_feat, X_train_target, nb_epoch=30, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)

5272/|/[loss: 0.107, mean_squared_error: 0.000] 100%|| 5272/5273 [01:54<00:00, 49.65it/s]

In [20]:
model = load_model(data_path+'models/weights.0.0799.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})

In [42]:
model.fit(X_train_feat, X_train_target, nb_epoch=100, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)


5231/|/[loss: 0.074, mean_squared_error: 0.000] 100%|| 5231/5232 [01:58<00:00, 50.19it/s]


<keras.callbacks.History at 0x7fced25954a8>

In [43]:
model.save(data_path+'models/current_model.hdf5')

### new valid

In [81]:
model.fit(X_train_feat, X_train_target, nb_epoch=1, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)




<keras.callbacks.History at 0x7f82d815c550>

In [None]:
model.fit(X_train_feat, X_train_target, nb_epoch=400, batch_size=batch_size, validation_data=(X_val_feat, X_val_target), callbacks=[tqdm, checkpoint], verbose=0)

In [102]:
model.save(data_path+'/models/current_model.hdf5')

In [84]:
len(X_val_feat[0])

304

It works, but it seems to converge unrealistically quickly and the loss values are not the same. The paper does not mention what it's using as "error" in its results. I assume the same equirectangular? Not very clear. The difference in values could be due to the missing Earth-radius factor (note: `equirectangular_loss` as defined above does multiply by 6371).

## Kaggle Entry

In [23]:
best_model = load_model(data_path+'models/weights.308.0.03373993.hdf5', custom_objects={'equirectangular_loss':equirectangular_loss})

In [104]:
best_model.evaluate(X_val_feat, X_val_target)

 32/304 [==>...........................] - ETA: 0s

[0.033743755401749363, 2.5798687967213293e-07]

In [61]:
test = pd.DataFrame(utils.load_array(data_path+'test/test_features.bc'),columns=['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE', 'LONGITUDE',
                            'COORD_FEATURES', "DAY_OF_WEEK", "QUARTER_HOUR", "WEEK_OF_YEAR"])

In [62]:
test['ORIGIN_CALL'] = pd.read_csv(data_path+'real_origin_call.csv', header=None)

In [63]:
test['TAXI_ID'] = pd.read_csv(data_path+'real_taxi_id.csv',header=None)

In [64]:
X_test = get_features(test)

In [65]:
b = np.sort(X_test[1],axis=None)

In [67]:
test_preds = np.round(best_model.predict(X_test), decimals=6)

In [68]:
d = {0:test['TRIP_ID'], 1:test_preds[:,1], 2:test_preds[:,0]}
kaggle_out = pd.DataFrame(data=d)

In [121]:
kaggle_out.to_csv(data_path+'submission.csv', header=['TRIP_ID','LATITUDE', 'LONGITUDE'], index=False)

In [117]:
def hdist(a, b):
    """Haversine distance between matching rows of two coordinate arrays.

    `a` and `b` are indexed as column 0 = longitude and column 1 = latitude,
    in degrees. Returns one distance per row, scaled by 2 * 6371 times the
    half central angle (6371 is presumably the Earth radius in km).
    """
    to_rad = 3.141592653589793 / 180

    lon_a, lat_a = a[:, 0] * to_rad, a[:, 1] * to_rad
    lon_b, lat_b = b[:, 0] * to_rad, b[:, 1] * to_rad

    lat_gap = abs(lat_a - lat_b)
    lon_gap = abs(lon_a - lon_b)

    # Haversine term: sin^2(dlat/2) + cos(lat1) cos(lat2) sin^2(dlon/2)
    hav = np.sin(lat_gap/2)**2 + np.cos(lat_a) * np.cos(lat_b) * (np.sin(lon_gap/2)**2)
    half_angle = np.arctan2(np.sqrt(hav), np.sqrt(1-hav))

    return 2 * 6371 * half_angle

In [118]:
val_preds = best_model.predict(X_val_feat)

In [88]:
trn_preds = model.predict(X_train_feat)

KeyboardInterrupt: 

In [119]:
er = hdist(val_preds, X_val_target)

In [120]:
er.mean()

0.033741556

In [None]:
# Leftover scratch call, disabled: K.equal() is invoked without the two tensors
# it compares, so running the cell only raises an error.
# K.equal()

To-do: simple to extend to validation data

## Uh oh... training data not representative of test

In [67]:
cuts = [
    1376503200, # 2013-08-14 18:00
    1380616200, # 2013-10-01 08:30
    1381167900, # 2013-10-07 17:45
    1383364800, # 2013-11-02 04:00
    1387722600  # 2013-12-22 14:30
]

In [86]:
np.any([train['TIMESTAMP'].map(lambda x: x in cuts)])

False

In [87]:
train['TIMESTAMP']

0          1372636858
1          1372637303
2          1372636951
3          1372636854
4          1372637091
5          1372636965
6          1372637210
7          1372637299
8          1372637274
9          1372637905
10         1372636875
11         1372637984
12         1372637343
13         1372638595
14         1372638151
15         1372637610
16         1372638481
17         1372639135
18         1372637482
19         1372639181
20         1372638161
21         1372637254
22         1372638502
23         1372639960
24         1372637658
25         1372639092
26         1372639535
27         1372640499
28         1372639635
29         1372640555
              ...    
1710640    1404151621
1710641    1404152121
1710642    1404170192
1710643    1386603894
1710644    1401596832
1710645    1404151410
1710646    1404172198
1710647    1404155241
1710648    1404171548
1710649    1404151498
1710650    1404168899
1710651    1404153627
1710652    1401475142
1710653    1403935197
1710654   

In [90]:
np.any(train['TIMESTAMP']==1381167900)

False

In [91]:
times = train['TIMESTAMP'].as_matrix()

In [98]:
X_train.columns

Index(['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LATITUDE',
       'LONGITUDE', 'TARGET', 'COORD_FEATURES', 'DAY_OF_WEEK', 'QUARTER_HOUR',
       'WEEK_OF_YEAR'],
      dtype='object')

In [92]:
times

array([1372636858, 1372637303, 1372636951, ..., 1388745716, 1404141826, 1404157147])

In [102]:

count = 0
for index, row in X_val.iterrows():
    for ts in cuts:
        time = row['TIMESTAMP']
        latitude = row['LATITUDE']
        if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
            count += 1

In [101]:
one = count

In [104]:
count + one

304

In [6]:
import h5py

In [7]:
h = h5py.File(data_path+'original/data.hdf5', 'r')

In [15]:
# NOTE(review): the recorded run of this lookup raised
# "KeyError: 'Unable to open object (Component not found)'", so this key is not
# present in the opened file; the path looks unrelated to this dataset —
# confirm and remove.
evrData=h['/Configure:0000/Run:0000/CalibCycle:0000/EvrData::DataV3/NoDetector.0:Evr.0/data']

KeyError: 'Unable to open object (Component not found)'

In [13]:
# NOTE(review): the recorded run of this cell failed with
# "OSError: Failed to interpret file ... as a pickle", and the path in that
# message (/data/bckenstler/data/taxi/original/) differs from the `data_path`
# set at the top of the notebook — the output looks stale; re-run to confirm.
c = np.load(data_path+'original/arrival-clusters.pkl')

OSError: Failed to interpret file '/data/bckenstler/data/taxi/original/arrival-clusters.pkl' as a pickle

### hdf5 files

In [10]:
from fuel.utils import find_in_data_path
from fuel.datasets import H5PYDataset


In [7]:
original_path = '/data/bckenstler/data/taxi/original/'

In [33]:
train_set = H5PYDataset(original_path+'data.hdf5', which_sets=('train',),load_in_memory=True)

In [48]:
valid_set = H5PYDataset(original_path+'valid.hdf5', which_sets=('cuts/test_times_0',),load_in_memory=True)

In [34]:
print(train_set.num_examples)

1710670


In [28]:
print(valid_set.num_examples)

304


In [37]:
# NOTE(review): rebinding `data` shadows the `data` module imported in the
# first cell; cells run after this one can no longer reach that module by name.
data = train_set.data_sources

In [44]:
data[0]

array([2, 1, 2, ..., 2, 1, 1], dtype=int8)

In [49]:
valid_data = valid_set.data_sources

In [89]:
valid_data[4][0]

array([ 41.1542,  41.1542,  41.154 ,  41.1539,  41.1542,  41.1544,  41.1542,  41.1538,  41.1533,
        41.1528,  41.1525,  41.1525,  41.1527,  41.1527,  41.1527,  41.1526,  41.1524,  41.1526,
        41.1526,  41.1522,  41.1508,  41.1507,  41.1497,  41.1489,  41.1489,  41.1486,  41.1479,
        41.1475,  41.1468,  41.1461,  41.1463,  41.1464,  41.146 ,  41.1449,  41.1451,  41.1454,
        41.1458,  41.1459,  41.1458,  41.1459,  41.146 ,  41.146 ], dtype=float32)

In [77]:
stamps = valid_data[-3]

In [99]:
stamps[0]

1376502576

In [115]:
# For every timestamp in `stamps`, print whether some row of X_val starts at
# exactly that time. Iterating over `stamps` replaces the hard-coded 304, and
# a set gives constant-time lookups instead of rescanning the whole column
# for each stamp.
val_start_times = set(X_val['TIMESTAMP'])
for stamp in stamps:
    print(int(stamp) in val_start_times)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

In [101]:
type(X_train['TIMESTAMP'][0])

int

In [83]:
type(stamps[0])

numpy.int32

In [78]:
# One boolean per X_val row: does its start timestamp occur in `stamps`?
# Series.isin performs the membership test in a single vectorized pass
# instead of a linear scan of `stamps` for every row.
check = X_val['TIMESTAMP'].isin(stamps).tolist()

In [86]:
# Show each X_val start timestamp as a datetime.
# fromtimestamp() without a tz argument renders in the kernel's local time zone.
for start_ts in X_val['TIMESTAMP']:
    start_dt = datetime.datetime.fromtimestamp(start_ts)
    print(start_dt)

2013-08-14 10:07:32
2013-08-14 10:14:21
2013-08-14 10:28:47
2013-08-14 10:36:23
2013-08-14 10:25:13
2013-08-14 10:31:23
2013-08-14 10:14:21
2013-08-14 10:14:13
2013-08-14 10:03:40
2013-08-14 11:06:08
2013-08-14 11:00:40
2013-08-14 11:18:32
2013-08-14 10:51:01
2013-08-14 10:15:37
2013-08-14 10:42:00
2013-08-14 09:15:51
2013-08-14 10:35:23
2013-08-14 11:05:51
2013-08-14 11:16:11
2013-08-14 11:47:27
2013-08-14 11:35:11
2013-08-14 11:43:53
2013-08-14 12:01:14
2013-08-14 11:09:23
2013-08-14 10:26:21
2013-08-14 11:22:43
2013-08-14 12:07:18
2013-08-14 10:29:38
2013-08-14 11:57:18
2013-08-14 11:23:06
2013-08-14 12:15:02
2013-08-14 11:06:17
2013-08-14 12:33:55
2013-08-13 22:42:40
2013-08-14 12:07:26
2013-08-14 09:02:36
2013-08-14 13:08:03
2013-08-14 07:25:36
2013-08-14 13:37:10
2013-08-14 13:52:50
2013-08-14 14:24:04
2013-08-14 15:15:05
2013-08-14 15:41:34
2013-08-14 19:15:39
2013-08-14 20:28:13
2013-08-14 19:58:07
2013-08-14 21:43:57
2013-08-14 21:41:07
2013-08-14 22:46:27
2013-08-14 23:11:28


In [85]:
# Show each entry of `stamps` as a datetime.
# fromtimestamp() without a tz argument renders in the kernel's local time zone.
for stamp in stamps:
    stamp_dt = datetime.datetime.fromtimestamp(stamp)
    print(stamp_dt)

2013-08-14 10:49:36
2013-08-14 10:59:06
2013-08-14 10:55:42
2013-08-14 10:50:04
2013-08-14 10:50:11
2013-08-14 10:56:57
2013-08-14 10:36:51
2013-08-14 10:44:15
2013-08-14 10:55:50
2013-08-14 10:50:35
2013-08-14 10:50:27
2013-08-14 10:43:57
2013-08-14 10:16:48
2013-08-14 10:40:47
2013-08-14 10:45:55
2013-08-14 10:43:00
2013-08-14 10:53:22
2013-08-14 10:50:03
2013-08-14 10:26:22
2013-08-14 10:59:15
2013-08-14 10:50:17
2013-08-14 10:56:34
2013-08-14 10:53:42
2013-08-14 10:47:46
2013-08-14 10:58:46
2013-08-14 10:24:23
2013-08-14 10:55:19
2013-08-14 10:57:03
2013-08-14 10:56:11
2013-08-14 10:56:52
2013-08-14 10:57:57
2013-08-14 10:08:15
2013-08-14 10:51:14
2013-08-14 10:58:31
2013-08-14 10:47:31
2013-08-14 10:30:36
2013-08-14 10:17:59
2013-08-14 10:48:03
2013-08-14 10:55:52
2013-08-14 10:49:06
2013-08-14 10:58:55
2013-08-14 10:51:24
2013-08-14 10:54:12
2013-08-14 10:54:26
2013-08-14 10:51:18
2013-08-14 10:59:56
2013-08-14 10:48:31
2013-08-14 10:51:56
2013-08-14 10:39:22
2013-08-14 10:57:25


In [71]:
ids = valid_data[-1]

In [74]:
type(ids[0])

numpy.bytes_

In [70]:
ids

["b'1376502576620000126'",
 "b'1376503146620000161'",
 "b'1376502942620000500'",
 "b'1376502604620000105'",
 "b'1376502611620000022'",
 "b'1376503017620000272'",
 "b'1376501811620000617'",
 "b'1376502255620000663'",
 "b'1376502950620000005'",
 "b'1376502635620000276'",
 "b'1376502627620000596'",
 "b'1376502237620000675'",
 "b'1376500608620000409'",
 "b'1376502047620000574'",
 "b'1376502355620000338'",
 "b'1376502180620000080'",
 "b'1376502802620000680'",
 "b'1376502603620000142'",
 "b'1376501182620000651'",
 "b'1376503155620000026'",
 "b'1376502617620000657'",
 "b'1376502994620000604'",
 "b'1376502822620000093'",
 "b'1376502466620000561'",
 "b'1376503126620000410'",
 "b'1376501063620000343'",
 "b'1376502919620000166'",
 "b'1376503023620000010'",
 "b'1376502971620000517'",
 "b'1376503012620000273'",
 "b'1376503077620000470'",
 "b'1376500095620000569'",
 "b'1376502674620000426'",
 "b'1376503111620000674'",
 "b'1376502451620000310'",
 "b'1376501436620000344'",
 "b'1376500679620000108'",
 

In [64]:
X_val

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,LATITUDE,LONGITUDE,TARGET,COORD_FEATURES,DAY_OF_WEEK,QUARTER_HOUR,WEEK_OF_YEAR
0,1376500052620000184,C,0,0,115,1376500052,0,False,"[[-8.649891,41.154399],[-8.649981,41.154417],[...","[-0.0392686, -0.0390627, -0.0440035, -0.049458...","[-0.590024, -0.591592, -0.596627, -0.596793, -...","[-8.61043, 41.1411]","[-0.590024, -0.591592, -0.596627, -0.596793, -...",2,40,33
1,1376500461620000525,C,0,0,214,1376500461,0,False,"[[-8.610876,41.145759],[-8.610849,41.145759],[...","[-0.155839, -0.155839, -0.151619, -0.14673, -0...","[0.0920491, 0.0925159, 0.0985014, 0.105587, 0....","[-8.63072, 41.1547]","[0.0920491, 0.0925159, 0.0985014, 0.105587, 0....",2,40,33
2,1376501327620000095,B,0,11,367,1376501327,0,False,"[[-8.613243,41.166873],[-8.613252,41.166747],[...","[0.129025, 0.127327, 0.125474, 0.118835, 0.104...","[0.0506678, 0.0505178, 0.0497175, 0.0700247, 0...","[-8.61534, 41.1407]","[0.0506678, 0.0505178, 0.0497175, 0.0700247, 0...",2,41,33
3,1376501783620000173,B,0,10,39,1376501783,0,False,"[[-8.606988,41.15025],[-8.607213,41.150007],[-...","[-0.0952637, -0.0985575, -0.112865, -0.113843,...","[0.160023, 0.156088, 0.148386, 0.145868, 0.144...","[-8.55426, 41.1628]","[0.160023, 0.156088, 0.148386, 0.145868, 0.144...",2,42,33
4,1376501113620000252,B,0,13,364,1376501113,0,False,"[[-8.628273,41.157405],[-8.628255,41.157423],[...","[0.00128665, 0.00149252, 0.00236744, 0.0135356...","[-0.212091, -0.211775, -0.209724, -0.20894, -0...","[-8.61928, 41.1786]","[-0.212091, -0.211775, -0.209724, -0.20894, -0...",2,41,33
5,1376501483620000424,B,0,19,25,1376501483,0,False,"[[-8.605818,41.153391],[-8.607339,41.153427],[...","[-0.0528556, -0.0523924, -0.0513116, -0.050694...","[0.18048, 0.153888, 0.112506, 0.0797781, 0.071...","[-8.64643, 41.1616]","[0.18048, 0.153888, 0.112506, 0.0797781, 0.071...",2,42,33
6,1376500461620000326,B,0,14,240,1376500461,0,False,"[[-8.611137,41.149332],[-8.611263,41.149161],[...","[-0.107667, -0.109931, -0.110086, -0.110086, -...","[0.0874808, 0.08528, 0.0849633, 0.0848132, 0.0...","[-8.61446, 41.1422]","[0.0874808, 0.08528, 0.0849633, 0.0848132, 0.0...",2,40,33
7,1376500453620000263,C,0,0,407,1376500453,0,False,"[[-8.586396,41.149224],[-8.586378,41.149026],[...","[-0.109108, -0.111784, -0.11199, -0.107873, -0...","[0.520016, 0.520333, 0.513247, 0.49249, 0.4643...","[-8.58591, 41.1486]","[0.520016, 0.520333, 0.513247, 0.49249, 0.4643...",2,40,33
8,1376499820620000467,C,0,0,270,1376499820,0,False,"[[-8.625177,41.157333],[-8.625609,41.157405],[...","[0.000308796, 0.00128665, 0.00494074, 0.006021...","[-0.157972, -0.165525, -0.194935, -0.202171, -...","[-8.64726, 41.1732]","[-0.157972, -0.165525, -0.194935, -0.202171, -...",2,40,33
9,1376503568620000213,B,0,28,431,1376503568,0,False,"[[-8.584335,41.163111],[-8.585127,41.162922],[...","[0.0782799, 0.0757066, 0.0835809, 0.0913522, 0...","[0.556046, 0.542208, 0.51058, 0.479736, 0.4769...","[-8.58525, 41.1689]","[0.556046, 0.542208, 0.51058, 0.479736, 0.4769...",2,44,33
