In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import  cross_validate
from sklearn.base import BaseEstimator
from sklearn.metrics import (make_scorer, mean_absolute_error)

In [2]:
from sklearn import __version__
__version__


'0.21.2'

In [3]:
from _compute_median import _read_all_data

In [4]:
data = _read_all_data()

In [5]:
data.head(4)

Unnamed: 0,address_type,agency,agency_name,bbl,borough,bridge_highway_direction,bridge_highway_name,bridge_highway_segment,city,closed_date,...,resolution_description,road_ramp,status,street_name,taxi_company_borough,taxi_pick_up_location,unique_key,x_coordinate_state_plane,y_coordinate_state_plane,vehicle_type
0,ADDRESS,DOHMH,Department of Health and Mental Hygiene,5080220000.0,STATEN ISLAND,,,,STATEN ISLAND,NaT,...,The Department of Health and Mental Hygiene wi...,,Open,WOOD AVENUE,,,43058507,916296.0,126389.0,
1,ADDRESS,DOHMH,Department of Health and Mental Hygiene,4097000000.0,QUEENS,,,,Jamaica,NaT,...,The Department of Health and Mental Hygiene wi...,,Open,87 AVENUE,,,43058506,1035684.0,196858.0,
2,INTERSECTION,DOHMH,Department of Health and Mental Hygiene,,BROOKLYN,,,,BROOKLYN,NaT,...,The Department of Health and Mental Hygiene wi...,,Open,,,,43060680,1023962.0,182899.0,
3,ADDRESS,DOHMH,Department of Health and Mental Hygiene,4104670000.0,QUEENS,,,,Hollis,2019-06-25,...,The Department of Health and Mental Hygiene wi...,,Closed,196 STREET,,,43056246,1049383.0,200048.0,


In [6]:
data.columns

Index(['address_type', 'agency', 'agency_name', 'bbl', 'borough',
       'bridge_highway_direction', 'bridge_highway_name',
       'bridge_highway_segment', 'city', 'closed_date', 'community_board',
       'complaint_type', 'created_date', 'cross_street_1', 'cross_street_2',
       'descriptor', 'due_date', 'facility_type', 'incident_address',
       'incident_zip', 'intersection_street_1', 'intersection_street_2',
       'landmark', 'latitude', 'location', 'location_type', 'longitude',
       'open_data_channel_type', 'park_borough', 'park_facility_name',
       'resolution_action_updated_date', 'resolution_description', 'road_ramp',
       'status', 'street_name', 'taxi_company_borough',
       'taxi_pick_up_location', 'unique_key', 'x_coordinate_state_plane',
       'y_coordinate_state_plane', 'vehicle_type'],
      dtype='object')

In [7]:
features = ['complaint_type', 'latitude','longitude', 'created_date']

In [8]:
data['time_to_action'] = (data['resolution_action_updated_date'] - data['created_date']) # / pd.np.timedelta64(1, 'M')

In [9]:
data_ = data.loc[data.complaint_type.str.contains('Noise'), features + ['time_to_action']]
data_ = data_[data_.notnull().all(1)]

In [10]:
data_['time_to_action'] = (data_['time_to_action'].dt.seconds / 3600).astype(int)

In [11]:
y = data_['time_to_action']
X = data_.drop('time_to_action', axis=1)

In [12]:
len(X)

40698

# Little cleaning

In [13]:
X['complaint_type'].unique()

array(['Noise - Commercial', 'Noise - Street/Sidewalk', 'Noise - Vehicle',
       'Noise - Residential', 'Noise', 'Noise - Park',
       'Noise - House of Worship', 'Collection Truck Noise'], dtype=object)

In [14]:
proper_names = {
    'Noise - Commercial':'commercial', 
    'Noise - Residential':'residential',
    'Noise - Street/Sidewalk':'street',
    'Noise - Vehicle':'vehicle', 
    'Noise - Park':'park',
    'Noise':'other', 
    'Noise - House of Worship':'worship', 
    'Collection Truck Noise':'truck'
}

In [15]:
X['complaint_type'] = X['complaint_type'].map(proper_names)

In [16]:
X.head(5)

Unnamed: 0,complaint_type,latitude,longitude,created_date
7,commercial,40.717302,-73.949248,2019-06-23 00:00:00
10,street,40.837576,-73.889396,2019-06-23 00:00:08
11,vehicle,40.833693,-73.913846,2019-06-23 00:00:16
12,residential,40.823469,-73.92446,2019-06-23 00:00:25
13,street,40.848693,-73.903279,2019-06-23 00:00:28


## Feature Generation

In [17]:
# class TimeTransformer(BaseEstimator):
#     cols = None
    
#     def __init__(self, cols=None):
#         self.cols = cols
    
#     def fit(self, X=None, y=None, groups=None):
        
#         if self.cols is None:
#             self.cols = X.select_dtypes(include=pd.np.datetime64).columns
#         return self
    
#     def transform(self, X, y=None, groups=None, cols=None):
        
#         for col in self.cols:
#             dates = X[col]
#             X = X.drop(col, axis=1)
#             X[f'{col}_dow'] = dates.dt.dayofweek
#             X[f'{col}_doy'] = dates.dt.dayofyear
#             X[f'{col}_tod'] = dates.dt.second

#         return X

from ml import TimeTransformer

In [18]:
t = TimeTransformer(cols=['created_date'])

In [19]:
# X.select_dtypes(include=pd.np.datetime64)

In [20]:
t.fit(X).transform(X).head(3)

Unnamed: 0,complaint_type,latitude,longitude,created_date_dow,created_date_doy,created_date_tod
7,commercial,40.717302,-73.949248,6,174,0
10,street,40.837576,-73.889396,6,174,8
11,vehicle,40.833693,-73.913846,6,174,16


In [21]:
cats = X['complaint_type'].unique().tolist()

In [22]:
ct = ColumnTransformer(
        transformers=[
            ('ordinal', OrdinalEncoder(categories=[cats,]), [0]),
            ('time', TimeTransformer(cols=['created_date']), [3])
        ], remainder='passthrough')

In [23]:
model = RandomForestRegressor(n_estimators=100, random_state=2019)

In [24]:
pipe = Pipeline(steps=[('preprocessor', ct),
                        ('model', model)])

## Cross-validate

In [25]:
cv = cross_validate(pipe, X, y, cv=5, scoring=make_scorer(mean_absolute_error),
                    verbose=1, n_jobs=3)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:   27.7s finished


In [26]:
pd.DataFrame(cv)

Unnamed: 0,fit_time,score_time,test_score
0,13.226047,0.170526,2.843741
1,13.476237,0.21102,3.919784
2,13.026504,0.177489,3.015327
3,12.111729,0.160368,3.072551
4,12.182463,0.100503,2.752961


In [27]:
pd.DataFrame(cv)['test_score'].mean()

3.1208729127942547

## Train and store Model

In [28]:
pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ordinal',
                                                  OrdinalEncoder(categories=[['commercial',
                                                                              'street',
                                                                              'vehicle',
                                                                              'residential',
                                                                              'other',
                                                                              'park',
                                                                              'worship',
                                                                     

In [29]:
pipe.predict(X.head(1))[0]

11.37

In [32]:
# import joblib
import pickle

In [33]:
with open('./model.pkl', 'wb') as f:
    joblib.dump(pipe, f)


## Testing

In [67]:
singleton = pd.DataFrame([{'complaint_type':'dummy', 
                           'latitude':1.1111, 
                           'longitude':1.1111,
                           'created_date':pd.to_datetime('2019-01-01')}])

In [68]:
BODY = {
    'complaint_type': 'residential',
    'lat': "40.636626",
    'lon': "-73.951694",
    "date": "2019-06-08 00:00:09"
}

In [69]:
mapping = {
    'lon': 'longitude',
    'lat': 'latitude',
    'date': 'created_date'
}

dtypes = {
    'lon': float,
    'lat': float,
    'date': pd.to_datetime
}

In [70]:
singleton.loc[0, 'complaint_type'] = BODY['complaint_type']

for k, col in mapping.items():
    singleton.loc[0, col] = dtypes[k](BODY.get(k, pd.np.nan))

In [71]:
singleton

Unnamed: 0,complaint_type,created_date,latitude,longitude
0,residential,2019-06-08 00:00:09,40.636626,-73.951694


In [72]:
singleton.dtypes

complaint_type            object
created_date      datetime64[ns]
latitude                 float64
longitude                float64
dtype: object

In [73]:
X.dtypes

complaint_type            object
latitude                 float64
longitude                float64
created_date      datetime64[ns]
dtype: object

In [74]:
pipe.predict(singleton[['complaint_type', 'latitude', 'longitude','created_date']])[0]

0.89

In [75]:
singleton[['complaint_type', 'latitude', 'longitude','created_date']].dtypes

complaint_type            object
latitude                 float64
longitude                float64
created_date      datetime64[ns]
dtype: object