{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.model_selection import cross_validate\n", "from sklearn.base import BaseEstimator\n", "from sklearn.metrics import (make_scorer, mean_absolute_error)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'0.21.2'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import __version__\n", "__version__\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from _compute_median import _read_all_data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "data = _read_all_data()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
address_typeagencyagency_namebblboroughbridge_highway_directionbridge_highway_namebridge_highway_segmentcityclosed_date...resolution_descriptionroad_rampstatusstreet_nametaxi_company_boroughtaxi_pick_up_locationunique_keyx_coordinate_state_planey_coordinate_state_planevehicle_type
0ADDRESSDOHMHDepartment of Health and Mental Hygiene5.080220e+09STATEN ISLANDNaNNaNNaNSTATEN ISLANDNaT...The Department of Health and Mental Hygiene wi...NaNOpenWOOD AVENUENaNNaN43058507916296.0126389.0NaN
1ADDRESSDOHMHDepartment of Health and Mental Hygiene4.097000e+09QUEENSNaNNaNNaNJamaicaNaT...The Department of Health and Mental Hygiene wi...NaNOpen87 AVENUENaNNaN430585061035684.0196858.0NaN
2INTERSECTIONDOHMHDepartment of Health and Mental HygieneNaNBROOKLYNNaNNaNNaNBROOKLYNNaT...The Department of Health and Mental Hygiene wi...NaNOpenNaNNaNNaN430606801023962.0182899.0NaN
3ADDRESSDOHMHDepartment of Health and Mental Hygiene4.104670e+09QUEENSNaNNaNNaNHollis2019-06-25...The Department of Health and Mental Hygiene wi...NaNClosed196 STREETNaNNaN430562461049383.0200048.0NaN
\n", "

4 rows × 41 columns

\n", "
" ], "text/plain": [ " address_type agency agency_name bbl \\\n", "0 ADDRESS DOHMH Department of Health and Mental Hygiene 5.080220e+09 \n", "1 ADDRESS DOHMH Department of Health and Mental Hygiene 4.097000e+09 \n", "2 INTERSECTION DOHMH Department of Health and Mental Hygiene NaN \n", "3 ADDRESS DOHMH Department of Health and Mental Hygiene 4.104670e+09 \n", "\n", " borough bridge_highway_direction bridge_highway_name \\\n", "0 STATEN ISLAND NaN NaN \n", "1 QUEENS NaN NaN \n", "2 BROOKLYN NaN NaN \n", "3 QUEENS NaN NaN \n", "\n", " bridge_highway_segment city closed_date ... \\\n", "0 NaN STATEN ISLAND NaT ... \n", "1 NaN Jamaica NaT ... \n", "2 NaN BROOKLYN NaT ... \n", "3 NaN Hollis 2019-06-25 ... \n", "\n", " resolution_description road_ramp status \\\n", "0 The Department of Health and Mental Hygiene wi... NaN Open \n", "1 The Department of Health and Mental Hygiene wi... NaN Open \n", "2 The Department of Health and Mental Hygiene wi... NaN Open \n", "3 The Department of Health and Mental Hygiene wi... 
NaN Closed \n", "\n", " street_name taxi_company_borough taxi_pick_up_location unique_key \\\n", "0 WOOD AVENUE NaN NaN 43058507 \n", "1 87 AVENUE NaN NaN 43058506 \n", "2 NaN NaN NaN 43060680 \n", "3 196 STREET NaN NaN 43056246 \n", "\n", " x_coordinate_state_plane y_coordinate_state_plane vehicle_type \n", "0 916296.0 126389.0 NaN \n", "1 1035684.0 196858.0 NaN \n", "2 1023962.0 182899.0 NaN \n", "3 1049383.0 200048.0 NaN \n", "\n", "[4 rows x 41 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head(4)" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['address_type', 'agency', 'agency_name', 'bbl', 'borough',\n", " 'bridge_highway_direction', 'bridge_highway_name',\n", " 'bridge_highway_segment', 'city', 'closed_date', 'community_board',\n", " 'complaint_type', 'created_date', 'cross_street_1', 'cross_street_2',\n", " 'descriptor', 'due_date', 'facility_type', 'incident_address',\n", " 'incident_zip', 'intersection_street_1', 'intersection_street_2',\n", " 'landmark', 'latitude', 'location', 'location_type', 'longitude',\n", " 'open_data_channel_type', 'park_borough', 'park_facility_name',\n", " 'resolution_action_updated_date', 'resolution_description', 'road_ramp',\n", " 'status', 'street_name', 'taxi_company_borough',\n", " 'taxi_pick_up_location', 'unique_key', 'x_coordinate_state_plane',\n", " 'y_coordinate_state_plane', 'vehicle_type'],\n", " dtype='object')" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.columns" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "features = ['complaint_type', 'latitude','longitude', 'created_date']" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Elapsed time between ticket creation and the last resolution-action update (timedelta column).\n", "data['time_to_action'] = (data['resolution_action_updated_date'] - data['created_date'])" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "data_ = data.loc[data.complaint_type.str.contains('Noise'), features + ['time_to_action']]\n", "data_ = data_[data_.notnull().all(1)]" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Convert the elapsed time to whole hours. total_seconds() includes the days component of the\n", "# timedelta; .dt.seconds is only the 0-86399 s remainder, which folds any delay of a day or\n", "# more back into the 0-23 h range.\n", "# NOTE: this cell overwrites the timedelta column, so re-run the cells above before re-running it.\n", "data_['time_to_action'] = (data_['time_to_action'].dt.total_seconds() / 3600).astype(int)" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "y = data_['time_to_action']\n", "X = data_.drop('time_to_action', axis=1)" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "40698" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(X)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Little cleaning" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Noise - Commercial', 'Noise - Street/Sidewalk', 'Noise - Vehicle',\n", " 'Noise - Residential', 'Noise', 'Noise - Park',\n", " 'Noise - House of Worship', 'Collection Truck Noise'], dtype=object)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X['complaint_type'].unique()" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "proper_names = {\n", " 'Noise - Commercial':'commercial', \n", " 'Noise - Residential':'residential',\n", " 'Noise - Street/Sidewalk':'street',\n", " 'Noise - Vehicle':'vehicle', \n", " 'Noise - Park':'park',\n", " 'Noise':'other', \n", " 'Noise - House of Worship':'worship', \n", " 'Collection Truck Noise':'truck'\n", "}" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "X['complaint_type'] = X['complaint_type'].map(proper_names)" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html":
[ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
complaint_typelatitudelongitudecreated_date
7commercial40.717302-73.9492482019-06-23 00:00:00
10street40.837576-73.8893962019-06-23 00:00:08
11vehicle40.833693-73.9138462019-06-23 00:00:16
12residential40.823469-73.9244602019-06-23 00:00:25
13street40.848693-73.9032792019-06-23 00:00:28
\n", "
" ], "text/plain": [ " complaint_type latitude longitude created_date\n", "7 commercial 40.717302 -73.949248 2019-06-23 00:00:00\n", "10 street 40.837576 -73.889396 2019-06-23 00:00:08\n", "11 vehicle 40.833693 -73.913846 2019-06-23 00:00:16\n", "12 residential 40.823469 -73.924460 2019-06-23 00:00:25\n", "13 street 40.848693 -73.903279 2019-06-23 00:00:28" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.head()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Generation" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# TimeTransformer is imported from the project-local `ml` module. The inline draft that used to\n", "# be commented out in this cell replaced every column listed in `cols` (all datetime64 columns\n", "# when `cols` is None) with three derived columns:\n", "#   <col>_dow = dates.dt.dayofweek, <col>_doy = dates.dt.dayofyear, <col>_tod = dates.dt.second\n", "# NOTE(review): presumably ml.TimeTransformer still matches that draft (the preview a few cells\n", "# below shows the same three columns). `_tod` comes from `.dt.second`, i.e. the seconds field\n", "# (0-59) rather than a time of day - confirm that this is intended.\n", "from ml import TimeTransformer" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# Stand-alone instance, used only to preview the generated date features.\n", "t = TimeTransformer(cols=['created_date'])" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# X.select_dtypes(include=pd.np.datetime64)" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
complaint_typelatitudelongitudecreated_date_dowcreated_date_doycreated_date_tod
7commercial40.717302-73.94924861740
10street40.837576-73.88939661748
11vehicle40.833693-73.913846617416
\n", "
" ], "text/plain": [ " complaint_type latitude longitude created_date_dow created_date_doy \\\n", "7 commercial 40.717302 -73.949248 6 174 \n", "10 street 40.837576 -73.889396 6 174 \n", "11 vehicle 40.833693 -73.913846 6 174 \n", "\n", " created_date_tod \n", "7 0 \n", "10 8 \n", "11 16 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t.fit(X).transform(X).head(3)" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# Category labels in order of first appearance in X; this order defines the ordinal codes.\n", "cats = list(X['complaint_type'].unique())" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Columns are addressed by position: 0 is complaint_type and 3 is created_date.\n", "ordinal_step = ('ordinal', OrdinalEncoder(categories=[cats]), [0])\n", "time_step = ('time', TimeTransformer(cols=['created_date']), [3])\n", "\n", "# latitude / longitude are not listed above and pass through unchanged.\n", "ct = ColumnTransformer([ordinal_step, time_step], remainder='passthrough')" ] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "model = RandomForestRegressor(random_state=2019, n_estimators=100)" ] },
{ "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "pipe = Pipeline([\n", "    ('preprocessor', ct),\n", "    ('model', model),\n", "])" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Cross-validate" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.\n", "[Parallel(n_jobs=3)]: Done 5 out of 5 | elapsed: 27.7s finished\n" ] } ], "source": [ "# test_score is the plain mean absolute error of each fold, so lower is better.\n", "mae_scorer = make_scorer(mean_absolute_error)\n", "cv = cross_validate(pipe, X, y, scoring=mae_scorer, cv=5, n_jobs=3, verbose=1)" ] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fit_timescore_timetest_score
013.2260470.1705262.843741
113.4762370.2110203.919784
213.0265040.1774893.015327
312.1117290.1603683.072551
412.1824630.1005032.752961
\n", "
" ], "text/plain": [ " fit_time score_time test_score\n", "0 13.226047 0.170526 2.843741\n", "1 13.476237 0.211020 3.919784\n", "2 13.026504 0.177489 3.015327\n", "3 12.111729 0.160368 3.072551\n", "4 12.182463 0.100503 2.752961" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(cv)" ] },
{ "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.1208729127942547" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(cv)['test_score'].mean()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Train and store Model" ] },
{ "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(memory=None,\n", " steps=[('preprocessor',\n", " ColumnTransformer(n_jobs=None, remainder='passthrough',\n", " sparse_threshold=0.3,\n", " transformer_weights=None,\n", " transformers=[('ordinal',\n", " OrdinalEncoder(categories=[['commercial',\n", " 'street',\n", " 'vehicle',\n", " 'residential',\n", " 'other',\n", " 'park',\n", " 'worship',\n", " 'truck']],\n", " dtype=),\n", " [0]),\n", " ('time',\n", " TimeTransformer(cols=['create...\n", " verbose=False)),\n", " ('model',\n", " RandomForestRegressor(bootstrap=True, criterion='mse',\n", " max_depth=None, max_features='auto',\n", " max_leaf_nodes=None,\n", " min_impurity_decrease=0.0,\n", " min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0,\n", " n_estimators=100, n_jobs=None,\n", " oob_score=False, random_state=2019,\n", " verbose=0, warm_start=False))],\n", " verbose=False)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe.fit(X, y)" ] },
{ "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11.37" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe.predict(X.head(1))[0]" ] },
{ "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# import joblib\n", "import pickle" ] },
{ "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# Persist the fitted pipeline with the stdlib pickle module imported in the previous cell.\n", "# The former joblib.dump call raised NameError on a fresh kernel because joblib is never imported.\n", "with open('./model.pkl', 'wb') as f:\n", "    pickle.dump(pipe, f)\n" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Testing" ] },
{ "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "# One-row template frame; its placeholder values are overwritten from BODY below.\n", "singleton = pd.DataFrame([{'complaint_type':'dummy', \n", " 'latitude':1.1111, \n", " 'longitude':1.1111,\n", " 'created_date':pd.to_datetime('2019-01-01')}])" ] },
{ "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "BODY = {\n", " 'complaint_type': 'residential',\n", " 'lat': \"40.636626\",\n", " 'lon': \"-73.951694\",\n", " \"date\": \"2019-06-08 00:00:09\"\n", "}" ] },
{ "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "mapping = {\n", " 'lon': 'longitude',\n", " 'lat': 'latitude',\n", " 'date': 'created_date'\n", "}\n", "\n", "dtypes = {\n", " 'lon': float,\n", " 'lat': float,\n", " 'date': pd.to_datetime\n", "}" ] },
{ "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "singleton.loc[0, 'complaint_type'] = BODY['complaint_type']\n", "\n", "# Convert each BODY field with its converter and store it under the model's column name.\n", "# float('nan') is the same missing-value default as the deprecated pd.np.nan alias.\n", "for k, col in mapping.items():\n", "    singleton.loc[0, col] = dtypes[k](BODY.get(k, float('nan')))" ] },
{ "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
complaint_typecreated_datelatitudelongitude
0residential2019-06-08 00:00:0940.636626-73.951694
\n", "
" ], "text/plain": [ " complaint_type created_date latitude longitude\n", "0 residential 2019-06-08 00:00:09 40.636626 -73.951694" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "singleton" ] },
{ "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "complaint_type object\n", "created_date datetime64[ns]\n", "latitude float64\n", "longitude float64\n", "dtype: object" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "singleton.dtypes" ] },
{ "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "complaint_type object\n", "latitude float64\n", "longitude float64\n", "created_date datetime64[ns]\n", "dtype: object" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.dtypes" ] },
{ "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.89" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The preprocessor selects columns by position, so put the row into the training column order\n", "# (`features`) before predicting.\n", "pipe.predict(singleton[features])[0]" ] },
{ "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "complaint_type object\n", "latitude float64\n", "longitude float64\n", "created_date datetime64[ns]\n", "dtype: object" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "singleton[features].dtypes" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }