{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Bike Sharing Dataset using Decision Tree Regressor\n", "\n", "+ Based on the Bike Sharing dataset from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset)\n", "+ This notebook uses the hourly data file, i.e. hour.csv\n", "+ This notebook showcases regression using Decision Trees" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Problem Statement\n", "Given the Bike Sharing dataset with hourly rental information along with weather and other attributes, model a system that can predict the hourly bike count." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import required packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "\n", "# data manipulation\n", "import numpy as np\n", "import pandas as pd\n", "\n", "# modeling utilities\n", "import pydotplus\n", "from sklearn import tree\n", "from sklearn import metrics\n", "from sklearn import preprocessing\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.model_selection import train_test_split\n", "\n", "\n", "# plotting libraries\n", "import matplotlib.pyplot as plt\n", "import seaborn as sn\n", "\n", "\n", "sn.set_style('whitegrid')\n", "sn.set_context('talk')\n", "params = {'legend.fontsize': 'x-large',\n", " 'figure.figsize': (30, 10),\n", " 'axes.labelsize': 'x-large',\n", " 'axes.titlesize':'x-large',\n", " 'xtick.labelsize':'x-large',\n", " 'ytick.labelsize':'x-large'}\n", "\n", "plt.rcParams.update(params)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape of dataset::(17379, 17)\n" ] } ], "source": [ "hour_df = pd.read_csv('hour.csv')\n", "print(\"Shape of dataset::{}\".format(hour_df.shape))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocessing\n", "+ Standardize column names\n", "+ Typecast attributes\n", "+ Encode categoricals using One Hot Encoding" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Standardize Column Names" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "hour_df.rename(columns={'instant':'rec_id',\n", " 'dteday':'datetime',\n", " 'holiday':'is_holiday',\n", " 'workingday':'is_workingday',\n", " 'weathersit':'weather_condition',\n", " 'hum':'humidity',\n", " 'mnth':'month',\n", " 'cnt':'total_count',\n", " 'hr':'hour',\n", " 'yr':'year'},inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Typecast Attributes" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# date time conversion\n", "hour_df['datetime'] = pd.to_datetime(hour_df.datetime)\n", "\n", "# categorical variables\n", "hour_df['season'] = hour_df.season.astype('category')\n", "hour_df['is_holiday'] = hour_df.is_holiday.astype('category')\n", "hour_df['weekday'] = hour_df.weekday.astype('category')\n", "hour_df['weather_condition'] = hour_df.weather_condition.astype('category')\n", "hour_df['is_workingday'] = hour_df.is_workingday.astype('category')\n", "hour_df['month'] = hour_df.month.astype('category')\n",
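"# note (added): year and hour are stored as small integer codes in the raw data;\n", "# casting them to 'category' below lets the encoders treat them as discrete levels\n",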
"hour_df['year'] = hour_df.year.astype('category')\n", "hour_df['hour'] = hour_df.hour.astype('category')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "### Encode Categoricals (One Hot Encoding)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def fit_transform_ohe(df,col_name):\n", " \"\"\"This function performs one hot encoding for the specified\n", " column.\n", "\n", " Args:\n", " df(pandas.DataFrame): the data frame containing the mentioned column name\n", " col_name: the column to be one hot encoded\n", "\n", " Returns:\n", " tuple: label_encoder, one_hot_encoder, transformed column as pandas Series\n", "\n", " \"\"\"\n", " # label encode the column\n", " le = preprocessing.LabelEncoder()\n", " le_labels = le.fit_transform(df[col_name])\n", " df[col_name+'_label'] = le_labels\n", " \n", " # one hot encoding\n", " ohe = preprocessing.OneHotEncoder()\n", " feature_arr = ohe.fit_transform(df[[col_name+'_label']]).toarray()\n", " feature_labels = [col_name+'_'+str(cls_label) for cls_label in le.classes_]\n", " features_df = pd.DataFrame(feature_arr, columns=feature_labels)\n", " \n", " return le,ohe,features_df\n", "\n", "# given label encoder and one hot encoder objects, \n", "# encode attribute to ohe\n", "def transform_ohe(df,le,ohe,col_name):\n", " \"\"\"This function performs one hot encoding for the specified\n", " column using the specified encoder objects.\n", "\n", " Args:\n", " df(pandas.DataFrame): the data frame containing the mentioned column name\n", " le(Label Encoder): the label encoder object used to fit label encoding\n", " ohe(One Hot Encoder): the onen hot encoder object used to fit one hot encoding\n", " col_name: the column to be one hot encoded\n", "\n", " Returns:\n", " tuple: transformed column as pandas Series\n", "\n", " \"\"\"\n", " # label encode\n", " col_labels = le.transform(df[col_name])\n", " df[col_name+'_label'] = col_labels\n", " \n", " # ohe \n", " feature_arr = ohe.fit_transform(df[[col_name+'_label']]).toarray()\n", " feature_labels = [col_name+'_'+str(cls_label) for cls_label in le.classes_]\n", " features_df = pd.DataFrame(feature_arr, columns=feature_labels)\n", " \n", " return features_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Train-Test Split" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training set::(11643, 15)(11643, 2)\n", "Testing set::(5736, 15)\n" ] } ], "source": [ "X, X_test, y, y_test = train_test_split(hour_df.iloc[:,0:-3], hour_df.iloc[:,-1], \n", " test_size=0.33, random_state=42)\n", "\n", "X.reset_index(inplace=True)\n", "y = y.reset_index()\n", "\n", "X_test.reset_index(inplace=True)\n", "y_test = y_test.reset_index()\n", "\n", "print(\"Training set::{}{}\".format(X.shape,y.shape))\n", "print(\"Testing set::{}\".format(X_test.shape))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "cat_attr_list = ['season','is_holiday',\n", " 'weather_condition','is_workingday',\n", " 'hour','weekday','month','year']\n", "numeric_feature_cols = ['temp','humidity','windspeed','hour','weekday','month','year']\n", "subset_cat_features = ['season','is_holiday','weather_condition','is_workingday']" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "encoded_attr_list = []\n", "for col in cat_attr_list:\n", " return_obj = 
fit_transform_ohe(X,col)\n", " encoded_attr_list.append({'label_enc':return_obj[0],\n", " 'ohe_enc':return_obj[1],\n", " 'feature_df':return_obj[2],\n", " 'col_name':col})" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape::(11643, 19)\n" ] } ], "source": [ "feature_df_list = [X[numeric_feature_cols]]\n", "feature_df_list.extend([enc['feature_df'] \\\n", " for enc in encoded_attr_list \\\n", " if enc['col_name'] in subset_cat_features])\n", "\n", "train_df_new = pd.concat(feature_df_list, axis=1)\n", "print(\"Shape::{}\".format(train_df_new.shape))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Decision Tree based Regression" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X = train_df_new\n", "y = y.total_count.values.reshape(-1,1)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((11643, 19), (11643, 1))" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape,y.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sample Decision Tree Regressor\n", "Fit a shallow tree with hand-picked hyperparameters as a baseline. Note that the score reported below is R-squared on the training data itself, so it says nothing about generalization; grid search with cross validation follows." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,\n", " max_leaf_nodes=10, min_impurity_split=1e-07, min_samples_leaf=1,\n", " min_samples_split=5, min_weight_fraction_leaf=0.0,\n", " presort=False, random_state=None, splitter='best')" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dtr = DecisionTreeRegressor(max_depth=4,\n", " min_samples_split=5,\n", " max_leaf_nodes=10)\n", "dtr.fit(X,y)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.60565765621037793" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dtr.score(X,y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plot the Learnt Model\n", "Export the fitted tree to DOT format and render it as a PDF (requires the Graphviz binaries in addition to the pydotplus package)." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dot_data = tree.export_graphviz(dtr, out_file=None)\n", "graph = pydotplus.graph_from_dot_data(dot_data)\n", "graph.write_pdf(\"bikeshare.pdf\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Grid Search With Cross Validation" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "param_grid = {\"criterion\": [\"mse\", \"mae\"],\n", " \"min_samples_split\": [10, 20, 40],\n", " \"max_depth\": [2, 6, 8],\n", " \"min_samples_leaf\": [20, 40, 100],\n", " \"max_leaf_nodes\": [5, 20, 100, 500, 800],\n", " }" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "grid_cv_dtr = GridSearchCV(dtr, param_grid, cv=5)" ] },
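{ "cell_type": "markdown", "metadata": {}, "source": [ "The grid above spans 2 criteria × 3 split sizes × 3 depths × 3 leaf sizes × 5 leaf-node caps = 270 candidate models, and cv=5 fits each candidate once per fold. A small sketch (added for illustration) to compute that before launching the search:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch (added): count grid candidates and total fits for the search above\n", "import numpy as np\n", "n_candidates = int(np.prod([len(v) for v in param_grid.values()]))\n", "print(\"candidates::{} total fits::{}\".format(n_candidates, n_candidates * 5))" ] },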
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=5, error_score='raise',\n", " estimator=DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,\n", " max_leaf_nodes=10, min_impurity_split=1e-07, min_samples_leaf=1,\n", " min_samples_split=5, min_weight_fraction_leaf=0.0,\n", " presort=False, random_state=None, splitter='best'),\n", " fit_params={}, iid=True, n_jobs=1,\n", " param_grid={'min_samples_split': [10, 20, 40], 'max_leaf_nodes': [5, 20, 100, 500, 800], 'min_samples_leaf': [20, 40, 100], 'max_depth': [2, 6, 8], 'criterion': ['mse', 'mae']},\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", " scoring=None, verbose=0)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_cv_dtr.fit(X,y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cross Validation: Best Model Details" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "R-Squared::0.85891903233008\n", "Best Hyperparameters::\n", "{'min_samples_split': 10, 'max_leaf_nodes': 500, 'min_samples_leaf': 20, 'max_depth': 8, 'criterion': 'mse'}\n" ] } ], "source": [ "print(\"R-Squared::{}\".format(grid_cv_dtr.best_score_))\n", "print(\"Best Hyperparameters::\\n{}\".format(grid_cv_dtr.best_params_))" ] },
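{ "cell_type": "markdown", "metadata": {}, "source": [ "Since refit=True (the default, visible in the repr above), GridSearchCV refits the best configuration on the full training set, so the tuned tree is available directly. A small sketch (added for illustration; output not recorded here):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch (added): inspect the refit best estimator and its training-set R-squared\n", "best_dtr = grid_cv_dtr.best_estimator_\n", "print(best_dtr)\n", "print(\"Training R-Squared::{}\".format(best_dtr.score(X, y)))" ] },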
{ "cell_type": "code", "execution_count": 19, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "<thead>\n", "<tr style=\"text-align: right;\"><th></th><th>mean_fit_time</th><th>mean_score_time</th><th>mean_test_score</th><th>mean_train_score</th><th>param_criterion</th><th>param_max_depth</th><th>param_max_leaf_nodes</th><th>param_min_samples_leaf</th><th>param_min_samples_split</th><th>params</th><th>...</th><th>split2_test_score</th><th>split2_train_score</th><th>split3_test_score</th><th>split3_train_score</th><th>split4_test_score</th><th>split4_train_score</th><th>std_fit_time</th><th>std_score_time</th><th>std_test_score</th><th>std_train_score</th></tr>\n", "</thead>\n", "<tbody>\n", "<tr><th>0</th><td>0.025334</td><td>0.004203</td><td>0.48401</td><td>0.48875</td><td>mse</td><td>2</td><td>5</td><td>20</td><td>10</td><td>{'min_samples_split': 10, 'max_leaf_nodes': 5,...</td><td>...</td><td>0.486478</td><td>0.48915</td><td>0.475286</td><td>0.491167</td><td>0.491096</td><td>0.487932</td><td>0.001725</td><td>0.000401</td><td>0.007223</td><td>0.002883</td></tr>\n", "<tr><th>1</th><td>0.023615</td><td>0.004169</td><td>0.48401</td><td>0.48875</td><td>mse</td><td>2</td><td>5</td><td>20</td><td>20</td><td>{'min_samples_split': 20, 'max_leaf_nodes': 5,...</td><td>...</td><td>0.486478</td><td>0.48915</td><td>0.475286</td><td>0.491167</td><td>0.491096</td><td>0.487932</td><td>0.001019</td><td>0.000494</td><td>0.007223</td><td>0.002883</td></tr>\n", "<tr><th>2</th><td>0.024118</td><td>0.004603</td><td>0.48401</td><td>0.48875</td><td>mse</td><td>2</td><td>5</td><td>20</td><td>40</td><td>{'min_samples_split': 40, 'max_leaf_nodes': 5,...</td><td>...</td><td>0.486478</td><td>0.48915</td><td>0.475286</td><td>0.491167</td><td>0.491096</td><td>0.487932</td><td>0.002801</td><td>0.000800</td><td>0.007223</td><td>0.002883</td></tr>\n", "<tr><th>3</th><td>0.025617</td><td>0.004905</td><td>0.48401</td><td>0.48875</td><td>mse</td><td>2</td><td>5</td><td>40</td><td>10</td><td>{'min_samples_split': 10, 'max_leaf_nodes': 5,...</td><td>...</td><td>0.486478</td><td>0.48915</td><td>0.475286</td><td>0.491167</td><td>0.491096</td><td>0.487932</td><td>0.002578</td><td>0.001360</td><td>0.007223</td><td>0.002883</td></tr>\n", "<tr><th>4</th><td>0.025018</td><td>0.004804</td><td>0.48401</td><td>0.48875</td><td>mse</td><td>2</td><td>5</td><td>40</td><td>20</td><td>{'min_samples_split': 20, 'max_leaf_nodes': 5,...</td><td>...</td><td>0.486478</td><td>0.48915</td><td>0.475286</td><td>0.491167</td><td>0.491096</td><td>0.487932</td><td>0.002550</td><td>0.000749</td><td>0.007223</td><td>0.002883</td></tr>\n", "</tbody>\n", "</table>\n", "<p>5 rows × 25 columns</p>
\n", "