{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.metrics import log_loss\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.ensemble import VotingClassifier\n",
    "from sklearn.pipeline import Pipeline, make_pipeline\n",
    "from catboost import CatBoostClassifier\n",
    "from xgboost import XGBClassifier\n",
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the modelling dataset; later cells read its Season, ShotDistance,\n",
    "# ShotValue and Made columns in addition to the feature columns.\n",
    "all_data = pd.read_csv('data_for_model.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for 2016-17 season train on 2012-13 through 2015-16\n",
    "training_seasons = [12, 13, 14, 15]\n",
    "testing_season = 16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns of all_data passed as features to every shot-zone model.\n",
    "all_model_features = [\n",
    "    'Period', 'StartScoreDifferential', 'Time', 'Putback', 'IsRegularSeason',\n",
    "    'SecondsSincePlayStarted', 'ShotDistance', 'ShotAngle',\n",
    "    'is_OffDeadball', 'is_OffFTMake', 'is_OffFTMiss', 'is_OffFTOreb',\n",
    "    'is_OffLiveBallTurnover', 'is_OffTeamBlockedOreb', 'is_OffTeamOreb',\n",
    "    'is_OffTimeout', 'is_OffBlockedOreb', 'is_OffBlock', 'is_OffMadeFG',\n",
    "    'is_OffOreb', 'is_OffMissedFG'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RA training log loss: 0.6247\n",
      "2pt non-RA training log loss: 0.6685\n",
      "3pt training log loss: 0.6486\n",
      "RA testing log loss: 0.6197\n",
      "2pt non-RA testing log loss: 0.6739\n",
      "3pt testing log loss: 0.6503\n",
      "All shots testing log loss: 0.6478\n"
     ]
    }
   ],
   "source": [
    "def build_voting_grid(xgb_model, catboost_model, weights):\n",
    "    \"\"\"Wrap two classifiers in a soft-voting ensemble inside a grid search.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    xgb_model, catboost_model : estimator\n",
    "        Unfitted classifiers, registered as 'xgb' and 'catboost'.\n",
    "    weights : list\n",
    "        Candidate values for the VotingClassifier ``weights`` parameter.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        ``(pipe, param_grid, grid)``: the single-step Pipeline, the parameter\n",
    "        grid, and an unfitted GridSearchCV (cv=10) scored by 'neg_log_loss'.\n",
    "    \"\"\"\n",
    "    pipe = Pipeline(\n",
    "        [\n",
    "            ('model', VotingClassifier(\n",
    "                estimators=[\n",
    "                    ('xgb', xgb_model),\n",
    "                    ('catboost', catboost_model)\n",
    "                ],\n",
    "                voting='soft'\n",
    "            ))\n",
    "        ]\n",
    "    )\n",
    "    param_grid = {\n",
    "        'model__weights': weights,\n",
    "    }\n",
    "    grid = GridSearchCV(pipe, cv=10, scoring='neg_log_loss', n_jobs=1, param_grid=param_grid)\n",
    "    return pipe, param_grid, grid\n",
    "\n",
    "\n",
    "def cv_log_loss(grid):\n",
    "    \"\"\"Return the best mean cross-validated log loss of a fitted grid search.\n",
    "\n",
    "    The grids here are scored with 'neg_log_loss', so ``best_score_`` holds the\n",
    "    negated loss; the sign is flipped so the reported loss is non-negative and\n",
    "    comparable with the ``log_loss`` values printed for the test season.\n",
    "    \"\"\"\n",
    "    return -grid.best_score_\n",
    "\n",
    "\n",
    "def predict_make_probabilities(grid, X):\n",
    "    \"\"\"Run ``predict_proba`` on ``X.values`` and label the two output columns.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        ``(predictions, predictions_df)``: the raw probability array and a\n",
    "        DataFrame of the same values with columns ['miss', 'make'].\n",
    "    \"\"\"\n",
    "    predictions = grid.predict_proba(X.values)\n",
    "    # NOTE(review): assumes the class order is (miss, make), i.e. Made is 0/1 - confirm.\n",
    "    return predictions, pd.DataFrame(predictions, columns=['miss', 'make'])\n",
    "\n",
    "\n",
    "training_data = all_data[all_data.Season.isin(training_seasons)]\n",
    "\n",
    "# One model per shot zone; zones are defined by ShotDistance and ShotValue only.\n",
    "at_rim = training_data[training_data.ShotDistance < 5]\n",
    "non_rim_2pt = training_data[(training_data.ShotDistance >= 5) & (training_data.ShotValue == 2)]\n",
    "jump_shots_3pt = training_data[(training_data.ShotValue == 3) & (training_data.ShotDistance < 35)]\n",
    "\n",
    "# --- At-rim model (reported as 'RA') ---\n",
    "at_rim_xgboost_model = XGBClassifier(\n",
    "    random_state=909,\n",
    "    n_estimators=300,\n",
    "    max_depth=5,\n",
    "    learning_rate=0.1,\n",
    "    min_child_weight=3\n",
    ")\n",
    "\n",
    "at_rim_catboost_classifier = CatBoostClassifier(\n",
    "    bagging_temperature=1,\n",
    "    rsm=0.1,\n",
    "    learning_rate=0.1,\n",
    "    depth=7,\n",
    "    verbose=False,\n",
    "    random_seed=909\n",
    ")\n",
    "\n",
    "# A single candidate: both ensemble members weighted equally.\n",
    "at_rim_weights = [[1, 1]]\n",
    "at_rim_pipe, at_rim_param_grid, at_rim_grid = build_voting_grid(\n",
    "    at_rim_xgboost_model, at_rim_catboost_classifier, at_rim_weights\n",
    ")\n",
    "\n",
    "X_at_rim = at_rim[all_model_features]\n",
    "y_at_rim = at_rim.Made\n",
    "\n",
    "at_rim_grid.fit(X_at_rim.values, y_at_rim.values)\n",
    "print('RA training log loss: %.4f' % cv_log_loss(at_rim_grid))\n",
    "\n",
    "# --- 2pt model for shots outside the at-rim zone ---\n",
    "non_rim_2pt_xgboost_model = XGBClassifier(\n",
    "    random_state=909,\n",
    "    n_estimators=100,\n",
    "    max_depth=5,\n",
    "    learning_rate=0.1,\n",
    "    min_child_weight=5\n",
    ")\n",
    "\n",
    "non_rim_2pt_catboost_classifier = CatBoostClassifier(\n",
    "    bagging_temperature=1,\n",
    "    rsm=0.1,\n",
    "    learning_rate=0.04,\n",
    "    depth=5,\n",
    "    verbose=False,\n",
    "    random_seed=909\n",
    ")\n",
    "\n",
    "non_rim_2pt_weights = [[1, 1]]\n",
    "non_rim_2pt_pipe, non_rim_2pt_param_grid, non_rim_2pt_grid = build_voting_grid(\n",
    "    non_rim_2pt_xgboost_model, non_rim_2pt_catboost_classifier, non_rim_2pt_weights\n",
    ")\n",
    "\n",
    "X_non_rim_2pt = non_rim_2pt[all_model_features]\n",
    "y_non_rim_2pt = non_rim_2pt.Made\n",
    "\n",
    "non_rim_2pt_grid.fit(X_non_rim_2pt.values, y_non_rim_2pt.values)\n",
    "print('2pt non-RA training log loss: %.4f' % cv_log_loss(non_rim_2pt_grid))\n",
    "\n",
    "# --- 3pt model (ShotDistance < 35 only) ---\n",
    "jump_shots_3pt_xgboost_model = XGBClassifier(\n",
    "    random_state=909,\n",
    "    n_estimators=100,\n",
    "    max_depth=5,\n",
    "    learning_rate=0.1,\n",
    "    min_child_weight=3\n",
    ")\n",
    "\n",
    "jump_shots_3pt_catboost_classifier = CatBoostClassifier(\n",
    "    bagging_temperature=1,\n",
    "    rsm=0.1,\n",
    "    learning_rate=0.08,\n",
    "    depth=5,\n",
    "    verbose=False,\n",
    "    random_seed=909\n",
    ")\n",
    "\n",
    "jump_shots_3pt_weights = [[1, 1]]\n",
    "jump_shots_3pt_pipe, jump_shots_3pt_param_grid, jump_shots_3pt_grid = build_voting_grid(\n",
    "    jump_shots_3pt_xgboost_model, jump_shots_3pt_catboost_classifier, jump_shots_3pt_weights\n",
    ")\n",
    "\n",
    "X_jump_shots_3pt = jump_shots_3pt[all_model_features]\n",
    "y_jump_shots_3pt = jump_shots_3pt.Made\n",
    "\n",
    "jump_shots_3pt_grid.fit(X_jump_shots_3pt.values, y_jump_shots_3pt.values)\n",
    "print('3pt training log loss: %.4f' % cv_log_loss(jump_shots_3pt_grid))\n",
    "\n",
    "# --- Evaluate on the held-out season, using the same zone definitions ---\n",
    "testing_data = all_data[all_data.Season == testing_season]\n",
    "\n",
    "testing_at_rim = testing_data[testing_data.ShotDistance < 5]\n",
    "testing_non_rim_2pt = testing_data[(testing_data.ShotDistance >= 5) & (testing_data.ShotValue == 2)]\n",
    "testing_jump_shots_3pt = testing_data[(testing_data.ShotValue == 3) & (testing_data.ShotDistance < 35)]\n",
    "\n",
    "X_test_at_rim = testing_at_rim[all_model_features]\n",
    "y_test_at_rim = testing_at_rim.Made\n",
    "\n",
    "X_test_non_rim_2pt = testing_non_rim_2pt[all_model_features]\n",
    "y_test_non_rim_2pt = testing_non_rim_2pt.Made\n",
    "\n",
    "X_test_jump_shots_3pt = testing_jump_shots_3pt[all_model_features]\n",
    "y_test_jump_shots_3pt = testing_jump_shots_3pt.Made\n",
    "\n",
    "at_rim_predictions, at_rim_predictions_df = predict_make_probabilities(at_rim_grid, X_test_at_rim)\n",
    "print('RA testing log loss: %.4f' % log_loss(y_test_at_rim, at_rim_predictions_df.make.values))\n",
    "\n",
    "non_rim_2pt_predictions, non_rim_2pt_predictions_df = predict_make_probabilities(non_rim_2pt_grid, X_test_non_rim_2pt)\n",
    "print('2pt non-RA testing log loss: %.4f' % log_loss(y_test_non_rim_2pt, non_rim_2pt_predictions_df.make.values))\n",
    "\n",
    "jump_shot_3pt_predictions, jump_shot_3pt_predictions_df = predict_make_probabilities(jump_shots_3pt_grid, X_test_jump_shots_3pt)\n",
    "print('3pt testing log loss: %.4f' % log_loss(y_test_jump_shots_3pt, jump_shot_3pt_predictions_df.make.values))\n",
    "\n",
    "# Stack predictions and labels in the same zone order so rows stay aligned\n",
    "# positionally (log_loss receives plain arrays for the predictions).\n",
    "test_prediction_df = pd.concat([at_rim_predictions_df, non_rim_2pt_predictions_df, jump_shot_3pt_predictions_df], ignore_index=True)\n",
    "y_test = pd.concat([y_test_at_rim, y_test_non_rim_2pt, y_test_jump_shots_3pt])\n",
    "print('All shots testing log loss: %.4f' % log_loss(y_test, test_prediction_df.make.values))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Feature Importance with SHAP Values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
" ], "text/plain": [ "