{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", "from sklearn.model_selection import GridSearchCV, cross_validate\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import FunctionTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import roc_auc_score, log_loss, make_scorer, brier_score_loss\n", "from sklearn.preprocessing import StandardScaler\n", "from lightgbm import LGBMClassifier\n", "from joblib import dump, load\n", "from sklearn.calibration import calibration_curve\n", "from sklearn.calibration import CalibratedClassifierCV\n", "from sklearn.inspection import permutation_importance\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from scipy.stats import spearmanr\n", "from scipy.cluster import hierarchy\n", "# monkey patch for bayesseachcv (https://github.com/scikit-optimize/scikit-optimize/issues/902)\n", "from numpy.ma import MaskedArray\n", "import sklearn.utils.fixes\n", "sklearn.utils.fixes.MaskedArray = MaskedArray\n", "from skopt import BayesSearchCV\n", "from skopt.space import Real, Integer\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Random state" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "seed = 42" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Setup metrics (see: http://business-analytic.co.uk/blog/evaluating-expected-goals-models/)" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
 "def mcfadden_r2(y, y_pred):\n",
 "    \"\"\"McFadden's pseudo R-squared: 1 - log_loss(model) / log_loss(null model).\n",
 "\n",
 "    The null model predicts the observed base rate (the mean of ``y``) for every sample.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    y : array-like\n",
 "        True binary outcomes (pandas Series, numpy array or plain list).\n",
 "    y_pred : array-like\n",
 "        Predicted probabilities, passed straight to ``log_loss``.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    float\n",
 "        1 minus the ratio of the model log loss to the null-model log loss.\n",
 "    \"\"\"\n",
 "    y = np.asarray(y)  # plain sequences have no .mean(); Series/arrays keep the same values\n",
 "    ll = log_loss(y, y_pred)\n",
 "    ll_null = log_loss(y, np.full(len(y), y.mean()))\n",
 "    return 1 - (ll / ll_null)\n",
 "\n",
 "\n",
 "pseudo_r2_scorer = make_scorer(mcfadden_r2, needs_proba=True, greater_is_better=True)"
] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "scoring = {'roc_aug': 'roc_auc', 'mcfaddens_r2': pseudo_r2_scorer}" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Setup folders for storing models, figures and modelling data" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# create the folders this notebook writes to (no-op when they already exist)\n",
 "for folder in [os.path.join('..', 'models'),\n",
 "               os.path.join('..', 'figures'),\n",
 "               os.path.join('..', 'data', 'modelling')]:\n",
 "    os.makedirs(folder, exist_ok=True)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Load the data" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "df = pd.read_parquet(os.path.join('..', 'data', 'shots.parquet'))\n", "df.drop(['match_id', 'statsbomb_id', 'statsbomb_team_id', 'player_id_statsbomb', 'competition_gender', 'team_name',\n", " 'player_id', 'firstName', 'middleName', 'lastName', 'Name', 'dataset', 'wyscout_id', 'wyscout_team_id', 'team_id',\n", " 'player_id_wyscout'], axis=1, inplace=True)\n", "X = df.drop('goal', axis=1)\n", "y = df.goal" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Split into train and test datasets (the calibration folds are created later, inside `CalibratedClassifierCV`)" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=seed, stratify=y)" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shots train 51335 ;Number goals 5443 ;Goals %: 10.6\n", "Shots test 12834 ;Number goals 1361 ;Goals %: 10.6\n" ] } ], "source": [ "print('Shots train', len(y_train), ';Number goals', y_train.sum(),\n", " ';Goals %: ', round(y_train.mean()*100, 1))\n", "print('Shots test', len(y_test), ';Number goals', y_test.sum(),\n", " ';Goals %: ', round(y_test.mean()*100, 1))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Load and split fake data" ] }, { "cell_type": "code", "execution_count": 
8, "metadata": {}, "outputs": [], "source": [ "df_fake = pd.read_parquet(os.path.join('..', 'data', 'fake_shots.parquet'))\n", "df_fake.index = ['a'+str(idx) for idx in df_fake.index]\n", "y_fake = df_fake.goal\n", "X_fake = df_fake.drop('goal', axis=1)" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shots fake 1000 ;Goals %: 3.4\n" ] } ], "source": [ "print('Shots fake', len(y_fake), ';Goals %: ', round(y_fake.mean()*100, 1))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Logistic regression" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Subset dataset for logistic regression" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
 "# columns to drop\n",
 "logistic_drop_cols = ['x', 'y',  # logistic regression does not deal well with dependent features\n",
 "                      # The model will use the distance/ angle features to capture these location features instead\n",
 "                      # lots of missings for the below features as they come from StatsBomb data only.\n",
 "                      # It's not fair to impute these as they are not missing at random\n",
 "                      # while logistic regression does not allow missings so I removed them\n",
 "                      'pass_end_y', 'pass_end_x',  # <- note these were in Wyscout, but often were just the shot location\n",
 "                      'eventSec', 'period', 'player_id_goalkeeper',\n",
 "                      'goalkeeper_x', 'goalkeeper_y', 'carry_length', 'shot_one_on_one', 'shot_open_goal',\n",
 "                      'under_pressure', 'area_shot', 'area_goal', 'n_angle', 'smart_pass']\n",
 "X_train_logistic = X_train.drop(logistic_drop_cols, axis=1).copy()\n",
 "X_test_logistic = X_test.drop(logistic_drop_cols, axis=1).copy()"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Split dataset for logistic regression into passes / other assists" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
 "def split(X, y, other_cols=None):\n",
 "    \"\"\"Split shots into those assisted by a pass and all other shots.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    X : DataFrame\n",
 "        Features; must contain an ``assist_type`` column.\n",
 "    y : Series\n",
 "        Targets, indexed like ``X``.\n",
 "    other_cols : list-like of column names, optional\n",
 "        Columns to keep for the non-pass shots. When None (default) the columns\n",
 "        that are entirely missing in the non-pass subset are dropped. Pass the\n",
 "        training columns when splitting the test set so that both frames are\n",
 "        guaranteed to have identical columns.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    X_pass, y_pass, X_other, y_other\n",
 "        ``X_pass`` has ``assist_type`` removed; ``X_other`` keeps it.\n",
 "    \"\"\"\n",
 "    mask = X.assist_type == 'pass'\n",
 "    X_pass = X[mask].drop('assist_type', axis=1).copy()\n",
 "    y_pass = y[mask]\n",
 "    if other_cols is None:\n",
 "        X_other = X[~mask].dropna(axis=1, how='all').copy()\n",
 "    else:\n",
 "        X_other = X.loc[~mask, list(other_cols)].copy()\n",
 "    y_other = y[~mask]\n",
 "    return X_pass, y_pass, X_other, y_other"
] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
 "X_train_pass, y_train_pass, X_train_other, y_train_other = split(X_train_logistic, y_train)\n",
 "# reuse the training columns so the test frame cannot end up with a different column set\n",
 "X_test_pass, y_test_pass, X_test_other, y_test_other = split(X_test_logistic, y_test,\n",
 "                                                             other_cols=X_train_other.columns)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Pipeline for cleaning pass assists" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "cols = ['shot_type_name', 'body_part_name', 'pass_technique_name', 'pass_height_name']\n", "cats = [['open_play', 'free_kick', 'corner', 'throw_in'],\n", " ['Right Foot', 'Left Foot', 'Other'],\n", " ['other', 'Through Ball', 'Straight', 'Inswinging', 'Outswinging'],\n", " ['Ground/ Low Pass', 'High Pass']]\n", "pass_one_hot = ColumnTransformer([('encoder', OneHotEncoder(drop='first', categories=cats), cols)], remainder='passthrough')\n", "pipe_pass = Pipeline([('one_hot', pass_one_hot),\n", " ('impute', SimpleImputer()),\n", " ('scale', StandardScaler()),\n", " ('lr', LogisticRegression(random_state=seed))])" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Column names of transformed pass data" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "original_cols_remain = [col for col in X_train_pass.columns if col not in cols]\n", "new_cols_pass = [item for sublist in cats for i, item in enumerate(sublist) if (i>0)]\n", "new_cols_pass.extend(original_cols_remain)" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['free_kick', 'corner', 'throw_in', 'Left Foot', 'Other', 'Through Ball', 'Straight', 'Inswinging', 'Outswinging', 'High Pass', 'counter_attack', 
'fast_break', 'strong_foot', 'pass_switch', 'pass_cross', 'pass_cut_back', 'visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle', 'log_distance_to_goal']\n" ] } ], "source": [ "print(new_cols_pass)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Pipeline for cleaning other assists" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
 "# 'direct' assists are already covered by shot_type_name == 'direct_set_piece',\n",
 "# so relabel them as 'recovery' to avoid encoding the same information twice\n",
 "for other_frame in (X_train_other, X_test_other):\n",
 "    other_frame.loc[other_frame.assist_type == 'direct', 'assist_type'] = 'recovery'\n",
 "\n",
 "cols = ['shot_type_name', 'body_part_name', 'assist_type']\n",
 "cats = [['open_play', 'free_kick', 'corner', 'throw_in', 'direct_set_piece'],\n",
 "        ['Right Foot', 'Left Foot', 'Other'],\n",
 "        ['recovery', 'clearance', 'rebound']]\n",
 "other_encoder = OneHotEncoder(drop='first', categories=cats)\n",
 "other_one_hot = ColumnTransformer([('encoder', other_encoder, cols)], remainder='passthrough')\n",
 "other_steps = [('one_hot', other_one_hot),\n",
 "               ('impute', SimpleImputer()),\n",
 "               ('scale', StandardScaler()),\n",
 "               ('lr', LogisticRegression(random_state=seed))]\n",
 "pipe_other = Pipeline(other_steps)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Column names of transformed other assists" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [
 "original_cols_remain = [col for col in X_train_other.columns if col not in cols]\n",
 "# the encoder drops the first category of every feature (drop='first')\n",
 "new_cols_other = []\n",
 "for sublist in cats:\n",
 "    new_cols_other.extend(sublist[1:])\n",
 "new_cols_other.extend(original_cols_remain)"
] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['free_kick', 'corner', 'throw_in', 'direct_set_piece', 'Left Foot', 'Other', 'clearance', 'rebound', 'counter_attack', 'fast_break', 'strong_foot', 'visible_angle', 'middle_angle', 
'distance_to_goal', 'distance_visible_angle', 'log_distance_to_goal']\n" ] } ], "source": [ "print(new_cols_other)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Search parameters for gridsearchcv" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [
 "# regularisation strengths to try; the same search settings are used for both logistic models\n",
 "param_grid = {'lr__C': np.logspace(-3, 0.1, 100)}\n",
 "search_kwargs = dict(param_grid=param_grid, scoring='neg_log_loss', n_jobs=-1)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Fit the inner grid search for shots assisted by passes" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C: 0.04248961816344005\n" ] } ], "source": [
 "clf_pass = GridSearchCV(estimator=pipe_pass, **search_kwargs).fit(X_train_pass, y_train_pass)\n",
 "best_c_pass = clf_pass.best_estimator_.named_steps.lr.C\n",
 "print('C:', best_c_pass)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Fit the inner grid search for shots assisted other than passes" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C: 0.42687726983178853\n" ] } ], "source": [
 "clf_other = GridSearchCV(estimator=pipe_other, **search_kwargs).fit(X_train_other, y_train_other)\n",
 "best_c_other = clf_other.best_estimator_.named_steps.lr.C\n",
 "print('C:', best_c_other)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Outer loops for unbiased estimates of the model accuracy" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ROC AUC for shots assisted by passes: 0.7817401551132565\n", "McFadden's Pseudo R-squared shots assisted by passes: 0.15855492654518005\n" ] } ], "source": [
 "nested_score_pass = cross_validate(clf_pass, X=X_train_pass, y=y_train_pass, scoring=scoring, n_jobs=-1)\n",
 "roc_auc_pass = nested_score_pass['test_roc_aug'].mean()\n",
 "pseudo_r2_pass = nested_score_pass['test_mcfaddens_r2'].mean()\n",
 "print('ROC AUC for shots assisted by passes:', roc_auc_pass)\n",
 "print(\"McFadden's Pseudo R-squared shots assisted by passes:\", pseudo_r2_pass)"
] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ROC AUC for other model: 0.8033677417704617\n", "McFadden's Pseudo R-squared for other model: 0.19094339212253655\n" ] } ], "source": [
 "nested_score_other = cross_validate(clf_other, X=X_train_other, y=y_train_other, scoring=scoring, n_jobs=-1)\n",
 "roc_auc_other = nested_score_other['test_roc_aug'].mean()\n",
 "pseudo_r2_other = nested_score_other['test_mcfaddens_r2'].mean()\n",
 "print('ROC AUC for other model:', roc_auc_other)\n",
 "print(\"McFadden's Pseudo R-squared for other model:\", pseudo_r2_other)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# LightGBM model" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Add fake training data. I am not adding this to the test data as I want the test data to be representative of real data." ] },
{ "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [
 "# append the fake shots (features and targets) to the training set only\n",
 "X_train, y_train = pd.concat([X_train, X_fake]), pd.concat([y_train, y_fake])"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Clean data. Categories to numbers. Drop distance and angle measures as just want raw locations for my models." 
] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [
 "def clean_lightgbm(df):\n",
 "    \"\"\"Return a copy of ``df`` prepared for LightGBM; the input frame is not modified.\n",
 "\n",
 "    * the categorical text columns are replaced by integer codes\n",
 "    * the boolean-like columns (which contain nans) are cast to float32\n",
 "    * the derived distance/ angle columns, eventSec, period and the goalkeeper id are dropped\n",
 "    \"\"\"\n",
 "    df = df.copy()\n",
 "    # integer codes for the categorical columns\n",
 "    category_codes = {\n",
 "        'shot_type_name': {'free_kick': 0, 'corner': 1, 'throw_in': 2, 'direct_set_piece': 3, 'open_play': 4},\n",
 "        'body_part_name': {'Right Foot': 0, 'Left Foot': 1, 'Other': 2},\n",
 "        'assist_type': {'pass': 0, 'recovery': 1, 'clearance': 2, 'direct': 3, 'rebound': 4},\n",
 "        'pass_height_name': {'High Pass': 0, 'Ground/ Low Pass': 1},\n",
 "        'pass_technique_name': {'Through Ball': 0, 'Straight': 1, 'Inswinging': 2, 'Outswinging': 3, 'other': 4},\n",
 "    }\n",
 "    for col, codes in category_codes.items():\n",
 "        # assign the result back: df.col.replace(..., inplace=True) is a chained in-place call\n",
 "        # that does not update the frame under pandas copy-on-write.\n",
 "        # infer_objects() turns an object result back into a numeric dtype.\n",
 "        df[col] = df[col].replace(codes).infer_objects()\n",
 "\n",
 "    # replace boolean type columns (not really as have nans)\n",
 "    bool_cols = ['pass_switch', 'pass_cross', 'pass_cut_back', 'shot_one_on_one',\n",
 "                 'shot_open_goal', 'under_pressure', 'smart_pass']\n",
 "    for col in bool_cols:\n",
 "        df[col] = df[col].astype(np.float32)\n",
 "\n",
 "    # drop some distance/ angle columns\n",
 "    drop_cols = ['visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle',\n",
 "                 'log_distance_to_goal', 'eventSec', 'period', 'player_id_goalkeeper']\n",
 "    return df.drop(drop_cols, axis=1)\n",
 "\n",
 "\n",
 "X_train = clean_lightgbm(X_train)\n",
 "X_test = clean_lightgbm(X_test)"
] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['shot_type_name', 'x', 'y', 'counter_attack', 'fast_break',\n", " 'strong_foot', 'body_part_name', 'assist_type', 'pass_end_y',\n", " 'pass_end_x', 'pass_switch', 'pass_cross', 'pass_cut_back',\n", " 'pass_height_name', 'pass_technique_name', 'carry_length',\n", " 'shot_one_on_one', 'shot_open_goal', 
'under_pressure', 'area_shot',\n", " 'area_goal', 'n_angle', 'goalkeeper_x', 'goalkeeper_y', 'smart_pass'],\n", " dtype='object')\n" ] } ], "source": [ "print(X_train.columns)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Fit the nested 5-fold cross validation using Bayesian optimisation." ] },
{ "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BayesSearchCV(cv=5,\n", " estimator=CalibratedClassifierCV(base_estimator=LGBMClassifier(random_state=42),\n", " cv=3, method='isotonic'),\n", " n_iter=100, n_jobs=-1,\n", " search_spaces={'base_estimator__max_depth': Integer(low=0, high=500, prior='uniform', transform='identity'),\n", " 'base_estimator__min_child_samples': Integer(low=0, high=200, prior='uniform', transform='identity'),\n", " 'base_estimator__num_leaves': Integer(low=2, high=500, prior='uniform', transform='identity'),\n", " 'base_estimator__reg_alpha': Real(low=0, high=1, prior='uniform', transform='identity'),\n", " 'base_estimator__reg_lambda': Real(low=0, high=1, prior='uniform', transform='identity')})" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# LightGBM wrapped in isotonic calibration fitted on 3 internal folds\n",
 "lgbm = CalibratedClassifierCV(LGBMClassifier(random_state=seed), method='isotonic', cv=3)\n",
 "lgbm_param_grid = {'base_estimator__min_child_samples': Integer(0, 200),\n",
 "                   'base_estimator__num_leaves': Integer(2, 500),\n",
 "                   'base_estimator__reg_lambda': Real(0, 1),\n",
 "                   'base_estimator__reg_alpha': Real(0, 1),\n",
 "                   'base_estimator__max_depth': Integer(0, 500)}\n",
 "# Nested resampling using skopt. see: https://github.com/scikit-optimize/scikit-optimize/issues/725\n",
 "# NOTE(review): no `scoring` is passed, so the search optimises the estimator's default score;\n",
 "# consider scoring='neg_log_loss' to match the logistic regression grid searches - TODO confirm.\n",
 "searchcv = BayesSearchCV(estimator=lgbm,\n",
 "                         n_iter=100,\n",
 "                         search_spaces=lgbm_param_grid,\n",
 "                         cv=5,\n",
 "                         n_jobs=-1,\n",
 "                         random_state=seed)  # seed the optimiser so the search is reproducible\n",
 "searchcv.fit(X_train, y_train)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Permutation importance" ] },
{ "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [
 "# note not using fake data for permutation importance:\n",
 "# X_train/y_train contain the appended fake shots by now, so filter them out by index first\n",
 "is_real_shot = ~X_train.index.isin(X_fake.index)\n",
 "X_train_real, y_train_real = X_train[is_real_shot], y_train[is_real_shot]\n",
 "perm_result = permutation_importance(searchcv.best_estimator_, X_train_real, y_train_real,\n",
 "                                     n_repeats=10, random_state=seed)\n",
 "df_perm_importance = (pd.DataFrame({'Feature': X_train.columns,\n",
 "                                    'importance': perm_result.importances.mean(axis=1),\n",
 "                                    'std_dev': perm_result.importances.std(axis=1)})\n",
 "                      .sort_values('importance', ascending=False)\n",
 "                      .reset_index(drop=True))"
] },
{ "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Featureimportancestd_dev
0x1.435942e-020.000504
1y1.153148e-020.000537
2goalkeeper_x1.199962e-030.000086
3n_angle9.439190e-040.000156
4shot_type_name8.350053e-040.000145
5goalkeeper_y7.413777e-040.000124
6body_part_name5.254610e-040.000176
7carry_length5.216394e-040.000117
8pass_technique_name4.681380e-040.000101
9shot_open_goal4.337441e-040.000046
10area_shot3.993503e-040.000115
11pass_height_name3.917073e-040.000154
12pass_end_x2.770612e-040.000081
13assist_type2.541320e-040.000090
14strong_foot2.197382e-040.000116
15pass_end_y2.159167e-040.000110
16area_goal1.948983e-040.000166
17pass_cross1.318429e-040.000153
18counter_attack9.362759e-050.000065
19pass_switch5.159071e-050.000059
20fast_break2.101844e-050.000039
21under_pressure1.110223e-170.000052
22smart_pass-5.732302e-060.000028
23shot_one_on_one-1.146460e-050.000018
24pass_cut_back-2.101844e-050.000025
\n", "
" ], "text/plain": [ " Feature importance std_dev\n", "0 x 1.435942e-02 0.000504\n", "1 y 1.153148e-02 0.000537\n", "2 goalkeeper_x 1.199962e-03 0.000086\n", "3 n_angle 9.439190e-04 0.000156\n", "4 shot_type_name 8.350053e-04 0.000145\n", "5 goalkeeper_y 7.413777e-04 0.000124\n", "6 body_part_name 5.254610e-04 0.000176\n", "7 carry_length 5.216394e-04 0.000117\n", "8 pass_technique_name 4.681380e-04 0.000101\n", "9 shot_open_goal 4.337441e-04 0.000046\n", "10 area_shot 3.993503e-04 0.000115\n", "11 pass_height_name 3.917073e-04 0.000154\n", "12 pass_end_x 2.770612e-04 0.000081\n", "13 assist_type 2.541320e-04 0.000090\n", "14 strong_foot 2.197382e-04 0.000116\n", "15 pass_end_y 2.159167e-04 0.000110\n", "16 area_goal 1.948983e-04 0.000166\n", "17 pass_cross 1.318429e-04 0.000153\n", "18 counter_attack 9.362759e-05 0.000065\n", "19 pass_switch 5.159071e-05 0.000059\n", "20 fast_break 2.101844e-05 0.000039\n", "21 under_pressure 1.110223e-17 0.000052\n", "22 smart_pass -5.732302e-06 0.000028\n", "23 shot_one_on_one -1.146460e-05 0.000018\n", "24 pass_cut_back -2.101844e-05 0.000025" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_perm_importance" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
 "fig, ax = plt.subplots(figsize=(16, 9))\n",
 "# sort ascending by mean importance so the most important feature is drawn last\n",
 "sorted_idx = np.argsort(perm_result.importances_mean)\n",
 "bar_plot = ax.boxplot(perm_result.importances[sorted_idx].T,\n",
 "                      vert=False,\n",
 "                      labels=X_train.columns[sorted_idx])"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Test" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Calculate calibration curve on test data" ] },
{ "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [
 "# LightGBM: predicted probability of the positive class on the test set\n",
 "y_pred_lgbm_calibrated = searchcv.best_estimator_.predict_proba(X_test)[:, 1]\n",
 "fraction_of_positives_lgbm, mean_predicted_value_lgbm = calibration_curve(\n",
 "    y_test, y_pred_lgbm_calibrated, n_bins=10)\n",
 "# logistic regression: score the two sub-models separately, then stack passes first, others second\n",
 "y_pred_lr_pass = clf_pass.predict_proba(X_test_pass)[:, 1]\n",
 "y_pred_lr_other = clf_other.predict_proba(X_test_other)[:, 1]\n",
 "y_pred_lr = np.concatenate((y_pred_lr_pass, y_pred_lr_other))\n",
 "y_true_test = np.concatenate((y_test_pass, y_test_other))\n",
 "fraction_of_positives_lr, mean_predicted_value_lr = calibration_curve(\n",
 "    y_true_test, y_pred_lr, n_bins=10)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Plot calibration curve on test data" ] },
{ "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "plt.style.use('dark_background')\n",
 "fig = plt.figure(constrained_layout=True, figsize=(10, 15))\n",
 "gs = fig.add_gridspec(ncols=1, nrows=2, height_ratios=(2/3, 1/3))\n",
 "\n",
 "# top panel: calibration curve of each model against the perfectly calibrated diagonal\n",
 "ax1 = fig.add_subplot(gs[0])\n",
 "calibration_curves = [\n",
 "    (mean_predicted_value_lgbm, fraction_of_positives_lgbm, '#aabced', 'Calibrated Light GBM'),\n",
 "    (mean_predicted_value_lr, fraction_of_positives_lr, '#dbdf4a', 'Logistic regression'),\n",
 "]\n",
 "for mean_predicted, fraction_positive, colour, label in calibration_curves:\n",
 "    ax1.plot(mean_predicted, fraction_positive, '-o', color=colour, label=label)\n",
 "ax1.plot([0, 1], [0, 1], '--', color='#e7aeca', label='Perfectly calibrated')\n",
 "ax1.set_xlabel('Mean predicted value', fontsize=15)\n",
 "ax1.set_ylabel('Fraction of positives', fontsize=15)\n",
 "ax1.set_title('Calibration curve', fontsize=20, pad=10)\n",
 "ax1.legend(fontsize=15)\n",
 "ax1.tick_params(labelsize=15)\n",
 "\n",
 "# bottom panel: histogram of the predicted probabilities of each model\n",
 "ax2 = fig.add_subplot(gs[1])\n",
 "prediction_sets = [\n",
 "    (y_pred_lr, '#4fe4e4', 'Logistic regression'),\n",
 "    (y_pred_lgbm_calibrated, '#aabced', 'Calibrated Light GBM'),\n",
 "]\n",
 "for predictions, colour, label in prediction_sets:\n",
 "    sns.distplot(predictions, color=colour, label=label, kde=False, ax=ax2)\n",
 "ax2.set_xlabel('Predicted value', fontsize=15)\n",
 "ax2.set_ylabel('Count', fontsize=15)\n",
 "ax2.tick_params(labelsize=15)\n",
 "ax2.legend(fontsize=15)\n",
 "ax2.set_title('Distribution of predictions', fontsize=20, pad=10);\n",
 "fig.savefig(os.path.join('..', 'figures', '22_calibration_curve.png'), bbox_inches = 'tight', pad_inches = 0.2)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "From scikit-learn docs: \"The smaller the Brier score, the better, hence the naming with “loss”. 
Across all items in a set of N predictions, the Brier score measures the mean squared difference between (1) the predicted probability assigned to the possible outcomes for item i, and (2) the actual outcome.\"" ] },
{ "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Brier score, Light GBM: 0.08044014131802528\n", "ROC AUC, Light GBM: 0.7851386121829785\n", "Pseudo R-squared, Light GBM: 0.16991224608139832\n" ] } ], "source": [ "print('Brier score, Light GBM:', brier_score_loss(y_test, y_pred_lgbm_calibrated, pos_label=y_test.max()))\n", "print('ROC AUC, Light GBM:', roc_auc_score(y_test, y_pred_lgbm_calibrated))\n", "print('Pseudo R-squared, Light GBM:', mcfadden_r2(y_test, y_pred_lgbm_calibrated))" ] },
{ "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Brier score, logistic regression: 0.08150355125036453\n", "ROC AUC, logistic regression: 0.786687948249966\n", "Pseudo R-squared, logistic regression: 0.16477712349951212\n" ] } ], "source": [ "print('Brier score, logistic regression:', brier_score_loss(y_true_test, y_pred_lr, pos_label=y_true_test.max()))\n", "print('ROC AUC, logistic regression:', roc_auc_score(y_true_test, y_pred_lr))\n", "print('Pseudo R-squared, logistic regression:', mcfadden_r2(y_true_test, y_pred_lr))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Save models" ] },
{ "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['..\\\\models\\\\lgbm_model.joblib']" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dump(searchcv.best_estimator_, os.path.join('..', 'models', 'lgbm_model.joblib'))" ] },
{ "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['..\\\\models\\\\lr_pass.joblib']" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dump(clf_pass.best_estimator_, os.path.join('..', 'models', 'lr_pass.joblib'))" ] },
{ "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['..\\\\models\\\\lr_other.joblib']" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dump(clf_other.best_estimator_, os.path.join('..', 'models', 'lr_other.joblib'))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Save data" ] },
{ "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [
 "# reload shot dataset for ids\n",
 "df = pd.read_parquet(os.path.join('..', 'data', 'shots.parquet'))\n",
 "df = df[['match_id', 'wyscout_id', 'statsbomb_id']].copy()\n",
 "\n",
 "\n",
 "def label_splits(X_tr, y_tr, X_te, y_te):\n",
 "    \"\"\"Stack train and test features with added 'goal' and 'split' columns.\n",
 "\n",
 "    Works on copies (``DataFrame.assign``), so the feature frames used for fitting and\n",
 "    predicting never gain the target column and earlier cells stay re-runnable.\n",
 "    \"\"\"\n",
 "    return pd.concat([X_tr.assign(goal=y_tr, split='train'),\n",
 "                      X_te.assign(goal=y_te, split='test')])"
] },
{ "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [
 "df_other = label_splits(X_train_other, y_train_other, X_test_other, y_test_other)\n",
 "df_other = df_other.merge(df, left_index=True, right_index=True, validate='1:1', how='left')\n",
 "df_other.reset_index(drop=True, inplace=True)\n",
 "df_other.to_parquet(os.path.join('..', 'data', 'modelling', 'lr_other.parquet'))"
] },
{ "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [
 "df_pass = label_splits(X_train_pass, y_train_pass, X_test_pass, y_test_pass)\n",
 "df_pass = df_pass.merge(df, left_index=True, right_index=True, validate='1:1', how='left')\n",
 "df_pass.reset_index(drop=True, inplace=True)\n",
 "df_pass.to_parquet(os.path.join('..', 'data', 'modelling', 'lr_pass.parquet'))"
] },
{ "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [
 "df_lgbm = label_splits(X_train, y_train, X_test, y_test)\n",
 "# exclude fake shots\n",
 "df_lgbm = df_lgbm[df_lgbm.index.isin(df.index)].copy()\n",
 "df_lgbm = df_lgbm.merge(df, how='left', left_index=True, right_index=True, validate='1:1')\n",
 "df_lgbm.to_parquet(os.path.join('..', 'data', 'modelling', 'lgbm.parquet'))"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }