{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", "from sklearn.model_selection import GridSearchCV, cross_validate\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import FunctionTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import roc_auc_score, log_loss, make_scorer, brier_score_loss\n", "from sklearn.preprocessing import StandardScaler\n", "from lightgbm import LGBMClassifier\n", "from joblib import dump, load\n", "from sklearn.calibration import calibration_curve\n", "from sklearn.calibration import CalibratedClassifierCV\n", "from sklearn.inspection import permutation_importance\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from scipy.stats import spearmanr\n", "from scipy.cluster import hierarchy\n", "# monkey patch for BayesSearchCV (https://github.com/scikit-optimize/scikit-optimize/issues/902)\n", "from numpy.ma import MaskedArray\n", "import sklearn.utils.fixes\n", "sklearn.utils.fixes.MaskedArray = MaskedArray\n", "from skopt import BayesSearchCV\n", "from skopt.space import Real, Integer\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Random state" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "seed = 42" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set up metrics (see: http://business-analytic.co.uk/blog/evaluating-expected-goals-models/)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# define McFadden's pseudo r-squared\n", "def mcfadden_r2(y, 
y_pred):\n", " ll = log_loss(y, y_pred)\n", " ll_null = log_loss(y, np.full(len(y), y.mean()))\n", " return 1 - (ll/ll_null)\n", "pseudo_r2_scorer = make_scorer(mcfadden_r2, needs_proba=True, greater_is_better=True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "scoring = {'roc_aug': 'roc_auc', 'mcfaddens_r2': pseudo_r2_scorer}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Setup folder for storing models" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load the data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "df = pd.read_parquet(os.path.join('..', 'data', 'shots.parquet'))\n", "df.drop(['match_id', 'statsbomb_id', 'statsbomb_team_id', 'player_id_statsbomb', 'competition_gender', 'team_name',\n", " 'player_id', 'firstName', 'middleName', 'lastName', 'Name', 'dataset', 'wyscout_id', 'wyscout_team_id', 'team_id',\n", " 'player_id_wyscout'], axis=1, inplace=True)\n", "X = df.drop('goal', axis=1)\n", "y = df.goal" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Split into train and test datasets (no separate calibration set is created here)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=seed, stratify=y)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shots train 51335 ;Number goals 5443 ;Goals %: 10.6\n", "Shots test 12834 ;Number goals 1361 ;Goals %: 10.6\n" ] } ], "source": [ "print('Shots train', len(y_train), ';Number goals', y_train.sum(),\n", " ';Goals %: ', round(y_train.mean()*100, 1))\n", "print('Shots test', len(y_test), ';Number goals', y_test.sum(),\n", " ';Goals %: ', round(y_test.mean()*100, 1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load and split fake data" ] }, { "cell_type": "code", "execution_count": 
8, "metadata": {}, "outputs": [], "source": [ "df_fake = pd.read_parquet(os.path.join('..', 'data', 'fake_shots.parquet'))\n", "df_fake.index = ['a'+str(idx) for idx in df_fake.index]\n", "y_fake = df_fake.goal\n", "X_fake = df_fake.drop('goal', axis=1)" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shots fake 1000 ;Goals %: 3.4\n" ] } ], "source": [ "print('Shots fake', len(y_fake), ';Goals %: ', round(y_fake.mean()*100, 1))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Logistic regression" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Subset dataset for logistic regression" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
  "# drop columns\n",
  "logistic_drop_cols = ['x', 'y', # logistic regression does not deal well with dependent features\n",
  " # The model will use the distance/ angle features to capture these location features instead\n",
  " # lots of missings for the below features as they come from StatsBomb data only.\n",
  " # It's not fair to impute these as they are not missing at random\n",
  " # while logistic regression does not allow missings so I removed them\n",
  " 'pass_end_y', 'pass_end_x', # <- note these were in Wyscout, but often were just the shot location\n",
  " 'eventSec', 'period', 'player_id_goalkeeper',\n",
  " 'goalkeeper_x', 'goalkeeper_y', 'carry_length', 'shot_one_on_one', 'shot_open_goal',\n",
  " 'under_pressure', 'area_shot', 'area_goal', 'n_angle', 'smart_pass']\n",
  "X_train_logistic = X_train.drop(logistic_drop_cols, axis=1).copy()\n",
  "X_test_logistic = X_test.drop(logistic_drop_cols, axis=1).copy()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Split dataset for logistic regression into passes / other assists" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
  "def split(X, y, other_cols=None):\n",
  "    \"\"\"Split shots into those assisted by a pass and all other shots.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    X : DataFrame containing an 'assist_type' column.\n",
  "    y : target indexed like X.\n",
  "    other_cols : optional sequence of column names to keep in the non-pass frame.\n",
  "        When None (the default) the columns that are entirely missing among the\n",
  "        non-pass rows are dropped, so the result depends on the data passed in.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    X_pass, y_pass, X_other, y_other\n",
  "        X_pass has 'assist_type' removed; X_other keeps it.\n",
  "    \"\"\"\n",
  "    mask = X.assist_type == 'pass'\n",
  "    X_pass = X[mask].drop('assist_type', axis=1).copy()\n",
  "    y_pass = y[mask]\n",
  "    X_other = X[~mask]\n",
  "    if other_cols is None:\n",
  "        X_other = X_other.dropna(axis=1, how='all').copy()\n",
  "    else:\n",
  "        # reuse a column set derived elsewhere so two frames share the same columns\n",
  "        X_other = X_other[list(other_cols)].copy()\n",
  "    y_other = y[~mask]\n",
  "    return X_pass, y_pass, X_other, y_other" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
  "X_train_pass, y_train_pass, X_train_other, y_train_other = split(X_train_logistic, y_train)\n",
  "# keep exactly the training columns: which columns are all-missing is data dependent,\n",
  "# and the pipelines pass the remaining columns through by position\n",
  "X_test_pass, y_test_pass, X_test_other, y_test_other = split(X_test_logistic, y_test,\n",
  "                                                             other_cols=X_train_other.columns)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Pipeline for cleaning pass assists" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "cols = ['shot_type_name', 'body_part_name', 'pass_technique_name', 'pass_height_name']\n", "cats = [['open_play', 'free_kick', 'corner', 'throw_in'],\n", " ['Right Foot', 'Left Foot', 'Other'],\n", " ['other', 'Through Ball', 'Straight', 'Inswinging', 'Outswinging'],\n", " ['Ground/ Low Pass', 'High Pass']]\n", "pass_one_hot = ColumnTransformer([('encoder', OneHotEncoder(drop='first', categories=cats), cols)], remainder='passthrough')\n", "pipe_pass = Pipeline([('one_hot', pass_one_hot),\n", " ('impute', SimpleImputer()),\n", " ('scale', StandardScaler()),\n", " ('lr', LogisticRegression(random_state=seed))])" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Column names of transformed pass data" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "original_cols_remain = [col for col in X_train_pass.columns if col not in cols]\n", "new_cols_pass = [item for sublist in cats for i, item in enumerate(sublist) if (i>0)]\n", "new_cols_pass.extend(original_cols_remain)" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['free_kick', 'corner', 'throw_in', 'Left Foot', 'Other', 'Through Ball', 'Straight', 'Inswinging', 'Outswinging', 'High Pass', 'counter_attack', 
'fast_break', 'strong_foot', 'pass_switch', 'pass_cross', 'pass_cut_back', 'visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle', 'log_distance_to_goal']\n" ] } ], "source": [ "print(new_cols_pass)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pipeline for cleaning other assists" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# setting direct to recovery so it does not get encoded twice (also covered by shot_type_name == 'direct_set_piece')\n", "X_train_other.loc[X_train_other.assist_type == 'direct', 'assist_type'] = 'recovery'\n", "X_test_other.loc[X_test_other.assist_type == 'direct', 'assist_type'] = 'recovery'\n", "\n", "cols = ['shot_type_name', 'body_part_name', 'assist_type']\n", "cats = [['open_play', 'free_kick', 'corner', 'throw_in', 'direct_set_piece'],\n", " ['Right Foot', 'Left Foot', 'Other'],\n", " ['recovery', 'clearance', 'rebound']]\n", "other_one_hot = ColumnTransformer([('encoder', OneHotEncoder(drop='first', categories=cats), cols)], remainder='passthrough')\n", "pipe_other = Pipeline([('one_hot', other_one_hot),\n", " ('impute', SimpleImputer()),\n", " ('scale', StandardScaler()),\n", " ('lr', LogisticRegression(random_state=seed))])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Column names of transformed other-assist data" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "original_cols_remain = [col for col in X_train_other.columns if col not in cols]\n", "new_cols_other = [item for sublist in cats for i, item in enumerate(sublist) if (i>0)]\n", "new_cols_other.extend(original_cols_remain)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['free_kick', 'corner', 'throw_in', 'direct_set_piece', 'Left Foot', 'Other', 'clearance', 'rebound', 'counter_attack', 'fast_break', 'strong_foot', 'visible_angle', 'middle_angle', 
'distance_to_goal', 'distance_visible_angle', 'log_distance_to_goal']\n" ] } ], "source": [ "print(new_cols_other)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Search parameters for gridsearchcv" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "param_grid = {'lr__C': np.logspace(-3, 0.1, 100)}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fit the inner grid search for shots assisted by passes" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C: 0.04248961816344005\n" ] } ], "source": [ "clf_pass = GridSearchCV(estimator=pipe_pass, param_grid=param_grid, scoring='neg_log_loss', n_jobs=-1)\n", "clf_pass.fit(X_train_pass, y_train_pass)\n", "print('C:', clf_pass.best_estimator_.named_steps.lr.C)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fit the inner grid search for shots assisted other than passes" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C: 0.42687726983178853\n" ] } ], "source": [ "clf_other = GridSearchCV(estimator=pipe_other, param_grid=param_grid, scoring='neg_log_loss', n_jobs=-1)\n", "clf_other.fit(X_train_other, y_train_other)\n", "print('C:', clf_other.best_estimator_.named_steps.lr.C)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Outer loops for unbiased estimates of the model accuracy" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ROC AUC for shots assisted by passes: 0.7817401551132565\n", "McFadden's Pseudo R-squared shots assisted by passes: 0.15855492654518005\n" ] } ], "source": [ "nested_score_pass = cross_validate(clf_pass, X=X_train_pass, y=y_train_pass, scoring=scoring, n_jobs=-1)\n", "print('ROC AUC for shots assisted by passes:', 
nested_score_pass['test_roc_aug'].mean())\n", "print(\"McFadden's Pseudo R-squared shots assisted by passes:\", nested_score_pass['test_mcfaddens_r2'].mean())" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ROC AUC for other model: 0.8033677417704617\n", "McFadden's Pseudo R-squared for other model: 0.19094339212253655\n" ] } ], "source": [ "nested_score_other = cross_validate(clf_other, X=X_train_other, y=y_train_other, scoring=scoring, n_jobs=-1)\n", "print('ROC AUC for other model:', nested_score_other['test_roc_aug'].mean())\n", "print(\"McFadden's Pseudo R-squared for other model:\", nested_score_other['test_mcfaddens_r2'].mean())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# LightGBM model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add fake training data. I am not adding this to the test data as I want the test data to be representative of real data." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "X_train = pd.concat([X_train, X_fake])\n", "y_train = pd.concat([y_train, y_fake])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Clean data. Categories to numbers. Drop distance and angle measures as I just want raw locations for my models." 
] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [
  "def clean_lightgbm(df):\n",
  "    \"\"\"Return a cleaned copy of df for LightGBM; the input frame is not modified.\n",
  "\n",
  "    Categorical columns are replaced by integer codes, the boolean-like columns are\n",
  "    cast to float32, and the derived distance/angle columns, eventSec, period and\n",
  "    player_id_goalkeeper are dropped.\n",
  "    \"\"\"\n",
  "    df = df.copy()\n",
  "    # integer codes for the categorical columns\n",
  "    category_codes = {\n",
  "        'shot_type_name': {'free_kick': 0, 'corner': 1, 'throw_in': 2, 'direct_set_piece': 3, 'open_play': 4},\n",
  "        'body_part_name': {'Right Foot': 0, 'Left Foot': 1, 'Other': 2},\n",
  "        'assist_type': {'pass': 0, 'recovery': 1, 'clearance': 2, 'direct': 3, 'rebound': 4},\n",
  "        'pass_height_name': {'High Pass': 0, 'Ground/ Low Pass': 1},\n",
  "        'pass_technique_name': {'Through Ball': 0, 'Straight': 1, 'Inswinging': 2, 'Outswinging': 3, 'other': 4},\n",
  "    }\n",
  "    for col, codes in category_codes.items():\n",
  "        # Assign the result back rather than calling replace(..., inplace=True) on df.<col>:\n",
  "        # the chained in-place call acts on a temporary Series and leaves df unchanged under\n",
  "        # pandas Copy-on-Write. infer_objects() keeps the replaced codes numeric.\n",
  "        df[col] = df[col].replace(codes).infer_objects()\n",
  "\n",
  "    # replace boolean type columns (not really as have nans)\n",
  "    for col in ['pass_switch', 'pass_cross', 'pass_cut_back', 'shot_one_on_one',\n",
  "                'shot_open_goal', 'under_pressure', 'smart_pass']:\n",
  "        df[col] = df[col].astype(np.float32)\n",
  "    # drop some distance/ angle columns\n",
  "    drop_cols = ['visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle',\n",
  "                 'log_distance_to_goal', 'eventSec', 'period', 'player_id_goalkeeper']\n",
  "    return df.drop(drop_cols, axis=1)\n",
  "\n",
  "X_train = clean_lightgbm(X_train)\n",
  "X_test = clean_lightgbm(X_test)" ] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['shot_type_name', 'x', 'y', 'counter_attack', 'fast_break',\n", " 'strong_foot', 'body_part_name', 'assist_type', 'pass_end_y',\n", " 'pass_end_x', 'pass_switch', 'pass_cross', 'pass_cut_back',\n", " 'pass_height_name', 'pass_technique_name', 'carry_length',\n", " 'shot_one_on_one', 'shot_open_goal', 
'under_pressure', 'area_shot',\n", " 'area_goal', 'n_angle', 'goalkeeper_x', 'goalkeeper_y', 'smart_pass'],\n", " dtype='object')\n" ] } ], "source": [ "print(X_train.columns)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Fit the nested 5-fold cross validation using Bayesian optimisation." ] },
{ "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BayesSearchCV(cv=5,\n", " estimator=CalibratedClassifierCV(base_estimator=LGBMClassifier(random_state=42),\n", " cv=3, method='isotonic'),\n", " n_iter=100, n_jobs=-1,\n", " search_spaces={'base_estimator__max_depth': Integer(low=0, high=500, prior='uniform', transform='identity'),\n", " 'base_estimator__min_child_samples': Integer(low=0, high=200, prior='uniform', transform='identity'),\n", " 'base_estimator__num_leaves': Integer(low=2, high=500, prior='uniform', transform='identity'),\n", " 'base_estimator__reg_alpha': Real(low=0, high=1, prior='uniform', transform='identity'),\n", " 'base_estimator__reg_lambda': Real(low=0, high=1, prior='uniform', transform='identity')})" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# LightGBM wrapped in an isotonic calibrator fitted with 3-fold cross validation\n",
  "lgbm = CalibratedClassifierCV(LGBMClassifier(random_state=seed), method='isotonic', cv=3)\n",
  "lgbm_param_grid = {'base_estimator__min_child_samples': Integer(0, 200),\n",
  "                   'base_estimator__num_leaves': Integer(2, 500),\n",
  "                   'base_estimator__reg_lambda': Real(0, 1),\n",
  "                   'base_estimator__reg_alpha': Real(0, 1),\n",
  "                   'base_estimator__max_depth': Integer(0, 500)}\n",
  "# Nested resampling using skopt. see: https://github.com/scikit-optimize/scikit-optimize/issues/725\n",
  "# NOTE(review): no scoring is passed, so the search presumably optimises the estimator's default\n",
  "# score (accuracy) rather than the neg_log_loss used for the logistic grid searches - confirm.\n",
  "searchcv = BayesSearchCV(estimator=lgbm,\n",
  "                         n_iter=100,\n",
  "                         search_spaces=lgbm_param_grid,\n",
  "                         cv=5,\n",
  "                         n_jobs=-1,\n",
  "                         random_state=seed)  # seed the optimiser so the search is reproducible\n",
  "searchcv.fit(X_train, y_train)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Permutation importance" ] },
{ "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [
  "# note not using fake data for permutation importance: X_train/y_train had the fake shots\n",
  "# concatenated onto them, so mask out the rows whose index labels come from X_fake\n",
  "is_real = ~X_train.index.isin(X_fake.index)\n",
  "perm_result = permutation_importance(searchcv.best_estimator_, X_train[is_real], y_train[is_real],\n",
  "                                     n_repeats=10, random_state=seed)\n",
  "df_perm_importance = pd.DataFrame({'Feature':X_train.columns,\n",
  "                                   'importance': perm_result.importances.mean(axis=1),\n",
  "                                   'std_dev': perm_result.importances.std(axis=1)})\n",
  "df_perm_importance.sort_values('importance', ascending=False, inplace=True)\n",
  "df_perm_importance.reset_index(drop=True, inplace=True)" ] },
{ "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Feature | \n", "importance | \n", "std_dev | \n", "
---|---|---|---|
0 | \n", "x | \n", "1.435942e-02 | \n", "0.000504 | \n", "
1 | \n", "y | \n", "1.153148e-02 | \n", "0.000537 | \n", "
2 | \n", "goalkeeper_x | \n", "1.199962e-03 | \n", "0.000086 | \n", "
3 | \n", "n_angle | \n", "9.439190e-04 | \n", "0.000156 | \n", "
4 | \n", "shot_type_name | \n", "8.350053e-04 | \n", "0.000145 | \n", "
5 | \n", "goalkeeper_y | \n", "7.413777e-04 | \n", "0.000124 | \n", "
6 | \n", "body_part_name | \n", "5.254610e-04 | \n", "0.000176 | \n", "
7 | \n", "carry_length | \n", "5.216394e-04 | \n", "0.000117 | \n", "
8 | \n", "pass_technique_name | \n", "4.681380e-04 | \n", "0.000101 | \n", "
9 | \n", "shot_open_goal | \n", "4.337441e-04 | \n", "0.000046 | \n", "
10 | \n", "area_shot | \n", "3.993503e-04 | \n", "0.000115 | \n", "
11 | \n", "pass_height_name | \n", "3.917073e-04 | \n", "0.000154 | \n", "
12 | \n", "pass_end_x | \n", "2.770612e-04 | \n", "0.000081 | \n", "
13 | \n", "assist_type | \n", "2.541320e-04 | \n", "0.000090 | \n", "
14 | \n", "strong_foot | \n", "2.197382e-04 | \n", "0.000116 | \n", "
15 | \n", "pass_end_y | \n", "2.159167e-04 | \n", "0.000110 | \n", "
16 | \n", "area_goal | \n", "1.948983e-04 | \n", "0.000166 | \n", "
17 | \n", "pass_cross | \n", "1.318429e-04 | \n", "0.000153 | \n", "
18 | \n", "counter_attack | \n", "9.362759e-05 | \n", "0.000065 | \n", "
19 | \n", "pass_switch | \n", "5.159071e-05 | \n", "0.000059 | \n", "
20 | \n", "fast_break | \n", "2.101844e-05 | \n", "0.000039 | \n", "
21 | \n", "under_pressure | \n", "1.110223e-17 | \n", "0.000052 | \n", "
22 | \n", "smart_pass | \n", "-5.732302e-06 | \n", "0.000028 | \n", "
23 | \n", "shot_one_on_one | \n", "-1.146460e-05 | \n", "0.000018 | \n", "
24 | \n", "pass_cut_back | \n", "-2.101844e-05 | \n", "0.000025 | \n", "