In [1]:
%load_ext autoreload
%autoreload 2
import os
import warnings
import tqdm
import pandas as pd
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab

In [2]:
## Configure file and folder names
datafolder = "../data-fifa"
spadl_h5 = os.path.join(datafolder, "spadl-statsbomb.h5")
features_h5 = os.path.join(datafolder, "features.h5")
labels_h5 = os.path.join(datafolder, "labels.h5")
predictions_h5 = os.path.join(datafolder, "predictions.h5")

In [3]:
games = pd.read_hdf(spadl_h5, "games")
games = games[games.competition_id == 43]
traingames = games
testgames = games
print("nb of games:", len(games))

nb of games: 64


In [4]:
# 1. Select feature set X
xfns = [
 fs.actiontype,
 fs.actiontype_onehot,
 #fs.bodypart,
 fs.bodypart_onehot,
 fs.result,
 fs.result_onehot,
 fs.goalscore,
 fs.startlocation,
 fs.endlocation,
 fs.movement,
 fs.space_delta,
 fs.startpolar,
 fs.endpolar,
 fs.team,
 #fs.time,
 fs.time_delta,
 #fs.actiontype_result_onehot
]
nb_prev_actions = 1

Xcols = fs.feature_column_names(xfns, nb_prev_actions)

def getXY(games,Xcols):
 # generate the columns of the selected feature
 X = []
 for game_id in tqdm.tqdm(games.game_id, desc="Selecting features"):
 Xi = pd.read_hdf(features_h5, f"game_{game_id}")
 X.append(Xi[Xcols])
 X = pd.concat(X).reset_index(drop=True)

 # 2. Select label Y
 Ycols = ["scores","concedes"]
 Y = []
 for game_id in tqdm.tqdm(games.game_id, desc="Selecting label"):
 Yi = pd.read_hdf(labels_h5, f"game_{game_id}")
 Y.append(Yi[Ycols])
 Y = pd.concat(Y).reset_index(drop=True)
 return X, Y

X, Y = getXY(traingames,Xcols)
print("X:", list(X.columns))
print("Y:", list(Y.columns))
X = X.fillna(0)

Selecting features: 100%|██████████| 64/64 [00:02<00:00, 31.97it/s]
Selecting label: 100%|██████████| 64/64 [00:00<00:00, 87.65it/s]

X: ['type_id_a0', 'type_pass_a0', 'type_cross_a0', 'type_throw_in_a0', 'type_freekick_crossed_a0', 'type_freekick_short_a0', 'type_corner_crossed_a0', 'type_corner_short_a0', 'type_take_on_a0', 'type_foul_a0', 'type_tackle_a0', 'type_interception_a0', 'type_shot_a0', 'type_shot_penalty_a0', 'type_shot_freekick_a0', 'type_keeper_save_a0', 'type_keeper_claim_a0', 'type_keeper_punch_a0', 'type_keeper_pick_up_a0', 'type_clearance_a0', 'type_bad_touch_a0', 'type_non_action_a0', 'type_dribble_a0', 'type_goalkick_a0', 'bodypart_foot_a0', 'bodypart_head_a0', 'bodypart_other_a0', 'bodypart_head/other_a0', 'result_id_a0', 'result_fail_a0', 'result_success_a0', 'result_offside_a0', 'result_owngoal_a0', 'result_yellow_card_a0', 'result_red_card_a0', 'goalscore_team', 'goalscore_opponent', 'goalscore_diff', 'start_x_a0', 'start_y_a0', 'end_x_a0', 'end_y_a0', 'dx_a0', 'dy_a0', 'movement_a0', 'start_dist_to_goal_a0', 'start_angle_to_goal_a0', 'end_dist_to_goal_a0', 'end_angle_to_goal_a0']
Y: ['scores




In [5]:
%%time
# 3. train classifiers F(X) = Y
import xgboost

Y_hat = pd.DataFrame()
models = {}
for col in list(Y.columns):
 model = xgboost.XGBClassifier(n_estimators=50, max_depth=3, n_jobs=-3, verbosity=2)
 model.fit(X, Y[col])
 models[col] = model



[16:03:01] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 12 extra nodes, 0 pruned nodes, max_depth=3
[16:03:01] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:03:01] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:03:01] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:03:01] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:03:01] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc

[16:03:02] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:03:02] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 12 extra nodes, 0 pruned nodes, max_depth=3
[16:03:02] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:03:03] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 10 extra nodes, 0 pruned nodes, max_depth=3
[16:03:03] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 10 extra nodes, 0 pruned nodes, max_depth=3
[16:03:03] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc

[16:03:04] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 12 extra nodes, 0 pruned nodes, max_depth=3
[16:03:04] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 12 extra nodes, 0 pruned nodes, max_depth=3
[16:03:04] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:03:04] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 12 extra nodes, 0 pruned nodes, max_depth=3
[16:03:04] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 8 extra nodes, 0 pruned nodes, max_depth=3
[16:03:04] INFO: /tmp/pip-build-5lk6kad5/xgboost/build/temp.linux-x86_64-3.6/xgboost/src/tree/updater_prune.cc:

In [6]:
from sklearn.metrics import brier_score_loss, roc_auc_score, log_loss

testX, testY = X, Y

def evaluate(y, y_hat):
 p = sum(y) / len(y)
 base = [p] * len(y)
 brier = brier_score_loss(y, y_hat)
 print(f" Brier score: %.5f (%.5f)" % (brier, brier / brier_score_loss(y, base)))
 ll = log_loss(y, y_hat)
 print(f" log loss score: %.5f (%.5f)" % (ll, ll / log_loss(y, base)))
 print(f" ROC AUC: %.5f" % roc_auc_score(y, y_hat))

for col in testY.columns:
 Y_hat[col] = [p[1] for p in models[col].predict_proba(testX)]
 print(f"### Y: {col} ###")
 evaluate(testY[col], Y_hat[col])

### Y: scores ###
 Brier score: 0.00907 (0.82108)
 log loss score: 0.04501 (0.73379)
 ROC AUC: 0.85998
### Y: concedes ###
 Brier score: 0.00235 (0.80988)
 log loss score: 0.01342 (0.67484)
 ROC AUC: 0.89972


### Save predictions

In [7]:
# get rows with game id per action
A = []
for game_id in tqdm.tqdm(games.game_id, "Loading game ids"):
 Ai = pd.read_hdf(spadl_h5, f"actions/game_{game_id}")
 A.append(Ai[["game_id"]])
A = pd.concat(A)
A = A.reset_index(drop=True)

# concatenate action game id rows with predictions and save per game
grouped_predictions = pd.concat([A, Y_hat], axis=1).groupby("game_id")
for k,df in tqdm.tqdm(grouped_predictions, desc="Saving predictions per game"):
 df = df.reset_index(drop=True)
 df[Y_hat.columns].to_hdf(predictions_h5, f"game_{int(k)}")

Loading game ids: 100%|██████████| 64/64 [00:01<00:00, 49.43it/s]
Saving predictions per game: 100%|██████████| 64/64 [00:03<00:00, 16.09it/s]
