{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from mplsoccer.pitch import Pitch\n",
"\n",
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Load the match event data. The first CSV column is an unnamed saved row\n",
"# index (it otherwise loads as a stray 'Unnamed: 0' data column), so read\n",
"# it as the DataFrame index instead.\n",
"df = pd.read_csv('kmeanstutorial.csv', index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Unnamed: 0 \n",
" ball_receipt_outcome \n",
" ball_recovery_recovery_failure \n",
" block_deflection \n",
" carry_end_location \n",
" clearance_aerial_won \n",
" counterpress \n",
" dribble_outcome \n",
" dribble_overrun \n",
" duel_outcome \n",
" ... \n",
" shot_statsbomb_xg \n",
" shot_technique \n",
" shot_type \n",
" substitution_outcome \n",
" substitution_replacement \n",
" tactics \n",
" team \n",
" timestamp \n",
" type \n",
" under_pressure \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0 \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" ... \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" {'formation': 442, 'lineup': [{'player': {'id'... \n",
" France \n",
" 00:00:00.000 \n",
" Starting XI \n",
" NaN \n",
" \n",
" \n",
" 1 \n",
" 1 \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" ... \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" {'formation': 433, 'lineup': [{'player': {'id'... \n",
" Croatia \n",
" 00:00:00.000 \n",
" Starting XI \n",
" NaN \n",
" \n",
" \n",
" 2 \n",
" 2 \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" ... \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" France \n",
" 00:00:00.000 \n",
" Half Start \n",
" NaN \n",
" \n",
" \n",
" 3 \n",
" 3 \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" ... \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" Croatia \n",
" 00:00:00.000 \n",
" Half Start \n",
" NaN \n",
" \n",
" \n",
" 4 \n",
" 4 \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" ... \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" NaN \n",
" Croatia \n",
" 00:00:00.000 \n",
" Half Start \n",
" NaN \n",
" \n",
" \n",
"
\n",
"
5 rows × 74 columns
\n",
"
"
],
"text/plain": [
" Unnamed: 0 ball_receipt_outcome ball_recovery_recovery_failure \\\n",
"0 0 NaN NaN \n",
"1 1 NaN NaN \n",
"2 2 NaN NaN \n",
"3 3 NaN NaN \n",
"4 4 NaN NaN \n",
"\n",
" block_deflection carry_end_location clearance_aerial_won counterpress \\\n",
"0 NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN \n",
"\n",
" dribble_outcome dribble_overrun duel_outcome ... shot_statsbomb_xg \\\n",
"0 NaN NaN NaN ... NaN \n",
"1 NaN NaN NaN ... NaN \n",
"2 NaN NaN NaN ... NaN \n",
"3 NaN NaN NaN ... NaN \n",
"4 NaN NaN NaN ... NaN \n",
"\n",
" shot_technique shot_type substitution_outcome substitution_replacement \\\n",
"0 NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN \n",
"\n",
" tactics team timestamp \\\n",
"0 {'formation': 442, 'lineup': [{'player': {'id'... France 00:00:00.000 \n",
"1 {'formation': 433, 'lineup': [{'player': {'id'... Croatia 00:00:00.000 \n",
"2 NaN France 00:00:00.000 \n",
"3 NaN Croatia 00:00:00.000 \n",
"4 NaN Croatia 00:00:00.000 \n",
"\n",
" type under_pressure \n",
"0 Starting XI NaN \n",
"1 Starting XI NaN \n",
"2 Half Start NaN \n",
"3 Half Start NaN \n",
"4 Half Start NaN \n",
"\n",
"[5 rows x 74 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Unnamed: 0', 'ball_receipt_outcome', 'ball_recovery_recovery_failure',\n",
" 'block_deflection', 'carry_end_location', 'clearance_aerial_won',\n",
" 'counterpress', 'dribble_outcome', 'dribble_overrun', 'duel_outcome',\n",
" 'duel_type', 'duration', 'foul_committed_advantage',\n",
" 'foul_committed_card', 'foul_committed_penalty', 'foul_committed_type',\n",
" 'foul_won_advantage', 'foul_won_defensive', 'goalkeeper_body_part',\n",
" 'goalkeeper_end_location', 'goalkeeper_outcome', 'goalkeeper_position',\n",
" 'goalkeeper_technique', 'goalkeeper_type', 'id', 'index',\n",
" 'injury_stoppage_in_chain', 'interception_outcome', 'location',\n",
" 'match_id', 'minute', 'pass_aerial_won', 'pass_angle',\n",
" 'pass_assisted_shot_id', 'pass_backheel', 'pass_body_part',\n",
" 'pass_cross', 'pass_cut_back', 'pass_deflected', 'pass_end_location',\n",
" 'pass_goal_assist', 'pass_height', 'pass_length', 'pass_outcome',\n",
" 'pass_recipient', 'pass_shot_assist', 'pass_switch', 'pass_type',\n",
" 'period', 'play_pattern', 'player', 'position', 'possession',\n",
" 'possession_team', 'related_events', 'second', 'shot_aerial_won',\n",
" 'shot_body_part', 'shot_deflected', 'shot_end_location',\n",
" 'shot_first_time', 'shot_freeze_frame', 'shot_key_pass_id',\n",
" 'shot_outcome', 'shot_statsbomb_xg', 'shot_technique', 'shot_type',\n",
" 'substitution_outcome', 'substitution_replacement', 'tactics', 'team',\n",
" 'timestamp', 'type', 'under_pressure'],\n",
" dtype='object')"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the columns needed to describe a pass\n",
"pass_columns = ['team','type','location','pass_end_location']\n",
"df = df.loc[:, pass_columns]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" team \n",
" type \n",
" location \n",
" pass_end_location \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" France \n",
" Starting XI \n",
" NaN \n",
" NaN \n",
" \n",
" \n",
" 1 \n",
" Croatia \n",
" Starting XI \n",
" NaN \n",
" NaN \n",
" \n",
" \n",
" 2 \n",
" France \n",
" Half Start \n",
" NaN \n",
" NaN \n",
" \n",
" \n",
" 3 \n",
" Croatia \n",
" Half Start \n",
" NaN \n",
" NaN \n",
" \n",
" \n",
" 4 \n",
" Croatia \n",
" Half Start \n",
" NaN \n",
" NaN \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" team type location pass_end_location\n",
"0 France Starting XI NaN NaN\n",
"1 Croatia Starting XI NaN NaN\n",
"2 France Half Start NaN NaN\n",
"3 Croatia Half Start NaN NaN\n",
"4 Croatia Half Start NaN NaN"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Restrict to France's passes; reset_index() keeps the previous row labels\n",
"# in a new 'index' column\n",
"is_france = df['team'].eq('France')\n",
"is_pass = df['type'].eq('Pass')\n",
"df = df.loc[is_france & is_pass].reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" index \n",
" team \n",
" type \n",
" location \n",
" pass_end_location \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 11 \n",
" France \n",
" Pass \n",
" [48.0, 50.0] \n",
" [48.0, 60.0] \n",
" \n",
" \n",
" 1 \n",
" 24 \n",
" France \n",
" Pass \n",
" [49.0, 80.0] \n",
" [46.0, 61.0] \n",
" \n",
" \n",
" 2 \n",
" 25 \n",
" France \n",
" Pass \n",
" [65.0, 64.0] \n",
" [66.0, 69.0] \n",
" \n",
" \n",
" 3 \n",
" 28 \n",
" France \n",
" Pass \n",
" [63.0, 73.0] \n",
" [65.0, 79.0] \n",
" \n",
" \n",
" 4 \n",
" 29 \n",
" France \n",
" Pass \n",
" [58.0, 79.0] \n",
" [26.0, 69.0] \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index team type location pass_end_location\n",
"0 11 France Pass [48.0, 50.0] [48.0, 60.0]\n",
"1 24 France Pass [49.0, 80.0] [46.0, 61.0]\n",
"2 25 France Pass [65.0, 64.0] [66.0, 69.0]\n",
"3 28 France Pass [63.0, 73.0] [65.0, 79.0]\n",
"4 29 France Pass [58.0, 79.0] [26.0, 69.0]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dtype('O')"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.location.dtype"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Whitespace-split each '[x, y]' string into two raw tokens; the brackets\n",
"# and comma are still attached at this point and are removed in a later cell\n",
"start_tokens = df['location'].str.split(expand=True)\n",
"end_tokens = df['pass_end_location'].str.split(expand=True)\n",
"df[['x','y']] = start_tokens\n",
"df[['endX','endY']] = end_tokens"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" index \n",
" team \n",
" type \n",
" location \n",
" pass_end_location \n",
" x \n",
" y \n",
" endX \n",
" endY \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 11 \n",
" France \n",
" Pass \n",
" [48.0, 50.0] \n",
" [48.0, 60.0] \n",
" [48.0, \n",
" 50.0] \n",
" [48.0, \n",
" 60.0] \n",
" \n",
" \n",
" 1 \n",
" 24 \n",
" France \n",
" Pass \n",
" [49.0, 80.0] \n",
" [46.0, 61.0] \n",
" [49.0, \n",
" 80.0] \n",
" [46.0, \n",
" 61.0] \n",
" \n",
" \n",
" 2 \n",
" 25 \n",
" France \n",
" Pass \n",
" [65.0, 64.0] \n",
" [66.0, 69.0] \n",
" [65.0, \n",
" 64.0] \n",
" [66.0, \n",
" 69.0] \n",
" \n",
" \n",
" 3 \n",
" 28 \n",
" France \n",
" Pass \n",
" [63.0, 73.0] \n",
" [65.0, 79.0] \n",
" [63.0, \n",
" 73.0] \n",
" [65.0, \n",
" 79.0] \n",
" \n",
" \n",
" 4 \n",
" 29 \n",
" France \n",
" Pass \n",
" [58.0, 79.0] \n",
" [26.0, 69.0] \n",
" [58.0, \n",
" 79.0] \n",
" [26.0, \n",
" 69.0] \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index team type location pass_end_location x y endX \\\n",
"0 11 France Pass [48.0, 50.0] [48.0, 60.0] [48.0, 50.0] [48.0, \n",
"1 24 France Pass [49.0, 80.0] [46.0, 61.0] [49.0, 80.0] [46.0, \n",
"2 25 France Pass [65.0, 64.0] [66.0, 69.0] [65.0, 64.0] [66.0, \n",
"3 28 France Pass [63.0, 73.0] [65.0, 79.0] [63.0, 73.0] [65.0, \n",
"4 29 France Pass [58.0, 79.0] [26.0, 69.0] [58.0, 79.0] [26.0, \n",
"\n",
" endY \n",
"0 60.0] \n",
"1 61.0] \n",
"2 69.0] \n",
"3 79.0] \n",
"4 69.0] "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Remove the leftover '[', ']' and ',' from each coordinate token and convert\n",
"# to float. .str.strip does not depend on which side the punctuation is on\n",
"# and passes NaN through, unlike positional slicing inside a lambda.\n",
"coord_cols = ['x','y','endX','endY']\n",
"for col in coord_cols:\n",
"    df[col] = df[col].str.strip('[], ').astype(float)\n",
"\n",
"# The raw string columns are no longer needed once the coordinates are numeric\n",
"df = df.drop(['location','pass_end_location'],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" index \n",
" team \n",
" type \n",
" x \n",
" y \n",
" endX \n",
" endY \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 11 \n",
" France \n",
" Pass \n",
" 48.0 \n",
" 50.0 \n",
" 48.0 \n",
" 60.0 \n",
" \n",
" \n",
" 1 \n",
" 24 \n",
" France \n",
" Pass \n",
" 49.0 \n",
" 80.0 \n",
" 46.0 \n",
" 61.0 \n",
" \n",
" \n",
" 2 \n",
" 25 \n",
" France \n",
" Pass \n",
" 65.0 \n",
" 64.0 \n",
" 66.0 \n",
" 69.0 \n",
" \n",
" \n",
" 3 \n",
" 28 \n",
" France \n",
" Pass \n",
" 63.0 \n",
" 73.0 \n",
" 65.0 \n",
" 79.0 \n",
" \n",
" \n",
" 4 \n",
" 29 \n",
" France \n",
" Pass \n",
" 58.0 \n",
" 79.0 \n",
" 26.0 \n",
" 69.0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index team type x y endX endY\n",
"0 11 France Pass 48.0 50.0 48.0 60.0\n",
"1 24 France Pass 49.0 80.0 46.0 61.0\n",
"2 25 France Pass 65.0 64.0 66.0 69.0\n",
"3 28 France Pass 63.0 73.0 65.0 79.0\n",
"4 29 France Pass 58.0 79.0 26.0 69.0"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Implement k-means clustering"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Cluster passes on their start and end coordinates\n",
"X = df[['x','y','endX','endY']].to_numpy()\n",
"\n",
"# n_init is set explicitly: scikit-learn's default changed from 10 to 'auto',\n",
"# so leaving it implicit gives different clusters on newer versions.\n",
"kmeans = KMeans(n_clusters=10, n_init=10, random_state=100)\n",
"\n",
"# fit_predict fits the model and returns the label of every row in one step\n",
"df['cluster'] = kmeans.fit_predict(X)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" index \n",
" team \n",
" type \n",
" x \n",
" y \n",
" endX \n",
" endY \n",
" cluster \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 11 \n",
" France \n",
" Pass \n",
" 48.0 \n",
" 50.0 \n",
" 48.0 \n",
" 60.0 \n",
" 0 \n",
" \n",
" \n",
" 1 \n",
" 24 \n",
" France \n",
" Pass \n",
" 49.0 \n",
" 80.0 \n",
" 46.0 \n",
" 61.0 \n",
" 0 \n",
" \n",
" \n",
" 2 \n",
" 25 \n",
" France \n",
" Pass \n",
" 65.0 \n",
" 64.0 \n",
" 66.0 \n",
" 69.0 \n",
" 0 \n",
" \n",
" \n",
" 3 \n",
" 28 \n",
" France \n",
" Pass \n",
" 63.0 \n",
" 73.0 \n",
" 65.0 \n",
" 79.0 \n",
" 0 \n",
" \n",
" \n",
" 4 \n",
" 29 \n",
" France \n",
" Pass \n",
" 58.0 \n",
" 79.0 \n",
" 26.0 \n",
" 69.0 \n",
" 0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index team type x y endX endY cluster\n",
"0 11 France Pass 48.0 50.0 48.0 60.0 0\n",
"1 24 France Pass 49.0 80.0 46.0 61.0 0\n",
"2 25 France Pass 65.0 64.0 66.0 69.0 0\n",
"3 28 France Pass 63.0 73.0 65.0 79.0 0\n",
"4 29 France Pass 58.0 79.0 26.0 69.0 0"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 41\n",
"1 40\n",
"6 38\n",
"8 31\n",
"3 31\n",
"7 28\n",
"2 26\n",
"9 25\n",
"4 18\n",
"5 14\n",
"Name: cluster, dtype: int64"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.cluster.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig, ax = plt.subplots(figsize=(10,10))\n",
"fig.set_facecolor('#38383b')\n",
"ax.patch.set_facecolor('#38383b')\n",
"\n",
"pitch = Pitch(pitch_type='statsbomb',orientation='horizontal',\n",
"              pitch_color='#38383b',line_color='white',figsize=(10,10),\n",
"              constrained_layout=False,tight_layout=True,view='full')\n",
"\n",
"pitch.draw(ax=ax)\n",
"\n",
"# Clusters to highlight and the colour used for each one\n",
"cluster_colors = {0: '#74c69d', 5: '#add8e6'}\n",
"\n",
"# Draw each highlighted cluster with a single vectorised call rather than\n",
"# looking up every pass row by row\n",
"for cluster_id, color in cluster_colors.items():\n",
"    passes = df[df['cluster'] == cluster_id]\n",
"    if passes.empty:\n",
"        continue\n",
"    pitch.lines(xstart=passes['x'],ystart=passes['y'],xend=passes['endX'],yend=passes['endY'],\n",
"                color=color,lw=3,zorder=2,comet=True,ax=ax)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}