{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from mplsoccer.pitch import Pitch\n", "\n", "from sklearn.cluster import KMeans" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [
 "# Read the event data CSV into a DataFrame.\n",
 "df = pd.read_csv(filepath_or_buffer='kmeanstutorial.csv')"
 ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0ball_receipt_outcomeball_recovery_recovery_failureblock_deflectioncarry_end_locationclearance_aerial_woncounterpressdribble_outcomedribble_overrunduel_outcome...shot_statsbomb_xgshot_techniqueshot_typesubstitution_outcomesubstitution_replacementtacticsteamtimestamptypeunder_pressure
00NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaN{'formation': 442, 'lineup': [{'player': {'id'...France00:00:00.000Starting XINaN
11NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaN{'formation': 433, 'lineup': [{'player': {'id'...Croatia00:00:00.000Starting XINaN
22NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNFrance00:00:00.000Half StartNaN
33NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNCroatia00:00:00.000Half StartNaN
44NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNCroatia00:00:00.000Half StartNaN
\n", "

5 rows × 74 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 ball_receipt_outcome ball_recovery_recovery_failure \\\n", "0 0 NaN NaN \n", "1 1 NaN NaN \n", "2 2 NaN NaN \n", "3 3 NaN NaN \n", "4 4 NaN NaN \n", "\n", " block_deflection carry_end_location clearance_aerial_won counterpress \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " dribble_outcome dribble_overrun duel_outcome ... shot_statsbomb_xg \\\n", "0 NaN NaN NaN ... NaN \n", "1 NaN NaN NaN ... NaN \n", "2 NaN NaN NaN ... NaN \n", "3 NaN NaN NaN ... NaN \n", "4 NaN NaN NaN ... NaN \n", "\n", " shot_technique shot_type substitution_outcome substitution_replacement \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "\n", " tactics team timestamp \\\n", "0 {'formation': 442, 'lineup': [{'player': {'id'... France 00:00:00.000 \n", "1 {'formation': 433, 'lineup': [{'player': {'id'... Croatia 00:00:00.000 \n", "2 NaN France 00:00:00.000 \n", "3 NaN Croatia 00:00:00.000 \n", "4 NaN Croatia 00:00:00.000 \n", "\n", " type under_pressure \n", "0 Starting XI NaN \n", "1 Starting XI NaN \n", "2 Half Start NaN \n", "3 Half Start NaN \n", "4 Half Start NaN \n", "\n", "[5 rows x 74 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Unnamed: 0', 'ball_receipt_outcome', 'ball_recovery_recovery_failure',\n", " 'block_deflection', 'carry_end_location', 'clearance_aerial_won',\n", " 'counterpress', 'dribble_outcome', 'dribble_overrun', 'duel_outcome',\n", " 'duel_type', 'duration', 'foul_committed_advantage',\n", " 'foul_committed_card', 'foul_committed_penalty', 'foul_committed_type',\n", " 'foul_won_advantage', 'foul_won_defensive', 'goalkeeper_body_part',\n", " 'goalkeeper_end_location', 'goalkeeper_outcome', 
'goalkeeper_position',\n", " 'goalkeeper_technique', 'goalkeeper_type', 'id', 'index',\n", " 'injury_stoppage_in_chain', 'interception_outcome', 'location',\n", " 'match_id', 'minute', 'pass_aerial_won', 'pass_angle',\n", " 'pass_assisted_shot_id', 'pass_backheel', 'pass_body_part',\n", " 'pass_cross', 'pass_cut_back', 'pass_deflected', 'pass_end_location',\n", " 'pass_goal_assist', 'pass_height', 'pass_length', 'pass_outcome',\n", " 'pass_recipient', 'pass_shot_assist', 'pass_switch', 'pass_type',\n", " 'period', 'play_pattern', 'player', 'position', 'possession',\n", " 'possession_team', 'related_events', 'second', 'shot_aerial_won',\n", " 'shot_body_part', 'shot_deflected', 'shot_end_location',\n", " 'shot_first_time', 'shot_freeze_frame', 'shot_key_pass_id',\n", " 'shot_outcome', 'shot_statsbomb_xg', 'shot_technique', 'shot_type',\n", " 'substitution_outcome', 'substitution_replacement', 'tactics', 'team',\n", " 'timestamp', 'type', 'under_pressure'],\n", " dtype='object')" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [
 "# Keep only the four columns used below: who acted, what kind of event it was,\n",
 "# and the start / end location of the pass.\n",
 "df = df.loc[:, ['team', 'type', 'location', 'pass_end_location']]"
 ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
teamtypelocationpass_end_location
0FranceStarting XINaNNaN
1CroatiaStarting XINaNNaN
2FranceHalf StartNaNNaN
3CroatiaHalf StartNaNNaN
4CroatiaHalf StartNaNNaN
\n", "
" ], "text/plain": [ " team type location pass_end_location\n", "0 France Starting XI NaN NaN\n", "1 Croatia Starting XI NaN NaN\n", "2 France Half Start NaN NaN\n", "3 Croatia Half Start NaN NaN\n", "4 Croatia Half Start NaN NaN" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [
 "# Keep only the rows where the team is France and the event type is Pass.\n",
 "# reset_index() renumbers the rows from 0 and keeps the previous row labels\n",
 "# in a new 'index' column.\n",
 "df = df.loc[df['team'].eq('France') & df['type'].eq('Pass')].reset_index()"
 ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexteamtypelocationpass_end_location
011FrancePass[48.0, 50.0][48.0, 60.0]
124FrancePass[49.0, 80.0][46.0, 61.0]
225FrancePass[65.0, 64.0][66.0, 69.0]
328FrancePass[63.0, 73.0][65.0, 79.0]
429FrancePass[58.0, 79.0][26.0, 69.0]
\n", "
" ], "text/plain": [ " index team type location pass_end_location\n", "0 11 France Pass [48.0, 50.0] [48.0, 60.0]\n", "1 24 France Pass [49.0, 80.0] [46.0, 61.0]\n", "2 25 France Pass [65.0, 64.0] [66.0, 69.0]\n", "3 28 France Pass [63.0, 73.0] [65.0, 79.0]\n", "4 29 France Pass [58.0, 79.0] [26.0, 69.0]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dtype('O')" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.location.dtype" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [
 "# The locations are stored as text such as '[48.0, 50.0]' (object dtype).\n",
 "# Splitting on whitespace gives two string pieces per point, e.g. '[48.0,'\n",
 "# and '50.0]'; the leftover punctuation is removed in a later step.\n",
 "df[['x', 'y']] = df['location'].str.split(expand=True)\n",
 "df[['endX', 'endY']] = df['pass_end_location'].str.split(expand=True)"
 ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexteamtypelocationpass_end_locationxyendXendY
011FrancePass[48.0, 50.0][48.0, 60.0][48.0,50.0][48.0,60.0]
124FrancePass[49.0, 80.0][46.0, 61.0][49.0,80.0][46.0,61.0]
225FrancePass[65.0, 64.0][66.0, 69.0][65.0,64.0][66.0,69.0]
328FrancePass[63.0, 73.0][65.0, 79.0][63.0,73.0][65.0,79.0]
429FrancePass[58.0, 79.0][26.0, 69.0][58.0,79.0][26.0,69.0]
\n", "
" ], "text/plain": [ " index team type location pass_end_location x y endX \\\n", "0 11 France Pass [48.0, 50.0] [48.0, 60.0] [48.0, 50.0] [48.0, \n", "1 24 France Pass [49.0, 80.0] [46.0, 61.0] [49.0, 80.0] [46.0, \n", "2 25 France Pass [65.0, 64.0] [66.0, 69.0] [65.0, 64.0] [66.0, \n", "3 28 France Pass [63.0, 73.0] [65.0, 79.0] [63.0, 73.0] [65.0, \n", "4 29 France Pass [58.0, 79.0] [26.0, 69.0] [58.0, 79.0] [26.0, \n", "\n", " endY \n", "0 60.0] \n", "1 61.0] \n", "2 69.0] \n", "3 79.0] \n", "4 69.0] " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [
 "# The split pieces still carry list punctuation (e.g. '[48.0,' and '50.0]').\n",
 "# Strip those characters by name rather than slicing by position, and cast to\n",
 "# str first so this cell can be re-run after the columns are already numeric.\n",
 "for coord in ['x', 'y', 'endX', 'endY']:\n",
 "    df[coord] = df[coord].astype(str).str.strip('[], ').astype(float)\n",
 "\n",
 "# The raw text columns are no longer needed once the numeric ones exist;\n",
 "# errors='ignore' keeps a re-run from failing when they are already gone.\n",
 "df = df.drop(columns=['location', 'pass_end_location'], errors='ignore')"
 ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexteamtypexyendXendY
011FrancePass48.050.048.060.0
124FrancePass49.080.046.061.0
225FrancePass65.064.066.069.0
328FrancePass63.073.065.079.0
429FrancePass58.079.026.069.0
\n", "
" ], "text/plain": [ " index team type x y endX endY\n", "0 11 France Pass 48.0 50.0 48.0 60.0\n", "1 24 France Pass 49.0 80.0 46.0 61.0\n", "2 25 France Pass 65.0 64.0 66.0 69.0\n", "3 28 France Pass 63.0 73.0 65.0 79.0\n", "4 29 France Pass 58.0 79.0 26.0 69.0" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Implement k-means clustering of the passes."
 ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [
 "# Number of clusters and seed, named so they are easy to find and tune.\n",
 "N_CLUSTERS = 10\n",
 "RANDOM_STATE = 100\n",
 "\n",
 "# One sample per pass: start point (x, y) followed by end point (endX, endY).\n",
 "X = df[['x', 'y', 'endX', 'endY']].to_numpy()\n",
 "\n",
 "# n_init is pinned explicitly because scikit-learn's default for it changed\n",
 "# across releases; fixing it keeps the labels independent of the installed version.\n",
 "kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=RANDOM_STATE).fit(X)\n",
 "\n",
 "# Store the cluster label assigned to each pass next to its coordinates.\n",
 "df['cluster'] = kmeans.predict(X)"
 ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexteamtypexyendXendYcluster
011FrancePass48.050.048.060.00
124FrancePass49.080.046.061.00
225FrancePass65.064.066.069.00
328FrancePass63.073.065.079.00
429FrancePass58.079.026.069.00
\n", "
" ], "text/plain": [ " index team type x y endX endY cluster\n", "0 11 France Pass 48.0 50.0 48.0 60.0 0\n", "1 24 France Pass 49.0 80.0 46.0 61.0 0\n", "2 25 France Pass 65.0 64.0 66.0 69.0 0\n", "3 28 France Pass 63.0 73.0 65.0 79.0 0\n", "4 29 France Pass 58.0 79.0 26.0 69.0 0" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 41\n", "1 40\n", "6 38\n", "8 31\n", "3 31\n", "7 28\n", "2 26\n", "9 25\n", "4 18\n", "5 14\n", "Name: cluster, dtype: int64" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cluster.value_counts()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Colour shared by the figure background, the axes patch and the pitch.\n",
 "BACKGROUND_COLOUR = '#38383b'\n",
 "# Clusters to highlight and the line colour for each one; add entries to\n",
 "# this mapping to draw further clusters.\n",
 "CLUSTER_COLOURS = {0: '#74c69d', 5: '#add8e6'}\n",
 "\n",
 "fig, ax = plt.subplots(figsize=(10, 10))\n",
 "fig.set_facecolor(BACKGROUND_COLOUR)\n",
 "ax.patch.set_facecolor(BACKGROUND_COLOUR)\n",
 "\n",
 "pitch = Pitch(pitch_type='statsbomb', orientation='horizontal',\n",
 "              pitch_color=BACKGROUND_COLOUR, line_color='white', figsize=(10, 10),\n",
 "              constrained_layout=False, tight_layout=True, view='full')\n",
 "pitch.draw(ax=ax)\n",
 "\n",
 "# Draw every highlighted cluster with a single call that receives all of its\n",
 "# passes at once, instead of testing the cluster label row by row.\n",
 "for cluster_id, colour in CLUSTER_COLOURS.items():\n",
 "    cluster_passes = df.loc[df['cluster'] == cluster_id]\n",
 "    if cluster_passes.empty:\n",
 "        continue  # no passes carry this label, so there is nothing to draw\n",
 "    pitch.lines(xstart=cluster_passes['x'], ystart=cluster_passes['y'],\n",
 "                xend=cluster_passes['endX'], yend=cluster_passes['endY'],\n",
 "                color=colour, lw=3, zorder=2, comet=True, ax=ax)"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }