{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PCA-based model to find similar players\n",
    "\n",
    "Ranks players by how closely their per-90 statistics resemble those of a chosen player. Every statistic is min-max scaled, weighted by its correlation with the points taken in the game data (for one team, or for all games), reduced with PCA, and players are then ordered by Euclidean distance in the reduced space.\n",
    "\n",
    "**How to use:** run all the cells from a fresh kernel; the two interactive finders at the bottom of the notebook are the app."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "\n",
    "from IPython.display import HTML, display\n",
    "from ipywidgets import interact, interactive, fixed, interact_manual\n",
    "import ipywidgets as widgets\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.preprocessing import MinMaxScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('expand_frame_repr', False)\n",
    "pd.set_option('display.max_columns', 10)\n",
    "\n",
    "GAMES_URL = 'https://raw.githubusercontent.com/parth1902/test/master/games.csv'\n",
    "PLAYERS_URL = 'https://raw.githubusercontent.com/parth1902/test/master/players.csv'\n",
    "\n",
    "MIN_MINUTES = 500                    # only players with more minutes than this are kept\n",
    "POINTS_BY_RESULT = {'W': 3, 'D': 1}  # any other result is worth 0 points\n",
    "PCA_EXPLAINED_VARIANCE = .90         # share of variance the PCA components must explain\n",
    "MATCH_QUANTILE = 0.95                # the distance at this quantile maps to a 0 % match\n",
    "\n",
    "# Columns that describe a player rather than measure performance.\n",
    "INFO_COLUMNS = ['player', 'squad', 'position', 'age']\n",
    "ID_COLUMNS = ['player', 'nationality', 'position', 'squad', 'age',\n",
    "              'birth_year', 'games', 'games_starts', 'minutes']\n",
    "\n",
    "\n",
    "class color:\n",
    "    \"\"\"ANSI escape sequences for styling printed text.\"\"\"\n",
    "    PURPLE = '\\033[95m'\n",
    "    CYAN = '\\033[96m'\n",
    "    DARKCYAN = '\\033[36m'\n",
    "    BLUE = '\\033[94m'\n",
    "    GREEN = '\\033[92m'\n",
    "    YELLOW = '\\033[93m'\n",
    "    RED = '\\033[91m'\n",
    "    BOLD = '\\033[1m'\n",
    "    UNDERLINE = '\\033[4m'\n",
    "    END = '\\033[0m'\n",
    "\n",
    "\n",
    "display(HTML(''''''))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('by Parth Athale (@ParthAthale)\\n')\n",
    "print('Data credits to FBref/StatsBomb')\n",
    "print('Read methodology here: https://xgpershot.wordpress.com/2020/08/06/pca-based-model-to-identify-similar-players/')\n",
    "print('Read code here: https://github.com/parth1902/PCA_Player_Finder\\n')\n",
    "print('Some examples as a guide to do this:')\n",
    "print('If you want to find out a Pierre-Emile Højbjerg replacement for Southampton, choose Højbjerg, Southampton, Overall')\n",
    "print('If you want to find out a player like Leroy Sané for Barcelona, choose Sané, Barcelona, Overall')\n",
    "print('If you want to find out a player with the defensive ability of Wilfried Ndidi without any team constraint, choose Ndidi, Overall, Defensive work')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data\n",
    "\n",
    "The raw frames are never modified afterwards, so every later cell can be re-run on its own."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "games_raw = pd.read_csv(GAMES_URL, sep=',')\n",
    "players_raw = pd.read_csv(PLAYERS_URL, sep=',')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare the game data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(df):\n",
    "    \"\"\"Return the points one game row is worth: 3 for 'W', 1 for 'D', otherwise 0.\n",
    "\n",
    "    Row-wise equivalent of the vectorised mapping used below.\n",
    "    \"\"\"\n",
    "    return POINTS_BY_RESULT.get(df['result'], 0)\n",
    "\n",
    "\n",
    "# 'xg_for' is renamed so the game column shares its name with the player stat it weights.\n",
    "df = games_raw.rename(columns={'xg_for': 'xg'})\n",
    "df['Points taken'] = df['result'].map(POINTS_BY_RESULT).fillna(0).astype(int)\n",
    "\n",
    "# Keep the first of any columns that repeat another column's values or name.\n",
    "# Done once here rather than on every widget interaction.\n",
    "df = df.loc[:, ~df.T.duplicated(keep='first')]\n",
    "df = df.loc[:, ~df.columns.duplicated()]\n",
    "\n",
    "assert 'Points taken' in df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare the player data\n",
    "\n",
    "Goalkeepers and players with too few minutes are removed, counting stats are converted to per-90 values, and every remaining statistic is min-max scaled."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stats converted to per-90 values ('minutes' itself is the denominator).\n",
    "arr = [\n",
    "    'minutes', 'goals', 'assists', 'pens_made', 'pens_att', 'xg', 'npxg', 'xa',\n",
    "    'shots_total', 'shots_on_target', 'shots_free_kicks', 'xg_net', 'npxg_net',\n",
    "    'passes_completed', 'passes', 'passes_total_distance', 'passes_progressive_distance',\n",
    "    'passes_completed_short', 'passes_short', 'passes_completed_medium', 'passes_medium',\n",
    "    'passes_completed_long', 'passes_long', 'assisted_shots', 'passes_into_final_third',\n",
    "    'passes_into_penalty_area', 'crosses_into_penalty_area', 'progressive_passes',\n",
    "    'passes_live', 'passes_dead', 'passes_free_kicks', 'through_balls', 'passes_pressure',\n",
    "    'passes_switches', 'crosses', 'corner_kicks', 'corner_kicks_in', 'corner_kicks_out',\n",
    "    'corner_kicks_straight', 'passes_ground', 'passes_low', 'passes_high',\n",
    "    'passes_left_foot', 'passes_right_foot', 'passes_head', 'throw_ins',\n",
    "    'passes_other_body', 'passes_offsides', 'passes_oob', 'passes_intercepted',\n",
    "    'passes_blocked', 'sca', 'sca_passes_live', 'sca_passes_dead', 'sca_dribbles',\n",
    "    'sca_shots', 'sca_fouled', 'gca', 'gca_passes_live', 'gca_passes_dead',\n",
    "    'gca_dribbles', 'gca_shots', 'gca_fouled', 'gca_og_for', 'tackles', 'tackles_won',\n",
    "    'tackles_def_3rd', 'tackles_mid_3rd', 'tackles_att_3rd', 'dribble_tackles',\n",
    "    'dribbles_vs', 'dribbled_past', 'pressures', 'pressure_regains', 'pressures_def_3rd',\n",
    "    'pressures_mid_3rd', 'pressures_att_3rd', 'blocks', 'blocked_shots',\n",
    "    'blocked_shots_saves', 'blocked_passes', 'interceptions', 'clearances', 'errors',\n",
    "    'touches', 'touches_def_pen_area', 'touches_def_3rd', 'touches_mid_3rd',\n",
    "    'touches_att_3rd', 'touches_att_pen_area', 'touches_live_ball', 'dribbles_completed',\n",
    "    'dribbles', 'players_dribbled_past', 'nutmegs', 'carries', 'carry_distance',\n",
    "    'carry_progressive_distance', 'pass_targets', 'miscontrols', 'dispossessed',\n",
    "]\n",
    "\n",
    "# Player columns that take no part in the model.\n",
    "UNUSED_COLUMNS = [\n",
    "    'goals_per90', 'cards_yellow', 'cards_red', 'assists_per90', 'goals_assists_per90',\n",
    "    'goals_pens_per90', 'goals_assists_pens_per90', 'xg_per90', 'xa_per90', 'xg_xa_per90',\n",
    "    'npxg_per90', 'npxg_xa_per90', 'minutes_90s', 'shots_total_per90',\n",
    "    'shots_on_target_per90', 'xa_net', 'sca_per90', 'gca_per90', 'passes_received',\n",
    "    'cards_yellow_red', 'fouls', 'fouled', 'offsides', 'pens_won', 'pens_conceded',\n",
    "    'own_goals', 'ball_recoveries', 'aerials_won', 'aerials_lost', 'aerials_won_pct',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Always rebuilt from players_raw, so re-running this cell never normalises twice.\n",
    "eligible = (players_raw['minutes'] > MIN_MINUTES) & (players_raw['position'] != 'GK')\n",
    "df_players = (\n",
    "    players_raw[eligible]\n",
    "    .drop_duplicates(subset=['player'], keep='last')\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "per90_columns = [name for name in arr if name != 'minutes']\n",
    "df_players[per90_columns] = df_players[per90_columns].div(df_players['minutes'], axis=0) * 90\n",
    "df_players = df_players.drop(columns=UNUSED_COLUMNS)\n",
    "\n",
    "# MinMaxScaler scales each column independently, so one call covers every stat.\n",
    "scaler = MinMaxScaler()\n",
    "stats_only = df_players.drop(columns=ID_COLUMNS)\n",
    "df_playersnew = pd.DataFrame(\n",
    "    scaler.fit_transform(stats_only),\n",
    "    columns=stats_only.columns,\n",
    "    index=stats_only.index,\n",
    ")\n",
    "df_playersnew1 = df_playersnew.copy()\n",
    "\n",
    "assert df_players['player'].is_unique\n",
    "missing_weights = sorted(set(df_playersnew.columns) - set(df.columns))\n",
    "assert not missing_weights, 'game data lacks these player stats: {}'.format(missing_weights)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Skill groups\n",
    "\n",
    "Each selector returns the columns of a stats frame that belong to one skill."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SHOOTING_COLUMNS = [\n",
    "    'goals', 'xg', 'npxg', 'shots_total', 'shots_on_target', 'shots_free_kicks',\n",
    "    'shots_on_target_pct', 'goals_per_shot', 'goals_per_shot_on_target', 'npxg_per_shot',\n",
    "    'xg_net', 'npxg_net',\n",
    "]\n",
    "\n",
    "CREATING_COLUMNS = [\n",
    "    'sca', 'sca_passes_live', 'sca_passes_dead', 'sca_dribbles', 'sca_shots', 'sca_fouled',\n",
    "    'assisted_shots', 'through_balls', 'gca', 'gca_passes_live', 'gca_passes_dead',\n",
    "    'gca_dribbles', 'gca_shots', 'gca_fouled', 'gca_og_for', 'assists', 'xa',\n",
    "]\n",
    "\n",
    "PASSING_COLUMNS = [\n",
    "    'passes_completed', 'passes', 'passes_pct', 'passes_total_distance',\n",
    "    'passes_progressive_distance', 'passes_completed_short', 'passes_short',\n",
    "    'passes_pct_short', 'passes_completed_medium', 'passes_medium', 'passes_pct_medium',\n",
    "    'passes_completed_long', 'passes_long', 'passes_pct_long', 'passes_into_final_third',\n",
    "    'passes_into_penalty_area', 'crosses_into_penalty_area', 'progressive_passes',\n",
    "    'passes_live', 'passes_dead', 'passes_free_kicks', 'passes_pressure', 'passes_switches',\n",
    "    'crosses', 'corner_kicks', 'corner_kicks_in', 'corner_kicks_out',\n",
    "    'corner_kicks_straight', 'passes_ground', 'passes_low', 'passes_high',\n",
    "    'passes_left_foot', 'passes_right_foot', 'passes_head', 'throw_ins',\n",
    "    'passes_other_body', 'passes_offsides', 'passes_oob', 'passes_intercepted',\n",
    "    'passes_blocked',\n",
    "]\n",
    "\n",
    "DEFENDING_COLUMNS = [\n",
    "    'tackles', 'tackles_won', 'tackles_def_3rd', 'tackles_mid_3rd', 'tackles_att_3rd',\n",
    "    'dribble_tackles', 'dribbles_vs', 'dribble_tackles_pct', 'dribbled_past', 'pressures',\n",
    "    'pressure_regains', 'pressure_regain_pct', 'pressures_def_3rd', 'pressures_mid_3rd',\n",
    "    'pressures_att_3rd', 'blocks', 'blocked_shots', 'blocked_shots_saves', 'blocked_passes',\n",
    "    'interceptions', 'clearances', 'errors',\n",
    "]\n",
    "\n",
    "POSSESSION_COLUMNS = [\n",
    "    'touches', 'touches_def_pen_area', 'touches_def_3rd', 'touches_mid_3rd',\n",
    "    'touches_att_3rd', 'touches_att_pen_area', 'touches_live_ball', 'dribbles_completed',\n",
    "    'dribbles', 'dribbles_completed_pct', 'players_dribbled_past', 'nutmegs', 'carries',\n",
    "    'carry_distance', 'carry_progressive_distance', 'pass_targets', 'passes_received_pct',\n",
    "    'miscontrols', 'dispossessed',\n",
    "]\n",
    "\n",
    "\n",
    "def shoot(df_playersnew1):\n",
    "    \"\"\"Return the shooting columns of the given stats frame.\"\"\"\n",
    "    return df_playersnew1[SHOOTING_COLUMNS]\n",
    "\n",
    "\n",
    "def create(df_playersnew1):\n",
    "    \"\"\"Return the chance-creation columns of the given stats frame.\"\"\"\n",
    "    return df_playersnew1[CREATING_COLUMNS]\n",
    "\n",
    "\n",
    "def passs(df_playersnew1):\n",
    "    \"\"\"Return the passing columns of the given stats frame.\"\"\"\n",
    "    return df_playersnew1[PASSING_COLUMNS]\n",
    "\n",
    "\n",
    "def deff(df_playersnew1):\n",
    "    \"\"\"Return the defensive-work columns of the given stats frame.\"\"\"\n",
    "    return df_playersnew1[DEFENDING_COLUMNS]\n",
    "\n",
    "\n",
    "def poss(df_playersnew1):\n",
    "    \"\"\"Return the possession columns of the given stats frame.\"\"\"\n",
    "    return df_playersnew1[POSSESSION_COLUMNS]\n",
    "\n",
    "\n",
    "# Skill name shown in the dropdown -> selector applied to the weighted stats.\n",
    "SKILL_SELECTORS = {\n",
    "    'Overall': lambda weighted: weighted,\n",
    "    'Passing': passs,\n",
    "    'Creating': create,\n",
    "    'Shooting': shoot,\n",
    "    'Defensive work': deff,\n",
    "    'Possession': poss,\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Similarity search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl = np.array(df_players['player'])\n",
    "te = np.append('Overall', sorted(df_players['squad'].unique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _weight_by_points_correlation(team):\n",
    "    \"\"\"Return df_playersnew with every stat scaled by its correlation with 'Points taken'.\n",
    "\n",
    "    The correlation is taken over the games whose 'for' column equals `team`,\n",
    "    or over every game when `team` is 'Overall'.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if a player stat has no numeric column of the same name in the game data.\n",
    "    \"\"\"\n",
    "    games = df if team == 'Overall' else df[df['for'] == team]\n",
    "    # Correlate numeric/bool columns only: recent pandas versions raise on text\n",
    "    # columns such as 'result' instead of silently skipping them.\n",
    "    weights = games.select_dtypes(include=['number', 'bool']).corr()['Points taken']\n",
    "    return df_playersnew.mul(weights[list(df_playersnew.columns)], axis='columns')\n",
    "\n",
    "\n",
    "def _show_similar_players(player_name, team, select_features, number_of_results):\n",
    "    \"\"\"Print the players closest to `player_name` in PCA space.\n",
    "\n",
    "    Args:\n",
    "        player_name: value of the 'player' column identifying the reference player.\n",
    "        team: squad whose games provide the stat weights, or 'Overall' for all games.\n",
    "        select_features: callable that receives the weighted stats frame and\n",
    "            returns the columns to compare players on.\n",
    "        number_of_results: how many similar players to list.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `player_name` is not among the prepared players.\n",
    "\n",
    "    The intermediate frames are published as module-level names, as before, so\n",
    "    they can be inspected after a search; nothing reads them back as input.\n",
    "    \"\"\"\n",
    "    global df_playersnew1, df_playersnew2, principalDf, finalDf\n",
    "\n",
    "    is_target = df_players['player'] == player_name\n",
    "    if not is_target.any():\n",
    "        raise ValueError('Unknown player: {!r}'.format(player_name))\n",
    "\n",
    "    df_playersnew1 = _weight_by_points_correlation(team)\n",
    "    df_playersnew2 = select_features(df_playersnew1)\n",
    "\n",
    "    # Missing values (for example an undefined correlation) count as 0. The\n",
    "    # features are deliberately not re-standardised: rescaling every column to\n",
    "    # unit variance would cancel the correlation weights applied above.\n",
    "    x = np.nan_to_num(df_playersnew2.values)\n",
    "    pca = PCA(PCA_EXPLAINED_VARIANCE)\n",
    "    principalDf = pd.DataFrame(data=pca.fit_transform(x))\n",
    "    print('Number of PCA components:', pca.n_components_)\n",
    "    print('\\n')\n",
    "\n",
    "    # Euclidean distance of every player to the reference player.\n",
    "    target = principalDf[is_target].iloc[0]\n",
    "    distance = principalDf.sub(target, axis='columns').pow(2).sum(axis=1).pow(0.5)\n",
    "\n",
    "    # With sparse stats most players can coincide with the reference player,\n",
    "    # which makes the quantile 0; fall back to the largest distance then.\n",
    "    scale = distance.quantile(MATCH_QUANTILE)\n",
    "    if scale == 0:\n",
    "        scale = distance.max()\n",
    "\n",
    "    finalDf = pd.concat([principalDf, df_players[INFO_COLUMNS]], axis=1)\n",
    "    finalDf['distance'] = distance\n",
    "    if scale > 0:\n",
    "        finalDf['% match'] = 100 - (distance / scale) * 100\n",
    "    else:\n",
    "        finalDf['% match'] = 100.0  # every player coincides with the reference player\n",
    "\n",
    "    # Exclude the reference player explicitly: slicing off the first sorted row\n",
    "    # drops the wrong player whenever someone else is also at distance 0.\n",
    "    final = (\n",
    "        finalDf[~is_target]\n",
    "        .sort_values('distance', kind='mergesort')\n",
    "        .head(number_of_results)\n",
    "        .reset_index(drop=True)\n",
    "    )\n",
    "    print(color.BOLD + 'List of similar players:' + color.END)\n",
    "    print('\\n')\n",
    "    print(final[INFO_COLUMNS + ['% match']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find(player_name, team, skill, number_of_results):\n",
    "    \"\"\"Print the players most similar to `player_name` for one skill group.\n",
    "\n",
    "    Args:\n",
    "        player_name: reference player.\n",
    "        team: squad whose games weight the stats, or 'Overall'.\n",
    "        skill: one of the keys of SKILL_SELECTORS.\n",
    "        number_of_results: how many similar players to list.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `skill` or `player_name` is unknown.\n",
    "    \"\"\"\n",
    "    if skill not in SKILL_SELECTORS:\n",
    "        raise ValueError('Unknown skill: {!r}'.format(skill))\n",
    "    _show_similar_players(player_name, team, SKILL_SELECTORS[skill], number_of_results)\n",
    "\n",
    "\n",
    "def find2(player_name, team, stats, number_of_results):\n",
    "    \"\"\"Print the players most similar to `player_name` on hand-picked stats.\n",
    "\n",
    "    Args:\n",
    "        player_name: reference player.\n",
    "        team: squad whose games weight the stats, or 'Overall'.\n",
    "        stats: column names of df_playersnew to compare on; when empty a hint\n",
    "            is printed and nothing is computed.\n",
    "        number_of_results: how many similar players to list.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `player_name` is unknown.\n",
    "    \"\"\"\n",
    "    print('Stats selected:', stats)\n",
    "    print('\\n')\n",
    "    if not stats:\n",
    "        print('Choose at least one stat to see output')\n",
    "        return\n",
    "    _show_similar_players(\n",
    "        player_name, team, lambda weighted: weighted[list(stats)], number_of_results\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run the app\n",
    "\n",
    "### Similar players by skill group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "interact(\n",
    "    find,\n",
    "    player_name=sorted(pl),\n",
    "    team=te,\n",
    "    skill=list(SKILL_SELECTORS),\n",
    "    number_of_results=range(20, 100),\n",
    ");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Similar players by hand-picked stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Hold Ctrl (Cmd on macOS) to select several stats.\n",
    "interact(\n",
    "    find2,\n",
    "    player_name=sorted(pl),\n",
    "    team=te,\n",
    "    stats=widgets.SelectMultiple(options=list(df_playersnew.columns.values), rows=10),\n",
    "    number_of_results=range(20, 100),\n",
    ");"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}