{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PCA-based model to find similar players"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#RUN ALL THE CELLS TILL TO RUN THE APP"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"by Parth Athale (@ParthAthale)\n",
"\n",
"Data credits to FBref/StatsBomb\n",
"Read methodology here: https://xgpershot.wordpress.com/2020/08/06/pca-based-model-to-identify-similar-players/\n",
"Read code here: https://github.com/parth1902/PCA_Player_Finder\n",
"\n",
"Some examples as a guide to do this:\n",
"If you want to find out a Pierre-Emile Højbjerg replacement for Southampton, choose Højbjerg, Southampton, Overall\n",
"If you want to find out a player like Leroy Sané for Barcelona, choose Sané, Barcelona, Overall\n",
"If you want to find out a player with the defensive ability of Wilfried Ndidi without any team constraint, choose Ndidi, Overall, Defensive work\n"
]
}
],
"source": [
"class color:\n",
" PURPLE = '\\033[95m'\n",
" CYAN = '\\033[96m'\n",
" DARKCYAN = '\\033[36m'\n",
" BLUE = '\\033[94m'\n",
" GREEN = '\\033[92m'\n",
" YELLOW = '\\033[93m'\n",
" RED = '\\033[91m'\n",
" BOLD = '\\033[1m'\n",
" UNDERLINE = '\\033[4m'\n",
" END = '\\033[0m'\n",
"\n",
"#print(color.BOLD + 'PCA-based model to find similar players' + color.END)\n",
"print('by Parth Athale (@ParthAthale)\\n')\n",
"print('Data credits to FBref/StatsBomb')\n",
"print('Read methodology here: https://xgpershot.wordpress.com/2020/08/06/pca-based-model-to-identify-similar-players/')\n",
"print('Read code here: https://github.com/parth1902/PCA_Player_Finder\\n')\n",
"print('Some examples as a guide to do this:')\n",
"print('If you want to find out a Pierre-Emile Højbjerg replacement for Southampton, choose Højbjerg, Southampton, Overall')\n",
"print('If you want to find out a player like Leroy Sané for Barcelona, choose Sané, Barcelona, Overall')\n",
"print('If you want to find out a player with the defensive ability of Wilfried Ndidi without any team constraint, choose Ndidi, Overall, Defensive work')\n",
"\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from __future__ import print_function\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"import pandas as pd\n",
"import numpy as np\n",
"pd.set_option('expand_frame_repr', False)\n",
"pd.set_option('display.max_columns', 10)\n",
"from IPython.display import HTML, display\n",
"\n",
"display(HTML(''''''))\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"url = 'https://raw.githubusercontent.com/parth1902/test/master/games.csv'\n",
"df = pd.read_csv(url,sep = ',')\n",
"\n",
"url = 'https://raw.githubusercontent.com/parth1902/test/master/players.csv'\n",
"df_players = pd.read_csv(url,sep = ',')\n",
"\n",
"df_players = df_players[df_players['minutes'] > 500]\n",
"df_players = df_players[df_players['position'] != 'GK']\n",
"df_players = df_players.drop_duplicates(subset=['player'], keep='last')\n",
"\n",
"def f(df):\n",
" if df['result'] == 'W':\n",
" val = 3\n",
" elif df['result'] == 'D':\n",
" val = 1\n",
" else:\n",
" val = 0\n",
" return val\n",
"df['Points taken'] = df.apply(f, axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"arr = ['minutes',\n",
" 'goals',\n",
" 'assists',\n",
" 'pens_made',\n",
" 'pens_att',\n",
" 'xg',\n",
" 'npxg',\n",
" 'xa',\n",
" 'shots_total',\n",
" 'shots_on_target',\n",
" 'shots_free_kicks',\n",
" 'xg_net',\n",
" 'npxg_net',\n",
" 'passes_completed',\n",
" 'passes',\n",
" 'passes_total_distance',\n",
" 'passes_progressive_distance',\n",
" 'passes_completed_short',\n",
" 'passes_short',\n",
" 'passes_completed_medium',\n",
" 'passes_medium',\n",
" 'passes_completed_long',\n",
" 'passes_long',\n",
" 'assisted_shots',\n",
" 'passes_into_final_third',\n",
" 'passes_into_penalty_area',\n",
" 'crosses_into_penalty_area',\n",
" 'progressive_passes',\n",
" 'passes_live',\n",
" 'passes_dead',\n",
" 'passes_free_kicks',\n",
" 'through_balls',\n",
" 'passes_pressure',\n",
" 'passes_switches',\n",
" 'crosses',\n",
" 'corner_kicks',\n",
" 'corner_kicks_in',\n",
" 'corner_kicks_out',\n",
" 'corner_kicks_straight',\n",
" 'passes_ground',\n",
" 'passes_low',\n",
" 'passes_high',\n",
" 'passes_left_foot',\n",
" 'passes_right_foot',\n",
" 'passes_head',\n",
" 'throw_ins',\n",
" 'passes_other_body',\n",
" 'passes_offsides',\n",
" 'passes_oob',\n",
" 'passes_intercepted',\n",
" 'passes_blocked',\n",
" 'sca',\n",
" 'sca_passes_live',\n",
" 'sca_passes_dead',\n",
" 'sca_dribbles',\n",
" 'sca_shots',\n",
" 'sca_fouled',\n",
" 'gca',\n",
" 'gca_passes_live',\n",
" 'gca_passes_dead',\n",
" 'gca_dribbles',\n",
" 'gca_shots',\n",
" 'gca_fouled',\n",
" 'gca_og_for',\n",
" 'tackles',\n",
" 'tackles_won',\n",
" 'tackles_def_3rd',\n",
" 'tackles_mid_3rd',\n",
" 'tackles_att_3rd',\n",
" 'dribble_tackles',\n",
" 'dribbles_vs',\n",
" 'dribbled_past',\n",
" 'pressures',\n",
" 'pressure_regains',\n",
" 'pressures_def_3rd',\n",
" 'pressures_mid_3rd',\n",
" 'pressures_att_3rd',\n",
" 'blocks',\n",
" 'blocked_shots',\n",
" 'blocked_shots_saves',\n",
" 'blocked_passes',\n",
" 'interceptions',\n",
" 'clearances',\n",
" 'errors',\n",
" 'touches',\n",
" 'touches_def_pen_area',\n",
" 'touches_def_3rd',\n",
" 'touches_mid_3rd',\n",
" 'touches_att_3rd',\n",
" 'touches_att_pen_area',\n",
" 'touches_live_ball',\n",
" 'dribbles_completed',\n",
" 'dribbles',\n",
" 'players_dribbled_past',\n",
" 'nutmegs',\n",
" 'carries',\n",
" 'carry_distance',\n",
" 'carry_progressive_distance',\n",
" 'pass_targets',\n",
" 'miscontrols',\n",
" 'dispossessed']\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"scaler = MinMaxScaler()\n",
"for i in range (0,len(arr)):\n",
" if arr[i]!= 'minutes':\n",
" df_players[arr[i]] = (df_players[arr[i]]/df_players['minutes'])*90\n",
"df_players = df_players.drop(['goals_per90',\n",
"'cards_yellow',\n",
"'cards_red',\n",
"'assists_per90',\n",
"'goals_assists_per90',\n",
"'goals_pens_per90',\n",
"'goals_assists_pens_per90',\n",
"'xg_per90',\n",
"'xa_per90',\n",
"'xg_xa_per90',\n",
"'npxg_per90',\n",
"'npxg_xa_per90',\n",
"'minutes_90s',\n",
"'shots_total_per90',\n",
"'shots_on_target_per90',\n",
"'xa_net',\n",
"'sca_per90',\n",
"'gca_per90',\n",
"'passes_received',\n",
"'cards_yellow_red',\n",
"'fouls',\n",
"'fouled',\n",
"'offsides',\n",
"'pens_won',\n",
"'pens_conceded',\n",
"'own_goals',\n",
"'ball_recoveries',\n",
"'aerials_won',\n",
"'aerials_lost',\n",
"'aerials_won_pct'], axis = 1)\n",
"df_playersnew = df_players.drop(['player',\n",
" 'nationality',\n",
" 'position',\n",
" 'squad',\n",
" 'age',\n",
" 'birth_year',\n",
" 'games',\n",
" 'games_starts','minutes'], axis = 1)\n",
"arr2 = list(df_playersnew.columns.values)\n",
"for i in range(0,len(arr2)):\n",
" df_playersnew[arr2[i]] = scaler.fit_transform(df_playersnew[[arr2[i]]])\n",
" \n",
"df.rename(columns={'xg_for':'xg'}, inplace=True) \n",
"df_playersnew1 = df_playersnew.copy()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def shoot(df_playersnew1):\n",
" return df_playersnew1[['goals',\n",
" 'xg',\n",
" 'npxg',\n",
" 'shots_total',\n",
" 'shots_on_target',\n",
" 'shots_free_kicks',\n",
" 'shots_on_target_pct',\n",
" 'goals_per_shot',\n",
" 'goals_per_shot_on_target',\n",
" 'npxg_per_shot',\n",
" 'xg_net',\n",
" 'npxg_net']]\n",
"\n",
"def create(df_playersnew1):\n",
" return df_playersnew1[['sca',\n",
" 'sca_passes_live',\n",
" 'sca_passes_dead',\n",
" 'sca_dribbles',\n",
" 'sca_shots',\n",
" 'sca_fouled',\n",
" 'assisted_shots',\n",
" 'through_balls',\n",
" 'gca',\n",
" 'gca_passes_live',\n",
" 'gca_passes_dead',\n",
" 'gca_dribbles',\n",
" 'gca_shots',\n",
" 'gca_fouled',\n",
" 'gca_og_for','assists','xa']]\n",
"\n",
"def passs(df_playersnew1):\n",
" return df_playersnew1[['passes_completed',\n",
" 'passes',\n",
" 'passes_pct',\n",
" 'passes_total_distance',\n",
" 'passes_progressive_distance',\n",
" 'passes_completed_short',\n",
" 'passes_short',\n",
" 'passes_pct_short',\n",
" 'passes_completed_medium',\n",
" 'passes_medium',\n",
" 'passes_pct_medium',\n",
" 'passes_completed_long',\n",
" 'passes_long',\n",
" 'passes_pct_long',\n",
" 'passes_into_final_third',\n",
" 'passes_into_penalty_area',\n",
" 'crosses_into_penalty_area',\n",
" 'progressive_passes',\n",
" 'passes_live',\n",
" 'passes_dead',\n",
" 'passes_free_kicks',\n",
" 'passes_pressure',\n",
" 'passes_switches',\n",
" 'crosses',\n",
" 'corner_kicks',\n",
" 'corner_kicks_in',\n",
" 'corner_kicks_out',\n",
" 'corner_kicks_straight',\n",
" 'passes_ground',\n",
" 'passes_low',\n",
" 'passes_high',\n",
" 'passes_left_foot',\n",
" 'passes_right_foot',\n",
" 'passes_head',\n",
" 'throw_ins',\n",
" 'passes_other_body',\n",
" 'passes_offsides',\n",
" 'passes_oob',\n",
" 'passes_intercepted',\n",
" 'passes_blocked']]\n",
"\n",
"def deff(df_playersnew1):\n",
" return df_playersnew1[['tackles',\n",
" 'tackles_won',\n",
" 'tackles_def_3rd',\n",
" 'tackles_mid_3rd',\n",
" 'tackles_att_3rd',\n",
" 'dribble_tackles',\n",
" 'dribbles_vs',\n",
" 'dribble_tackles_pct',\n",
" 'dribbled_past',\n",
" 'pressures',\n",
" 'pressure_regains',\n",
" 'pressure_regain_pct',\n",
" 'pressures_def_3rd',\n",
" 'pressures_mid_3rd',\n",
" 'pressures_att_3rd',\n",
" 'blocks',\n",
" 'blocked_shots',\n",
" 'blocked_shots_saves',\n",
" 'blocked_passes',\n",
" 'interceptions',\n",
" 'clearances',\n",
" 'errors']]\n",
"\n",
"def poss(df_playersnew1):\n",
" return df_playersnew1[['touches',\n",
" 'touches_def_pen_area',\n",
" 'touches_def_3rd',\n",
" 'touches_mid_3rd',\n",
" 'touches_att_3rd',\n",
" 'touches_att_pen_area',\n",
" 'touches_live_ball',\n",
" 'dribbles_completed',\n",
" 'dribbles',\n",
" 'dribbles_completed_pct',\n",
" 'players_dribbled_past',\n",
" 'nutmegs',\n",
" 'carries',\n",
" 'carry_distance',\n",
" 'carry_progressive_distance',\n",
" 'pass_targets',\n",
" 'passes_received_pct',\n",
" 'miscontrols',\n",
" 'dispossessed']]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"pl = np.array(df_players['player'])\n",
"te = np.array(df_players['squad'].unique())\n",
"te = np.append('Overall',sorted(te))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def find(player_name,team,skill,number_of_results):\n",
" \n",
" global df\n",
" global df_players\n",
" global finalDf\n",
" global df_playersnew\n",
" global df_playersnew2\n",
" global df_playersnew1\n",
" global principalDf\n",
" df_playersnew1 = df_playersnew.copy()\n",
" df = df.loc[:,~df.T.duplicated(keep='first')]\n",
" df = df.loc[:, ~df.columns.duplicated()]\n",
" if team != \"Overall\":\n",
" df_new = df[df['for'] == team]\n",
" corrMatrix = df_new.corr()\n",
" else:\n",
" corrMatrix = df.corr()\n",
" arr2 = list(df_playersnew1.columns.values)\n",
" for i in range(0,len(arr2)):\n",
" df_playersnew1[arr2[i]] = (df_playersnew1[arr2[i]]) * (corrMatrix['Points taken'][arr2[i]])\n",
" if skill == 'Overall':\n",
" df_playersnew2 = df_playersnew1\n",
" elif skill == 'Possession':\n",
" df_playersnew2 = poss(df_playersnew1)\n",
" elif skill == 'Shooting':\n",
" df_playersnew2 = shoot(df_playersnew1)\n",
" elif skill == 'Passing':\n",
" df_playersnew2 = passs(df_playersnew1)\n",
" elif skill == 'Creating':\n",
" df_playersnew2 = create(df_playersnew1) \n",
" elif skill == 'Defensive work':\n",
" df_playersnew2 = deff(df_playersnew1)\n",
" \n",
" \n",
" features = list(df_playersnew2.columns.values)\n",
" #df_playersnew2 = df_playersnew2.fillna(0)\n",
" from sklearn.preprocessing import StandardScaler\n",
" # Separating out the features\n",
" x = df_playersnew2.loc[:, features].values\n",
" # Separating out the target\n",
" y = df_players.loc[:,['player']].values\n",
" # Standardizing the features\n",
" #x = StandardScaler().fit_transform(x)\n",
" x = np.nan_to_num(x)\n",
" from sklearn.decomposition import PCA\n",
" #pca = PCA(n_components=2)\n",
" pca = PCA(.90)\n",
" principalComponents = pca.fit_transform(x)\n",
" print('Number of PCA components:',pca.n_components_)\n",
" print('\\n')\n",
" principalDf = pd.DataFrame(data = principalComponents)\n",
" global finalDf\n",
" df_players = df_players.reset_index(drop=True)\n",
" finalDf = pd.concat([principalDf, df_players[['player']]], axis = 1)\n",
" finalDf = pd.concat([finalDf, df_players[['squad']]], axis = 1)\n",
" finalDf = pd.concat([finalDf, df_players[['position']]], axis = 1)\n",
" finalDf = pd.concat([finalDf, df_players[['age']]], axis = 1)\n",
" \n",
" player = player_name\n",
" class color:\n",
" PURPLE = '\\033[95m'\n",
" CYAN = '\\033[96m'\n",
" DARKCYAN = '\\033[36m'\n",
" BLUE = '\\033[94m'\n",
" GREEN = '\\033[92m'\n",
" YELLOW = '\\033[93m'\n",
" RED = '\\033[91m'\n",
" BOLD = '\\033[1m'\n",
" UNDERLINE = '\\033[4m'\n",
" END = '\\033[0m'\n",
"\n",
"#print(color.BOLD + 'Hello World !' + color.END)\n",
" a = (finalDf[finalDf['player'] == player])[(finalDf[finalDf['player'] == player]).columns.drop(['player','squad','position','age'])]\n",
" b = finalDf[finalDf.columns.drop(['player','squad','position','age'])]\n",
" finalDf['distance'] = (finalDf[list(b.columns.values)] - np.array(a)).pow(2).sum(1).pow(0.5)\n",
" dist = finalDf['distance'].max()\n",
" dist2 = finalDf['distance'].quantile(0.95)\n",
" finalDf['% match'] = 100-(finalDf['distance']/dist2)*100\n",
" final = ((finalDf.sort_values(['distance'], ascending=[True])))[1:number_of_results+1]\n",
" final = final.reset_index(drop=True)\n",
" print(color.BOLD + 'List of similar players:' + color.END)\n",
" print('\\n')\n",
" print(final[['player','squad','position','age','% match']])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def find2(player_name,team,stats,number_of_results):\n",
" \n",
" print('Stats selected:',stats)\n",
" print('\\n')\n",
" global df\n",
" global df_players\n",
" global finalDf\n",
" global df_playersnew\n",
" global df_playersnew2\n",
" global df_playersnew1\n",
" global principalDf\n",
" df_playersnew1 = df_playersnew.copy()\n",
" df = df.loc[:,~df.T.duplicated(keep='first')]\n",
" df = df.loc[:, ~df.columns.duplicated()]\n",
" if team != \"Overall\":\n",
" df_new = df[df['for'] == team]\n",
" corrMatrix = df_new.corr()\n",
" else:\n",
" corrMatrix = df.corr()\n",
" arr2 = list(df_playersnew1.columns.values)\n",
" for i in range(0,len(arr2)):\n",
" df_playersnew1[arr2[i]] = (df_playersnew1[arr2[i]]) * (corrMatrix['Points taken'][arr2[i]])\n",
" if not stats:\n",
" print(\"Choose at least one stat to see output\")\n",
" return\n",
" else:\n",
" df_playersnew2 = df_playersnew1[np.array(stats)]\n",
" \n",
" \n",
" features = list(df_playersnew2.columns.values)\n",
" #df_playersnew2 = df_playersnew2.fillna(0)\n",
" from sklearn.preprocessing import StandardScaler\n",
" # Separating out the features\n",
" x = df_playersnew2.loc[:, features].values\n",
" # Separating out the target\n",
" y = df_players.loc[:,['player']].values\n",
" # Standardizing the features\n",
" #x = StandardScaler().fit_transform(x)\n",
" x = np.nan_to_num(x)\n",
" from sklearn.decomposition import PCA\n",
" #pca = PCA(n_components=2)\n",
" pca = PCA(.90)\n",
" principalComponents = pca.fit_transform(x)\n",
" print('Number of PCA components:',pca.n_components_)\n",
" print('\\n')\n",
" principalDf = pd.DataFrame(data = principalComponents)\n",
" \n",
" global finalDf\n",
" df_players = df_players.reset_index(drop=True)\n",
" finalDf = pd.concat([principalDf, df_players[['player']]], axis = 1)\n",
" finalDf = pd.concat([finalDf, df_players[['squad']]], axis = 1)\n",
" finalDf = pd.concat([finalDf, df_players[['position']]], axis = 1)\n",
" finalDf = pd.concat([finalDf, df_players[['age']]], axis = 1)\n",
" \n",
" player = player_name\n",
" class color:\n",
" PURPLE = '\\033[95m'\n",
" CYAN = '\\033[96m'\n",
" DARKCYAN = '\\033[36m'\n",
" BLUE = '\\033[94m'\n",
" GREEN = '\\033[92m'\n",
" YELLOW = '\\033[93m'\n",
" RED = '\\033[91m'\n",
" BOLD = '\\033[1m'\n",
" UNDERLINE = '\\033[4m'\n",
" END = '\\033[0m'\n",
"\n",
"#print(color.BOLD + 'Hello World !' + color.END)\n",
" a = (finalDf[finalDf['player'] == player])[(finalDf[finalDf['player'] == player]).columns.drop(['player','squad','position','age'])]\n",
" b = finalDf[finalDf.columns.drop(['player','squad','position','age'])]\n",
" finalDf['distance'] = (finalDf[list(b.columns.values)] - np.array(a)).pow(2).sum(1).pow(0.5)\n",
" dist = finalDf['distance'].max()\n",
" dist2 = finalDf['distance'].quantile(0.95)\n",
" finalDf['% match'] = 100-(finalDf['distance']/dist2)*100\n",
" final = ((finalDf.sort_values(['distance'], ascending=[True])))[1:number_of_results+1]\n",
" final = final.reset_index(drop=True)\n",
" print(color.BOLD + 'List of similar players:' + color.END)\n",
" print('\\n')\n",
" print(final[['player','squad','position','age','% match']])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f057349e02c8438491c7eb0631a155a6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"interactive(children=(Dropdown(description='player_name', options=('Aaron Connolly', 'Aaron Cresswell', 'Aaron…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"interact(find, player_name = sorted(pl),team = te,skill = ['Overall','Passing','Creating','Shooting','Defensive work','Possession'],number_of_results = (range(100))[20:100]);\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a85ac1284eb048e1babbc366c4a594f7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"interactive(children=(Dropdown(description='player_name', options=('Aaron Connolly', 'Aaron Cresswell', 'Aaron…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"interact(find2, player_name = sorted(pl),team = te,stats = widgets.SelectMultiple(options = list(df_playersnew.columns.values),rows=10),number_of_results = (range(100))[20:100]);\n",
"#select multiple stats with ctrl/command button\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}