Software | Version |
---|
Python | 3.6.8 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] |
IPython | 7.2.0 |
OS | Darwin 18.7.0 x86_64 i386 64bit |
pandas | 0.23.4 |
Thu Jan 14 23:50:49 2021 EST |
"
],
"text/latex": [
"\\begin{tabular}{|l|l|}\\hline\n",
"{\\bf Software} & {\\bf Version} \\\\ \\hline\\hline\n",
"Python & 3.6.8 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE\\_401/final)] \\\\ \\hline\n",
"IPython & 7.2.0 \\\\ \\hline\n",
"OS & Darwin 18.7.0 x86\\_64 i386 64bit \\\\ \\hline\n",
"pandas & 0.23.4 \\\\ \\hline\n",
"\\hline \\multicolumn{2}{|l|}{Thu Jan 14 23:50:49 2021 EST} \\\\ \\hline\n",
"\\end{tabular}\n"
],
"text/plain": [
"Software versions\n",
"Python 3.6.8 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n",
"IPython 7.2.0\n",
"OS Darwin 18.7.0 x86_64 i386 64bit\n",
"pandas 0.23.4\n",
"Thu Jan 14 23:50:49 2021 EST"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%load_ext version_information\n",
"%version_information pandas"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Alex Galea \n",
"\n",
"CPython 3.6.8\n",
"IPython 7.2.0\n",
"\n",
"matplotlib 3.0.2\n",
"seaborn 0.9.0\n",
"pandas 0.23.4\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Alex Galea\" -v -p matplotlib,seaborn,pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Goalie Pull Bayes Optimize\n",
"\n",
" - Exploratory analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Explore Parsed Goalie Pull Data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import re\n",
"import datetime\n",
"import time\n",
"import glob\n",
"from tqdm import tqdm_notebook\n",
"from colorama import Fore, Style"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"%config InlineBackend.figure_format='retina'\n",
"sns.set()  # Start from seaborn's default theme (not matplotlib's defaults)\n",
"plt.rcParams['figure.figsize'] = (12, 8)\n",
"plt.rcParams['legend.fancybox'] = True\n",
"plt.style.use('fivethirtyeight')\n",
"# 'Marion' may not be installed on every system; matplotlib then emits a\n",
"# 'findfont: Font family ... not found' UserWarning.\n",
"plt.rcParams['font.family'] = 'Marion'\n",
"\n",
"# Font sizes shared by every figure in the notebook.\n",
"SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 14, 16, 20\n",
"plt.rc('font', size=SMALL_SIZE)\n",
"plt.rc('axes', titlesize=BIGGER_SIZE, labelsize=MEDIUM_SIZE)\n",
"plt.rc('xtick', labelsize=SMALL_SIZE)\n",
"plt.rc('ytick', labelsize=SMALL_SIZE)\n",
"plt.rc('legend', fontsize=MEDIUM_SIZE)\n",
"\n",
"# Grid, background colours and label/tick padding.\n",
"plt.rcParams['grid.alpha'] = 0.2\n",
"plt.rcParams['axes.labelpad'] = 20\n",
"plt.rcParams['axes.facecolor'] = 'white'\n",
"plt.rcParams['figure.facecolor'] = 'white'\n",
"plt.rcParams['savefig.facecolor'] = 'white'\n",
"plt.rcParams['xtick.major.pad'] = 15\n",
"plt.rcParams['xtick.minor.pad'] = 15\n",
"plt.rcParams['ytick.major.pad'] = 10\n",
"plt.rcParams['ytick.minor.pad'] = 10\n",
"\n",
"def savefig(name):\n",
"    \"\"\"Save the current pyplot figure as ``../../figures/<name>.png``.\n",
"\n",
"    The image is written at 300 dpi with a tight bounding box.\n",
"\n",
"    Args:\n",
"        name: File name without the ``.png`` extension.\n",
"    \"\"\"\n",
"    plt.savefig(f'../../figures/{name}.png', bbox_inches='tight', dpi=300)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# To debug UserWarning: findfont: Font family [‘Marion’] not found.\n",
"\n",
"# import matplotlib.font_manager\n",
"# matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')\n",
"# matplotlib.font_manager._rebuild()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m\u001b[36mcsv\u001b[m\u001b[m/ \u001b[1m\u001b[36mpkl\u001b[m\u001b[m/\r\n"
]
}
],
"source": [
"ls ../../data/processed/"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20032004_goalie_pulls_2019-03-01.pkl 20122013_goalie_pulls_2019-04-25.pkl\r\n",
"20052006_goalie_pulls_2019-03-01.pkl 20132014_goalie_pulls_2019-04-25.pkl\r\n",
"20062007_goalie_pulls_2019-03-01.pkl 20142015_goalie_pulls_2019-04-25.pkl\r\n",
"20072008_goalie_pulls_2019-04-25.pkl 20152016_goalie_pulls_2019-04-25.pkl\r\n",
"20082009_goalie_pulls_2019-04-25.pkl 20162017_goalie_pulls_2019-04-25.pkl\r\n",
"20092010_goalie_pulls_2019-04-25.pkl 20172018_goalie_pulls_2019-04-25.pkl\r\n",
"20102011_goalie_pulls_2019-04-25.pkl 20182019_goalie_pulls_2019-04-25.pkl\r\n",
"20112012_goalie_pulls_2019-04-25.pkl \u001b[1m\u001b[36mtmp\u001b[m\u001b[m/\r\n"
]
}
],
"source": [
"ls ../../data/processed/pkl"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['../../data/processed/pkl/20032004_goalie_pulls_2019-03-01.pkl',\n",
" '../../data/processed/pkl/20052006_goalie_pulls_2019-03-01.pkl',\n",
" '../../data/processed/pkl/20062007_goalie_pulls_2019-03-01.pkl',\n",
" '../../data/processed/pkl/20072008_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/20082009_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/20092010_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/20102011_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/20112012_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/20122013_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/20132014_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/20142015_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/20152016_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/20162017_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/20172018_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/20182019_goalie_pulls_2019-04-25.pkl',\n",
" '../../data/processed/pkl/tmp']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List every entry of the processed pickle directory in sorted order.\n",
"# The '*' pattern also matches the tmp/ sub-directory, as the output shows.\n",
"files = glob.glob('../../data/processed/pkl/*')\n",
"files.sort()\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def load_data(files=None):\n",
"    \"\"\"Load goalie-pull pickles and concatenate them into one DataFrame.\n",
"\n",
"    Args:\n",
"        files: Optional iterable of pickle paths to read. When omitted, the\n",
"            pinned list of season files below is used, so the result does\n",
"            not change if new files appear in the data directory.\n",
"\n",
"    Returns:\n",
"        The row-wise concatenation of all loaded frames. Index labels are\n",
"        kept exactly as stored in each file (``ignore_index`` is not set).\n",
"    \"\"\"\n",
"    if files is None:\n",
"        files = [\n",
"            '../../data/processed/pkl/20032004_goalie_pulls_2019-03-01.pkl',\n",
"            '../../data/processed/pkl/20052006_goalie_pulls_2019-03-01.pkl',\n",
"            '../../data/processed/pkl/20062007_goalie_pulls_2019-03-01.pkl',\n",
"            '../../data/processed/pkl/20072008_goalie_pulls_2019-04-25.pkl',\n",
"            '../../data/processed/pkl/20082009_goalie_pulls_2019-04-25.pkl',\n",
"            '../../data/processed/pkl/20092010_goalie_pulls_2019-04-25.pkl',\n",
"            '../../data/processed/pkl/20102011_goalie_pulls_2019-04-25.pkl',\n",
"            '../../data/processed/pkl/20112012_goalie_pulls_2019-04-25.pkl',\n",
"            '../../data/processed/pkl/20122013_goalie_pulls_2019-04-25.pkl',\n",
"            '../../data/processed/pkl/20132014_goalie_pulls_2019-04-25.pkl',\n",
"            '../../data/processed/pkl/20142015_goalie_pulls_2019-04-25.pkl',\n",
"            '../../data/processed/pkl/20152016_goalie_pulls_2019-04-25.pkl',\n",
"            '../../data/processed/pkl/20162017_goalie_pulls_2019-04-25.pkl',\n",
"            '../../data/processed/pkl/20172018_goalie_pulls_2019-04-25.pkl',\n",
"            '../../data/processed/pkl/20182019_goalie_pulls_2019-04-25.pkl',\n",
"        ]\n",
"    # Unpickling can execute arbitrary code: only read trusted files here.\n",
"    return pd.concat([pd.read_pickle(f) for f in files], sort=False)\n",
"\n",
"\n",
"def _drop_rows_before(df, column, min_minutes):\n",
"    \"\"\"Return `df` without the rows whose `column` is below `min_minutes`.\n",
"\n",
"    Prints the filter being applied and the number of rows it removed.\n",
"    \"\"\"\n",
"    cutoff = datetime.timedelta(minutes=min_minutes)\n",
"    print(f'Removing {column} < {min_minutes} mins')\n",
"    # Negate the `<` test instead of using `>=`: a missing value compares\n",
"    # False either way, so negating keeps those rows in the result.\n",
"    kept = df[~(df[column] < cutoff)]\n",
"    print(f'Removed {df.shape[0] - kept.shape[0]} total rows')\n",
"    return kept\n",
"\n",
"\n",
"def clean_df(df, min_minutes=15):\n",
"    \"\"\"Drop rows whose recorded times fall below a minimum threshold.\n",
"\n",
"    Rows are removed when `goal_for_time` is below `min_minutes`, and, if\n",
"    the frame has a `game_end_time` column, when that value is below\n",
"    `min_minutes` as well. Rows with a missing value in the filtered\n",
"    column are kept. The input frame is not modified.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a `goal_for_time` column that is comparable to\n",
"            `datetime.timedelta`, and optionally a `game_end_time` column.\n",
"        min_minutes: Threshold in minutes (default 15).\n",
"\n",
"    Returns:\n",
"        A filtered copy of `df`.\n",
"\n",
"    Raises:\n",
"        KeyError: If `df` has no `goal_for_time` column.\n",
"    \"\"\"\n",
"    _df = df.copy()\n",
"    _df = _drop_rows_before(_df, 'goal_for_time', min_minutes)\n",
"    if 'game_end_time' in _df.columns:\n",
"        _df = _drop_rows_before(_df, 'game_end_time', min_minutes)\n",
"    return _df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Removing goal_for_time < 15 mins\n",
"Removed 115 total rows\n",
"Removing game_end_time < 15 mins\n",
"Removed 0 total rows\n"
]
}
],
"source": [
"# Load the season pickles and apply the row filters in one step.\n",
"df = clean_df(load_data())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Label the outcomes: start from an empty-string placeholder per row.\n",
"df['label'] = ''\n",
"label_masks = {\n",
"    'goal_for': df['goal_for_time'].notnull(),\n",
"    'goal_against': df['goal_against_time'].notnull(),\n",
"    'no_goals': df['game_end_timedelta'].notnull(),\n",
"}\n",
"# Masks are applied in dict iteration order, so a later mask overwrites\n",
"# the label written by an earlier one on any row they share.\n",
"for label, mask in label_masks.items():\n",
"    df.loc[mask, 'label'] = label\n",
"# Rows matched by no mask still hold the placeholder; mark them as NaN.\n",
"df.loc[df['label'] == '', 'label'] = np.nan\n",
"# Count the rows left without a label.\n",
"df['label'].isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"