{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# default_exp cli" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Understat-DB CLI\n", "\n", "> CLI for creating a soccer database with data from understat.com" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#hide\n", "from nbdev.showdoc import *" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Initialise the app with Typer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#exporti\n", "import dataclasses\n", "import datetime as dt\n", "import functools\n", "import itertools\n", "import os\n", "import pathlib\n", "import typing\n", "import time\n", "\n", "import dbt.main\n", "import dotenv\n", "import playhouse.postgres_ext\n", "import pyprojroot\n", "import typer\n", "\n", "import wingback" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "\n", "\n", "class EnvTyper(typer.Typer):\n", " \"\"\"\n", " Just like typer.Typer, except it loads the environment with\n", " `dotenv.load_dotenv` before executing any command.\n", " \"\"\"\n", " def __call__(self, *args, **kwargs):\n", " dotenv.load_dotenv()\n", " return super().__call__(*args, **kwargs)\n", "\n", "\n", "\n", "app = EnvTyper()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#exporti\n", "\n", "\n", "def initialize_db():\n", " \"\"\" \n", " Load database config from environment and initialise\n", " `understatdb.db.DB` with a database connection. \n", " \"\"\"\n", " \n", " # Load database config from environment\n", " postgres_db = playhouse.postgres_ext.PostgresqlExtDatabase(\n", " host=os.environ['DB_HOST'],\n", " user=os.environ['DB_USER'],\n", " password=os.environ['DB_PASS'],\n", " database=os.environ['DB_NAME'],\n", " port=os.environ['DB_PORT'],\n", " )\n", "\n", " # Configure proxy database to use configured postgres\n", " typer.secho('Initialising database connection...', fg=typer.colors.BRIGHT_BLACK)\n", " wingback.db.DB.initialize(postgres_db)\n", " \n", " # Connect pugsql to database\n", " wingback.db.queries.connect('postgresql://{user}:{password}@{host}:{port}/{database}'.format(\n", " host=os.environ['DB_HOST'],\n", " user=os.environ['DB_USER'],\n", " password=os.environ['DB_PASS'],\n", " database=os.environ['DB_NAME'],\n", " port=os.environ['DB_PORT'],\n", " ))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Migrate\n", "\n", "Migrating the database" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "\n", "\n", "@app.command()\n", "def migrate(interactive: bool = True):\n", " \"\"\" Migrate database to the current schema (as defined in nbs/db.ipynb) \"\"\"\n", " \n", " initialize_db()\n", " \n", " # Get names of tables generated by dbt and exclude them from the migration\n", " dbt_models_path = pyprojroot.here()/'dbt'/'models'\n", " dbt_tables = [f.stem for f in dbt_models_path.glob('**/*.sql')]\n", " \n", " # Migrate database tables\n", " typer.secho('Migrating database tables...', fg=typer.colors.BRIGHT_BLACK)\n", " wingback.db.DB.evolve(\n", " ignore_tables=wingback.db.EVOLVE_IGNORE_TABLES + dbt_tables,\n", " interactive=interactive\n", " )\n", " typer.secho('Done!', fg=typer.colors.GREEN, bold=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## dbt runner\n", "\n", "(Re-)build tables from base data using dbt (data-build-tool)'s (unsupported) python API" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "\n", "\n", "@app.command()\n", "def build_tables(args: typing.List[str] = typer.Option([], help='Additional arguments passed to `dbt run`')):\n", " \"\"\" Build tables from base data using dbt \"\"\"\n", " \n", " project_dir = pyprojroot.here()/'dbt'\n", " profiles_dir = pyprojroot.here()/'.dbt'\n", " \n", " base_args = [\n", " 'run', \n", " '--profiles-dir', \n", " str(profiles_dir), \n", " '--project-dir', \n", " str(project_dir)\n", " ]\n", " \n", " # NOTE: Python API is not officially supported, so\n", " # watch out if you change dbt versions...\n", " typer.secho('Building tables with dbt', fg=typer.colors.BLUE)\n", " _ = dbt.main.handle_and_check(base_args + list(args))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Ingest\n", "\n", "Ingest base understat data and build understat tables from base data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "\n", "\n", "_DEFAULT_INGEST_LEAGUES = [l.value for l in wingback.understat.League]\n", "_DEFAULT_INGEST_SEASONS = list(range(2014, 2021))\n", "\n", "\n", "@app.command()\n", "def ingest(\n", " refresh: bool = False,\n", " leagues: typing.List[str] = typer.Option(\n", " _DEFAULT_INGEST_LEAGUES, \n", " help='Leagues to import', \n", " callback=lambda xs: [wingback.understat.League(x) for x in xs]\n", " ), \n", " seasons: typing.List[int] = typer.Option(\n", " _DEFAULT_INGEST_SEASONS, \n", " help='Seasons to import (by start year)'\n", " ),\n", "):\n", " \"\"\" Ingest match and shot data from Understat.com \"\"\"\n", " \n", " initialize_db()\n", " client = wingback.understat.Understat()\n", " \n", " for league, season in itertools.product(leagues, seasons):\n", " # Add league & season to DB\n", " with wingback.db.DB.atomic():\n", " db_league, _ = wingback.db.League.get_or_create(name=league.value)\n", " db_season, _ = wingback.db.Season.get_or_create(name=season)\n", " \n", " # Check if a record for this league and season already exists. If so, skip it.\n", " existing_record = wingback.db.Matches.get_or_none(\n", " league_id=db_league.id, \n", " season_id=db_season.id\n", " )\n", " if (not refresh) and existing_record:\n", " typer.secho(\n", " f'Data for {league.value}, {season} already exists. Skipping. '\n", " 'To update data for this league and season, use the `--refresh` flag', \n", " fg=typer.colors.BRIGHT_BLACK\n", " )\n", " continue\n", " \n", " # Add match and shot data to DB\n", " typer.secho(f'Ingesting data for {league.value}, {season}', fg=typer.colors.BLUE)\n", " with wingback.db.DB.atomic():\n", " \n", " # Fetch match data from understat\n", " matches = client.matches(league, season)\n", " \n", " # Delete any old match data\n", " if refresh:\n", " wingback.db.Matches.delete().where(\n", " (wingback.db.Matches.league_id==db_league.id) &\n", " (wingback.db.Matches.season_id==db_season.id)\n", " ).execute()\n", " \n", " db_matches = wingback.db.Matches.create(\n", " league_id=db_league.id,\n", " season_id=db_season.id,\n", " json=matches,\n", " version=wingback.__version__\n", " )\n", " \n", " with typer.progressbar(matches, label='Shots') as progress:\n", " for match in progress:\n", " if not match['isResult']:\n", " continue\n", " \n", " # Add an artificial crawl delay to avoid bombarding \n", " # understat with requests\n", " # There's no robots.txt or ToS available on the site,\n", " # So we just use a relatively conservative delay of\n", " # 5 seconds per (shots) request\n", " time.sleep(5)\n", " \n", " match_id = int(match['id'])\n", " shots = client.shots(match_id)\n", " \n", " # Delete any old shots data\n", " if refresh:\n", " wingback.db.Shots.delete().where(\n", " wingback.db.Shots.match_id==match_id\n", " ).execute()\n", " \n", " db_shots = wingback.db.Shots.create(\n", " match_id=match_id,\n", " json=shots,\n", " version=wingback.__version__\n", " )\n", " \n", " # Rebuild tables in dbt\n", " build_tables(args=['--models', 'staging'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Modelling" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "\n", "\n", "@app.command()\n", "def resimulate(\n", " refresh: bool = False,\n", " leagues: typing.List[str] = typer.Option(\n", " _DEFAULT_INGEST_LEAGUES, \n", " help='Leagues to import', \n", " callback=lambda xs: [wingback.understat.League(x) for x in xs]\n", " ),\n", " seasons: typing.List[int] = typer.Option(\n", " _DEFAULT_INGEST_SEASONS, \n", " help='Seasons to import (by start year)'\n", " ),\n", "):\n", " \"\"\" Resimulate matches based on individual shot xGs \"\"\"\n", " initialize_db()\n", " \n", " for league, season in itertools.product(leagues, seasons):\n", " # TODO: handle case where no league exists\n", " league_id = wingback.db.League.get(name=league.value).id\n", " season_id = wingback.db.Season.get(name=season).id\n", " matches = list(wingback.db.queries.fetch_matches(\n", " league_ids=[league_id], \n", " season_ids=[season_id],\n", " start=None,\n", " end=None\n", " ))\n", " \n", " typer.secho(f'Resimulating matches for {league.value}, {season}', fg=typer.colors.BLUE)\n", " with typer.progressbar(matches, label='Matches') as progress:\n", " for match in progress:\n", " # Check if match is already resimulated and skip if so\n", " if (not refresh) and wingback.db.Resimulation.get_or_none(match_id=match['id']):\n", " continue\n", " \n", " shots = list(wingback.db.queries.fetch_shots(match_id=match['id']))\n", " resims = wingback.resimulation.resimulate_match(shots)\n", " \n", " with wingback.db.DB.atomic():\n", " # Delete any old resim data\n", " if refresh:\n", " wingback.db.Resimulation.delete().where(\n", " wingback.db.Resimulation.match_id==match['id']\n", " ).execute()\n", "\n", " db_resim = wingback.db.Resimulation.create(\n", " match_id=match['id'],\n", " json=resims,\n", " version=wingback.__version__\n", " )\n", "\n", " build_tables(args=['--models', 'resimulation'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "\n", "\n", "@app.command()\n", "def backtest(\n", " refresh: bool = False,\n", " models: typing.List[str] = typer.Option(\n", " list(wingback.team_strength.MODEL_REGISTRY.keys()), \n", " help='Models to fit',\n", " ),\n", " # Because there's no overlap across leagues in the understat dataset\n", " # just pick one league at a time\n", " league: str = typer.Option(\n", " 'EPL', \n", " help='League to run team-strength model on', \n", " callback=lambda x: wingback.understat.League(x)\n", " ),\n", " start_date: str = typer.Option(\n", " '2015-07-01', \n", " help='Start fitting the model from a certain date', \n", " callback=lambda x: dt.datetime.strptime(x, '%Y-%m-%d').date()\n", " )\n", "):\n", " \"\"\" Fit team strength model(s) and persist to database \"\"\"\n", " initialize_db()\n", " \n", " db_league = wingback.db.League.get_or_none(name=league.value)\n", " if not db_league:\n", " raise ValueError(f'No data for \"{league}\" found. Do you need to import the data?')\n", " league_id = db_league.id\n", " \n", " matchdays = list(wingback.db.queries.fetch_matchdays(\n", " league_id=league_id,\n", " start=start_date,\n", " end=None\n", " ))\n", " typer.secho(f'Found {len(matchdays)} {league.value} matchdays from {start_date}', fg=typer.colors.BLUE)\n", " \n", " typer.secho(f'Backtesting models...', fg=typer.colors.BRIGHT_BLACK)\n", " for model_name in models:\n", " model = wingback.team_strength.MODEL_REGISTRY[model_name]\n", " \n", " with typer.progressbar(matchdays, label=model_name) as progress:\n", " for matchday in progress:\n", " date = matchday['date']\n", " \n", " # Check if record exists...\n", " if (not refresh) and wingback.db.Backtest.get_or_none(\n", " model=model_name,\n", " league_id=league_id,\n", " date=date\n", " ):\n", " continue\n", "\n", " # Fit the model\n", " train = model.fetch_data([league_id], date)\n", " model.fit(train)\n", " \n", " # Fetch the days' matches to test the model\n", " test = list(wingback.db.queries.fetch_matches(\n", " start=date, \n", " end=date+dt.timedelta(days=1), \n", " league_ids=[league_id],\n", " season_ids=[None]\n", " ))\n", "\n", " # Make predictions for that matchday\n", " predictions = model.predict(test)\n", "\n", " # Save model and predictions to database\n", " with wingback.db.DB.atomic():\n", " # Delete any existing records if `refresh`...\n", " if refresh:\n", " wingback.db.Backtest.delete().where(\n", " wingback.db.Backtest.model==model_name,\n", " wingback.db.Backtest.league_id==league_id,\n", " wingback.db.Backtest.date==date,\n", " ).execute()\n", "\n", " wingback.db.Backtest.create(\n", " model=model_name,\n", " league_id=league_id,\n", " date=date,\n", " json={\n", " 'model': model_name,\n", " 'parameters': model.to_dict(),\n", " 'predictions': [\n", " {'match_id': match['id'],\n", " 'scorelines': [dataclasses.asdict(p) for p in preds]} \n", " for match, preds in zip(test, predictions)\n", " ],\n", " },\n", " version=wingback.__version__\n", " )\n", " \n", " # Rebuild tables in dbt\n", " build_tables(args=['--models', 'backtest'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "\n", "Finally, make the app executable from the command line" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "\n", "# Try/except block seems to be the 'canonical'\n", "# way to export __name__ == __main__ in nbdev.\n", "# By excepting an ImportError, we don't have to\n", "# include nbdev as a runtime dependency (only a\n", "# development dependency).\n", "# \n", "# See: \n", "# * https://pete88b.github.io/fastpages/nbdev/fastai/jupyter/2020/07/24/nbdev-deep-dive.html#Export-a-if-__name__-==-\n", "# * https://forums.fast.ai/t/nbdev-is-there-a-way-to-export-a-if-name-main-clause/73050/3\n", "try:\n", " from nbdev.imports import IN_NOTEBOOK\n", "except ImportError: \n", " IN_NOTEBOOK = False\n", "\n", "if __name__ == '__main__' and not IN_NOTEBOOK:\n", " app()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }