{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "97dba7b0", "metadata": {}, "outputs": [], "source": [ "# default_exp understat" ] }, { "cell_type": "markdown", "id": "55d3555a", "metadata": {}, "source": [ "# Understat\n", "\n", "> A module for fetching data from understat.com" ] }, { "cell_type": "code", "execution_count": null, "id": "412c74ea", "metadata": {}, "outputs": [], "source": [ "#hide\n", "from nbdev.showdoc import *" ] }, { "cell_type": "markdown", "id": "44ae6d75", "metadata": {}, "source": [ "## Helper functions" ] }, { "cell_type": "code", "execution_count": null, "id": "ae044919", "metadata": {}, "outputs": [], "source": [ "#export\n", "import enum\n", "import re\n", "import json\n", "\n", "import requests\n", "import bs4" ] }, { "cell_type": "code", "execution_count": null, "id": "ea2022fc", "metadata": {}, "outputs": [], "source": [ "#export\n", "\n", "\n", "def fetch_html(url):\n", " \"\"\"\n", " Fetch HTML and decode into a `bs4.BeautifulSoup` object\n", " \"\"\"\n", " r = requests.get(url)\n", " r.raise_for_status()\n", " return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser')\n", " \n", " \n", "def extract_json(soup, json_var):\n", " \"\"\" Extract a JSON variable from understat HTML. \"\"\"\n", " node, *__ = [s for s in soup.select('script') if s.string and json_var in s.string]\n", " \n", " # Clean string by removing and newlines (\\n) and tabs (\\t)\n", " node_string = ' '.join(node.string.split())\n", " \n", " json_value = re.match(f\"var {json_var} = JSON\\.parse\\(\\'(?P.*?)\\'\\)\", node_string).group('json')\n", " return json.loads(json_value)" ] }, { "cell_type": "markdown", "id": "7429fd04", "metadata": {}, "source": [ "## Understat 'API'" ] }, { "cell_type": "code", "execution_count": null, "id": "7c0aa45f", "metadata": {}, "outputs": [], "source": [ "#export\n", "\n", "\n", "# 'Competition' might be a better name, but let's stick with understat's terminology\n", "class League(enum.Enum): \n", " \"\"\"\n", " Understat leagues\n", " \"\"\"\n", " EPL = 'EPL'\n", " LA_LIGA = 'La_Liga'\n", " SERIE_A = 'Serie_A'\n", " BUNDESLIGA = 'Bundesliga'\n", " LIGUE_1 = 'Ligue_1'\n", " RPL = 'RPL'" ] }, { "cell_type": "code", "execution_count": null, "id": "791b1e7f", "metadata": {}, "outputs": [], "source": [ "#export\n", "\n", "\n", "class Understat:\n", " \"\"\"\n", " Fetches understat data webpages\n", " \"\"\"\n", " \n", " def __init__(self, base_url: str='https://understat.com'):\n", " self.base_url = base_url\n", " \n", " def matches(self, league: League, season: int):\n", " \"\"\" Fetch match data for a given `league` and `season` (start year). \"\"\"\n", " league_url = f'{self.base_url}/league/{league.value}/{season}'\n", " soup = fetch_html(league_url)\n", " return extract_json(soup, 'datesData')\n", " \n", " def shots(self, match_id: int):\n", " match_url = f'{self.base_url}/match/{match_id}'\n", " soup = fetch_html(match_url)\n", " return extract_json(soup, 'shotsData')" ] }, { "cell_type": "markdown", "id": "25c6daa4", "metadata": {}, "source": [ "Fetch matches from Understat" ] }, { "cell_type": "code", "execution_count": null, "id": "fbb9f99e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': '11660',\n", " 'isResult': True,\n", " 'h': {'id': '238', 'title': 'Sheffield United', 'short_title': 'SHE'},\n", " 'a': {'id': '78', 'title': 'Crystal Palace', 'short_title': 'CRY'},\n", " 'goals': {'h': '1', 'a': '0'},\n", " 'xG': {'h': '1.84778', 'a': '0.241912'},\n", " 'datetime': '2019-08-18 14:00:00',\n", " 'forecast': {'w': '0.8326', 'd': '0.1408', 'l': '0.0266'}}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "understat = Understat()\n", "\n", "matches = understat.matches(League.EPL, 2019)[17]\n", "matches" ] }, { "cell_type": "markdown", "id": "eb666678", "metadata": {}, "source": [ "Fetch individual match shots" ] }, { "cell_type": "code", "execution_count": null, "id": "50c01835", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': '311085',\n", " 'minute': '25',\n", " 'result': 'BlockedShot',\n", " 'X': '0.899000015258789',\n", " 'Y': '0.5609999847412109',\n", " 'xG': '0.07507339864969254',\n", " 'player': 'Jack O'Connell',\n", " 'h_a': 'h',\n", " 'player_id': '7705',\n", " 'situation': 'FromCorner',\n", " 'season': '2019',\n", " 'shotType': 'LeftFoot',\n", " 'match_id': '11660',\n", " 'h_team': 'Sheffield United',\n", " 'a_team': 'Crystal Palace',\n", " 'h_goals': '1',\n", " 'a_goals': '0',\n", " 'date': '2019-08-18 14:00:00',\n", " 'player_assisted': 'Oliver Norwood',\n", " 'lastAction': 'Pass'}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shots = understat.shots(11660)\n", "\n", "# Take the home team's 5th shot\n", "shots['h'][5]" ] }, { "cell_type": "code", "execution_count": null, "id": "5027c2cb", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 5 }