{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sebastian Raschka 03/01/2015 \n", "\n", "CPython 3.4.2\n", "IPython 2.3.1\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark -a 'Sebastian Raschka' -v -d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Collecting Premier League Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sections" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- [dreamteamfc.com](#dreamteamfc.com)\n", "    - [Getting General Player Statistics](#Getting-General-Player-Statistics)\n", "    - [Getting Injuries and Cards Information](#Getting-Injuries-and-Cards-Information)\n", "    - [Getting Player Form Information](#Getting-Player-Form-Information)\n", "    - [Saving the Data to CSV](#Saving-the-Data-to-CSV)\n", "- [espnfc.com](#espnfc.com)\n", "    - [Getting Team Ranks and Stats](#Getting-Team-Ranks-and-Stats)\n", "    - [Saving ESPN Data to CSV](#Saving-ESPN-Data-to-CSV)\n", "    - [Getting Top Scorer](#Getting-Top-Scorer)\n", "    - [Getting Top Assists](#Getting-Top-Assists)\n", "- [365stats.com](#365stats.com)\n", "    - [Getting Injury Data](#Getting-Injury-Data)\n", "    - [Saving 365stats Data to CSV](#Saving-365stats-Data-to-CSV)\n", "- [Transfermarkt.com](#Transfermarkt.com)\n", "    - [Getting Home and Away Teams](#Getting-Home-and-Away-Teams)\n", "    - [Saving Home and Away Teams to CSV](#Saving-Home-and-Away-Teams-to-CSV)\n", "- [premierleague.com](#premierleague.com)\n", "- [telegraph.co.uk](#telegraph.co.uk)\n", "    - [Getting Current Week Points](#Getting-Current-Week-Points)\n", "    - [Getting 6-Week Points](#Getting-6---Week-Points)\n", "    - [Saving telegraph.co.uk to CSV](#Saving-telegraph.co.uk-to-CSV)\n", "- [m.premierleague.com](#m.premierleague.com)\n", "    - [Combined Form of Previous 6 Days](#Combined-Form-of-Previous-6-Days)\n", "    - [Saving m.premierleague.com to CSV](#Saving-m.premierleague.com-to-CSV)\n", "- [fantasyfootballscout.co.uk](#fantasyfootballscout.co.uk)\n", "    - [Predicted Line-Ups](#Predicted-Line-Ups)\n", "    - [Saving fantasyfootballscout.co.uk to CSV](#Saving-fantasyfootballscout.co.uk-to-CSV)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# dreamteamfc.com" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting General Player Statistics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import bs4\n", "import requests" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found: 401\n", "O'Brien, Joey\n" ] } ], "source": [ "# Downloading and parsing the data into a Python dict\n", "\n", "player_dict = {}\n", "\n", "url = 'https://www.dreamteamfc.com/statistics/players/ALL/'\n", "r = requests.get(url)\n", "r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page\n", "soup = BeautifulSoup(r.text, 'html5lib')\n", "# Note: html5lib deals better with broken html than lxml\n", "\n", "name_list = []\n", "\n", "for td in soup.findAll(\"td\", { \"class\" : \"tabName\" }):\n", "    # Keep only the text after the last 'Statistics' marker in the name cell\n", "    name = td.text.split('Statistics')[-1].strip()\n", "    if not name:\n", "        continue  # cell without a player name\n", "    name_list.append(name)\n", "    # Text of the following sibling tags (bare strings between tags are ignored)\n", "    res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]\n", "    position, team, vfm, value, points = res\n", "    value = value.strip('m')  # remove the 'm' so the value can be cast to float later\n", "    player_dict[name] = [name, position, team, vfm, value, points]\n", "\n", "print('Found: %s' % len(name_list))\n", "if name_list:  # nothing to show when no player row was found\n", "    print(name_list[-1])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namepositionteamvfmvaluepts
Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86
van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0
Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4
Ince, Tom Ince, Tom MID HUL 6.50 2.0 13
Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4
\n", "
" ], "text/plain": [ " name position team vfm value pts\n", "Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86\n", "van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0\n", "Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4\n", "Ince, Tom Ince, Tom MID HUL 6.50 2.0 13\n", "Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reading the data into a pandas DataFrame\n", "\n", "df = pd.DataFrame.from_dict(player_dict, orient='index')\n", "df.columns = ['name', 'position', 'team', 'vfm', 'value', 'pts']\n", "df[['vfm','value']] = df[['vfm','value']].astype(float)\n", "df[['pts']] = df[['pts']].astype(int)\n", "df.tail()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
vfmvaluepts
count 401.000000 401.000000 401.000000
mean 11.185661 2.770574 29.581047
std 10.259686 1.416327 27.582405
min -13.000000 1.000000 -13.000000
25% 3.600000 1.500000 9.000000
50% 9.330000 2.500000 24.000000
75% 15.850000 3.500000 43.000000
max 93.330000 7.500000 167.000000
\n", "
" ], "text/plain": [ " vfm value pts\n", "count 401.000000 401.000000 401.000000\n", "mean 11.185661 2.770574 29.581047\n", "std 10.259686 1.416327 27.582405\n", "min -13.000000 1.000000 -13.000000\n", "25% 3.600000 1.500000 9.000000\n", "50% 9.330000 2.500000 24.000000\n", "75% 15.850000 3.500000 43.000000\n", "max 93.330000 7.500000 167.000000" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Injuries and Cards Information" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df['status'] = pd.Series('', index=df.index)\n", "df['description'] = pd.Series('', index=df.index)\n", "df['returns'] = pd.Series('', index=df.index)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found: 81\n", "Fernando\n" ] } ], "source": [ "# Filling status / description / returns for the players listed on the injuries-and-cards page\n", "\n", "url = 'https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/'\n", "r = requests.get(url)\n", "r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page\n", "soup = BeautifulSoup(r.text, 'html5lib')\n", "\n", "name_list = []\n", "\n", "for td in soup.findAll(\"td\", { \"class\" : \"tabName2\" }):\n", "    # Keep only the text after the last 'stats' marker in the name cell\n", "    name = td.text.split('stats')[-1].strip()\n", "    if not name:\n", "        continue  # cell without a player name\n", "    name_list.append(name)\n", "    # Text of the following sibling tags (bare strings between tags are ignored)\n", "    res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]\n", "    position, team, status, description, returns = res\n", "    # Rows are matched on the index label; names absent from df leave it unchanged\n", "    df.loc[df.index==name,['status', 'description', 'returns']] = status, description, returns\n", "\n", "print('Found: %s' % len(name_list))\n", "if name_list:  # nothing to show when no player row was found\n", "    print(name_list[-1])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namepositionteamvfmvalueptsstatusdescriptionreturns
Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86
van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0 Unavailable Joined AC Milan on season-long loan 25/05/2015
Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4 Injured Sustained in the encounter with Stoke on 13/12... 26/12/2014
Ince, Tom Ince, Tom MID HUL 6.50 2.0 13
Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4 Doubtful Rated a doubt for 22/12 visit of Chelsea. 01/01/2015
\n", "
" ], "text/plain": [ " name position team vfm value pts \\\n", "Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86 \n", "van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0 \n", "Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4 \n", "Ince, Tom Ince, Tom MID HUL 6.50 2.0 13 \n", "Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4 \n", "\n", " status \\\n", "Sigurdsson, Gylfi \n", "van Ginkel, Marco Unavailable \n", "Chamakh, Marouane Injured \n", "Ince, Tom \n", "Ireland, Stephen Doubtful \n", "\n", " description \\\n", "Sigurdsson, Gylfi \n", "van Ginkel, Marco Joined AC Milan on season-long loan \n", "Chamakh, Marouane Sustained in the encounter with Stoke on 13/12... \n", "Ince, Tom \n", "Ireland, Stephen Rated a doubt for 22/12 visit of Chelsea. \n", "\n", " returns \n", "Sigurdsson, Gylfi \n", "van Ginkel, Marco 25/05/2015 \n", "Chamakh, Marouane 26/12/2014 \n", "Ince, Tom \n", "Ireland, Stephen 01/01/2015 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Player Form Information" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df['month_pts'] = pd.Series(0, index=df.index)\n", "df['week_pts'] = pd.Series(0, index=df.index)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found: 401\n", "O'Brien, Joey\n" ] } ], "source": [ "# Filling month_pts / week_pts from the last two cells of each form-guide row\n", "\n", "url = 'https://www.dreamteamfc.com/statistics/form-guide/all'\n", "r = requests.get(url)\n", "r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page\n", "soup = BeautifulSoup(r.text, 'html5lib')\n", "\n", "name_list = []\n", "skipped = []  # players whose row had no usable numbers\n", "\n", "for td in soup.findAll(\"td\", { \"class\" : \"tabName\" }):\n", "    name = td.text.strip()\n", "    if not name:\n", "        continue  # cell without a player name\n", "    name_list.append(name)\n", "\n", "    # Text of the following sibling tags (bare strings between tags are ignored)\n", "    res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]\n", "    try:\n", "        month_pts, week_pts = float(res[-2]), float(res[-1])\n", "    except (ValueError, IndexError):\n", "        # Best effort: a row without two numeric trailing cells keeps the default 0,\n", "        # but it is recorded instead of being dropped silently\n", "        skipped.append(name)\n", "        continue\n", "    # Kept outside the try block so that assignment errors are not swallowed\n", "    df.loc[df.index==name, ['month_pts', 'week_pts']] = month_pts, week_pts\n", "\n", "print('Found: %s' % len(name_list))\n", "if name_list:  # nothing to show when no player row was found\n", "    print(name_list[-1])\n", "if skipped:\n", "    print('Skipped (no numeric form data): %s' % len(skipped))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namepositionteamvfmvalueptsmonth_ptsweek_ptsstatusdescriptionreturns
Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86 28 5
van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0 0 0 Unavailable Joined AC Milan on season-long loan 25/05/2015
Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4 -1 0 Injured Sustained in the encounter with Stoke on 13/12... 26/12/2014
Ince, Tom Ince, Tom MID HUL 6.50 2.0 13 0 0
Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4 -1 0 Doubtful Rated a doubt for 22/12 visit of Chelsea. 01/01/2015
\n", "
" ], "text/plain": [ " name position team vfm value pts \\\n", "Sigurdsson, Gylfi Sigurdsson, Gylfi MID SWA 28.67 3.0 86 \n", "van Ginkel, Marco van Ginkel, Marco MID CHE 0.00 2.0 0 \n", "Chamakh, Marouane Chamakh, Marouane STR CRY 2.67 1.5 4 \n", "Ince, Tom Ince, Tom MID HUL 6.50 2.0 13 \n", "Ireland, Stephen Ireland, Stephen MID STO 4.00 1.0 4 \n", "\n", " month_pts week_pts status \\\n", "Sigurdsson, Gylfi 28 5 \n", "van Ginkel, Marco 0 0 Unavailable \n", "Chamakh, Marouane -1 0 Injured \n", "Ince, Tom 0 0 \n", "Ireland, Stephen -1 0 Doubtful \n", "\n", " description \\\n", "Sigurdsson, Gylfi \n", "van Ginkel, Marco Joined AC Milan on season-long loan \n", "Chamakh, Marouane Sustained in the encounter with Stoke on 13/12... \n", "Ince, Tom \n", "Ireland, Stephen Rated a doubt for 22/12 visit of Chelsea. \n", "\n", " returns \n", "Sigurdsson, Gylfi \n", "van Ginkel, Marco 25/05/2015 \n", "Chamakh, Marouane 26/12/2014 \n", "Ince, Tom \n", "Ireland, Stephen 01/01/2015 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reordering the columns\n", "\n", "df = df[['name', 'position', 'team', 'vfm', 'value', 'pts', 'month_pts', \n", " 'week_pts', 'status', 'description', 'returns']]\n", "\n", "df.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving the Data to CSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20141220\n" ] } ], "source": [ "# Getting the current time stamp for the data\n", "\n", "from datetime import datetime\n", "\n", "url = 'https://www.dreamteamfc.com/statistics/players/ALL/'\n", "r = requests.get(url)\n", "r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page\n", "data = r.text\n", "# Name the parser explicitly, as the other cells do, so the parse tree does not\n", "# depend on which parser libraries happen to be installed\n", "soup = BeautifulSoup(data, 'html5lib')\n", "\n", "# The last whitespace-separated token of the update note is read as a dd/mm/yyyy date\n", "raw_date = soup.find('li', {'class' : 'pointsupdateinfo' }).text\n", "raw_date = raw_date.split()[-1].replace('/', '').strip()\n", "d = datetime.strptime(raw_date, '%d%m%Y').date()\n", "date = d.strftime('%Y%m%d')  # yyyymmdd, used in the CSV file name\n", "print(date)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.to_csv('../data/dreamteamfc_%s.csv' % date, index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# espnfc.com" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Team Ranks and Stats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import bs4\n", "import requests" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Downloading and parsing the data into a Python dict\n", "\n", "team_dict = {}\n", "\n", "url = 'http://www.espnfc.com/barclays-premier-league/23/table'\n", "r = requests.get(url)\n", "r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page\n", "soup = BeautifulSoup(r.text, 'html5lib')\n", "# Note: html5lib deals better with broken html than lxml\n", "\n", "for td in soup.findAll('td', { 'class' : 'pos' }):\n", "    rank = int(td.text)\n", "    # Following sibling tags, minus the cells that hold only a non-breaking space\n", "    res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag) and i.text!='\\xa0']\n", "    team_name = res[0].strip()  # first remaining cell is the team name\n", "    values = [int(i) for i in res[1:]]  # every other cell must be an integer\n", "    team_dict[team_name] = [rank] + values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Column legend:\n", "\n", "- Pos: POSITION\n", "- P: GAMES PLAYED \n", "- W: WINS \n", "- D: DRAWS \n", "- L: LOSSES \n", "- F: GOALS FOR \n", "- A: GOALS AGAINST \n", "- GD: GOAL DIFFERENCE \n", "- PTS: POINTS\n", "\n", "suffixes:\n", "- _ov: OVERALL\n", "- _hm: HOME GAMES\n", "- _aw: AWAY GAMES" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
teamPosP_ovW_ovD_ovL_ovF_ovA_ovW_hmD_hmL_hmF_hmA_hmW_awD_awL_awF_awA_awGDPTS
Chelsea Chelsea 1 20 14 4 2 44 19 9 0 0 22 3 5 4 2 22 16 25 46
Manchester City Manchester City 2 20 14 4 2 44 19 7 2 1 20 9 7 2 1 24 10 25 46
Manchester United Manchester United 3 20 10 7 3 34 20 8 1 1 22 7 2 6 2 12 13 14 37
Southampton Southampton 4 20 11 3 6 34 15 7 2 2 24 7 4 1 4 10 8 19 36
Tottenham Hotspur Tottenham Hotspur 5 20 10 4 6 29 27 5 2 4 16 13 5 2 2 13 14 2 34
Arsenal Arsenal 6 20 9 6 5 34 25 5 3 1 18 10 4 3 4 16 15 9 33
West Ham United West Ham United 7 20 9 5 6 31 24 6 2 3 16 10 3 3 3 15 14 7 32
Liverpool Liverpool 8 20 8 5 7 28 27 4 5 2 15 11 4 0 5 13 16 1 29
Swansea City Swansea City 9 20 8 5 7 25 24 6 2 2 15 7 2 3 5 10 17 1 29
Newcastle United Newcastle United 10 20 7 6 7 25 31 5 3 2 16 14 2 3 5 9 17 -6 27
Stoke City Stoke City 11 20 7 5 8 22 24 4 2 4 12 12 3 3 4 10 12 -2 26
Aston Villa Aston Villa 12 20 5 7 8 11 22 2 5 3 7 11 3 2 5 4 11-11 22
Everton Everton 13 20 5 6 9 29 33 3 3 3 16 15 2 3 6 13 18 -4 21
Sunderland Sunderland 14 20 3 11 6 18 30 1 6 3 10 15 2 5 3 8 15-12 20
Hull City Hull City 15 20 4 7 9 20 26 2 3 5 10 12 2 4 4 10 14 -6 19
Queens Park Rangers Queens Park Rangers 16 20 5 4 11 22 35 5 4 2 18 13 0 0 9 4 22-13 19
West Bromwich Albion West Bromwich Albion 17 20 4 6 10 19 29 2 3 5 13 16 2 3 5 6 13-10 18
Crystal Palace Crystal Palace 18 20 3 8 9 20 30 2 2 5 10 14 1 6 4 10 16-10 17
Burnley Burnley 19 20 3 8 9 17 32 2 4 4 7 12 1 4 5 10 20-15 17
Leicester City Leicester City 20 20 3 5 12 19 33 1 4 4 12 15 2 1 8 7 18-14 14
\n", "
" ], "text/plain": [ " team Pos P_ov W_ov D_ov L_ov F_ov \\\n", "Chelsea Chelsea 1 20 14 4 2 44 \n", "Manchester City Manchester City 2 20 14 4 2 44 \n", "Manchester United Manchester United 3 20 10 7 3 34 \n", "Southampton Southampton 4 20 11 3 6 34 \n", "Tottenham Hotspur Tottenham Hotspur 5 20 10 4 6 29 \n", "Arsenal Arsenal 6 20 9 6 5 34 \n", "West Ham United West Ham United 7 20 9 5 6 31 \n", "Liverpool Liverpool 8 20 8 5 7 28 \n", "Swansea City Swansea City 9 20 8 5 7 25 \n", "Newcastle United Newcastle United 10 20 7 6 7 25 \n", "Stoke City Stoke City 11 20 7 5 8 22 \n", "Aston Villa Aston Villa 12 20 5 7 8 11 \n", "Everton Everton 13 20 5 6 9 29 \n", "Sunderland Sunderland 14 20 3 11 6 18 \n", "Hull City Hull City 15 20 4 7 9 20 \n", "Queens Park Rangers Queens Park Rangers 16 20 5 4 11 22 \n", "West Bromwich Albion West Bromwich Albion 17 20 4 6 10 19 \n", "Crystal Palace Crystal Palace 18 20 3 8 9 20 \n", "Burnley Burnley 19 20 3 8 9 17 \n", "Leicester City Leicester City 20 20 3 5 12 19 \n", "\n", " A_ov W_hm D_hm L_hm F_hm A_hm W_aw D_aw L_aw \\\n", "Chelsea 19 9 0 0 22 3 5 4 2 \n", "Manchester City 19 7 2 1 20 9 7 2 1 \n", "Manchester United 20 8 1 1 22 7 2 6 2 \n", "Southampton 15 7 2 2 24 7 4 1 4 \n", "Tottenham Hotspur 27 5 2 4 16 13 5 2 2 \n", "Arsenal 25 5 3 1 18 10 4 3 4 \n", "West Ham United 24 6 2 3 16 10 3 3 3 \n", "Liverpool 27 4 5 2 15 11 4 0 5 \n", "Swansea City 24 6 2 2 15 7 2 3 5 \n", "Newcastle United 31 5 3 2 16 14 2 3 5 \n", "Stoke City 24 4 2 4 12 12 3 3 4 \n", "Aston Villa 22 2 5 3 7 11 3 2 5 \n", "Everton 33 3 3 3 16 15 2 3 6 \n", "Sunderland 30 1 6 3 10 15 2 5 3 \n", "Hull City 26 2 3 5 10 12 2 4 4 \n", "Queens Park Rangers 35 5 4 2 18 13 0 0 9 \n", "West Bromwich Albion 29 2 3 5 13 16 2 3 5 \n", "Crystal Palace 30 2 2 5 10 14 1 6 4 \n", "Burnley 32 2 4 4 7 12 1 4 5 \n", "Leicester City 33 1 4 4 12 15 2 1 8 \n", "\n", " F_aw A_aw GD PTS \n", "Chelsea 22 16 25 46 \n", "Manchester City 24 10 25 46 \n", "Manchester United 12 13 14 37 
\n", "Southampton 10 8 19 36 \n", "Tottenham Hotspur 13 14 2 34 \n", "Arsenal 16 15 9 33 \n", "West Ham United 15 14 7 32 \n", "Liverpool 13 16 1 29 \n", "Swansea City 10 17 1 29 \n", "Newcastle United 9 17 -6 27 \n", "Stoke City 10 12 -2 26 \n", "Aston Villa 4 11 -11 22 \n", "Everton 13 18 -4 21 \n", "Sunderland 8 15 -12 20 \n", "Hull City 10 14 -6 19 \n", "Queens Park Rangers 4 22 -13 19 \n", "West Bromwich Albion 6 13 -10 18 \n", "Crystal Palace 10 16 -10 17 \n", "Burnley 10 20 -15 17 \n", "Leicester City 7 18 -14 14 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reading the team table into a pandas DataFrame, ordered by league position\n", "\n", "df = pd.DataFrame.from_dict(team_dict, orient='index')\n", "cols = ['Pos','P_ov','W_ov','D_ov','L_ov','F_ov','A_ov',\n", "        'W_hm','D_hm','L_hm','F_hm','A_hm', 'W_aw',\n", "        'D_aw','L_aw','F_aw','A_aw','GD','PTS']\n", "df.columns = cols\n", "# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement\n", "df = df.sort_values('Pos')\n", "df['team'] = df.index  # expose the index (team name) as a regular column\n", "df = df[['team']+cols]\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving ESPN Data to CSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.to_csv('../data/2014_epl_day_17/espn_20141222.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Top Scorer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameteamgoals
Diego Costa Diego Costa Chelsea 17
Sergio Agüero Sergio Agüero Manchester City 14
Charlie Austin Charlie Austin Queens Park Rangers 13
Alexis Sánchez Alexis Sánchez Arsenal 12
Papiss Demba Cisse Papiss Demba Cisse Newcastle United 9
\n", "
" ], "text/plain": [ " name team goals\n", "Diego Costa Diego Costa Chelsea 17\n", "Sergio Agüero Sergio Agüero Manchester City 14\n", "Charlie Austin Charlie Austin Queens Park Rangers 13\n", "Alexis Sánchez Alexis Sánchez Arsenal 12\n", "Papiss Demba Cisse Papiss Demba Cisse Newcastle United 9" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Downloading and parsing the data into a Python dict\n", "\n", "player_dict = {}\n", "\n", "url = 'http://www.espnfc.com/barclays-premier-league/23/statistics/scorers'\n", "\n", "r = requests.get(url)\n", "r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page\n", "soup = BeautifulSoup(r.text, 'html5lib')\n", "# Note: html5lib deals better with broken html than lxml\n", "\n", "for td in soup.findAll('td', { 'headers' : 'player' }):\n", "    name = td.text\n", "    # Following sibling tags, minus the cells that hold only a non-breaking space\n", "    team, goals = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag) and i.text!='\\xa0']\n", "    player_dict[name] = [team, int(goals)]\n", "\n", "df_essc = pd.DataFrame.from_dict(player_dict, orient='index')\n", "df_essc['name'] = df_essc.index  # expose the index (player name) as a regular column\n", "df_essc.columns = ['team', 'goals', 'name']\n", "df_essc = df_essc[['name', 'team', 'goals']]\n", "# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement\n", "df_essc = df_essc.sort_values('goals', ascending=False)\n", "df_essc.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Top Assists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameteamassists
Cesc Fàbregas Cesc Fàbregas Chelsea 15
Gylfi Sigurdsson Gylfi Sigurdsson Swansea City 8
Leighton Baines Leighton Baines Everton 8
Stewart Downing Stewart Downing West Ham United 7
Dusan Tadic Dusan Tadic Southampton 7
\n", "
" ], "text/plain": [ " name team assists\n", "Cesc Fàbregas Cesc Fàbregas Chelsea 15\n", "Gylfi Sigurdsson Gylfi Sigurdsson Swansea City 8\n", "Leighton Baines Leighton Baines Everton 8\n", "Stewart Downing Stewart Downing West Ham United 7\n", "Dusan Tadic Dusan Tadic Southampton 7" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Downloading and parsing the data into a Python dict\n", "\n", "player_dict = {}\n", "\n", "url = 'http://www.espnfc.com/barclays-premier-league/23/statistics/assists'\n", "\n", "r = requests.get(url)\n", "r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page\n", "soup = BeautifulSoup(r.text, 'html5lib')\n", "# Note: html5lib deals better with broken html than lxml\n", "\n", "for td in soup.findAll('td', { 'headers' : 'player' }):\n", "    name = td.text\n", "    # Following sibling tags, minus the cells that hold only a non-breaking space\n", "    team, assists = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag) and i.text!='\\xa0']\n", "    player_dict[name] = [team, int(assists)]\n", "\n", "df_esas = pd.DataFrame.from_dict(player_dict, orient='index')\n", "df_esas['name'] = df_esas.index  # expose the index (player name) as a regular column\n", "df_esas.columns = ['team', 'assists', 'name']\n", "df_esas = df_esas[['name', 'team', 'assists']]\n", "# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement\n", "df_esas = df_esas.sort_values('assists', ascending=False)\n", "df_esas.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 365stats.com" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Injury Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import bs4\n", "import requests" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Downloading and parsing the data into a Python dict\n", "\n", "injury_dict = {}\n", "\n", "url = 'http://365stats.com/football/injuries'\n", "r = requests.get(url)\n", "r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page\n", "soup = BeautifulSoup(r.text, 'html5lib')\n", "# Note: html5lib deals better with broken html than lxml\n", "\n", "for td in soup.findAll('td', { 'nowrap' : 'nowrap' }):\n", "    name = td.text.split()\n", "    if not name:\n", "        continue  # empty cell: there is no name token to build a key from\n", "    # Move the first token behind the rest, e.g. ['L', 'Osman'] becomes 'Osman, L'\n", "    player = '%s, %s' % (' '.join(name[1:]), name[0])\n", "    # Text of the following sibling tags (bare strings between tags are ignored)\n", "    cells = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]\n", "    # Only the first two following cells are stored for each player\n", "    injury_dict[player] = cells[:2]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameinjuryreturns
Osman, L Osman, L Calf/Shin Injury no date
Hibbert, T Hibbert, T Muscular Injury 2 Weeks
Moses, V Moses, V Thigh Muscle Strain 2 Weeks
Taarabt, A Taarabt, A Groin/Pelvis Injury no date
Ward, S Ward, S Ankle/Foot Injury no date
\n", "
" ], "text/plain": [ " name injury returns\n", "Osman, L Osman, L Calf/Shin Injury no date\n", "Hibbert, T Hibbert, T Muscular Injury 2 Weeks\n", "Moses, V Moses, V Thigh Muscle Strain 2 Weeks\n", "Taarabt, A Taarabt, A Groin/Pelvis Injury no date\n", "Ward, S Ward, S Ankle/Foot Injury no date" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One row per key of injury_dict; the two stored values become the columns\n", "df = pd.DataFrame.from_dict(injury_dict, orient='index')\n", "df.columns = ['injury', 'returns']\n", "\n", "# Put a copy of the index in front as an explicit 'name' column\n", "df.insert(0, 'name', df.index)\n", "df.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving 365stats Data to CSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.to_csv('../data/2014_epl_day_17/365stats_injury_20141222.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Transfermarkt.com" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Home and Away Teams" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import bs4\n", "import requests" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
homeaway
0NorwichManchester City
1Manchester Utd.Crystal Palace
2BournemouthSwansea
3Stoke CitySouthampton FC
4West HamWatford
5SunderlandEverton
6Arsenal FCWest Brom
7Aston VillaSpurs
8LiverpoolChelsea FC
9Leicester CityNewcastle
\n", "
" ], "text/plain": [ " home away\n", "0 Norwich Manchester City\n", "1 Manchester Utd. Crystal Palace\n", "2 Bournemouth Swansea\n", "3 Stoke City Southampton FC\n", "4 West Ham Watford\n", "5 Sunderland Everton\n", "6 Arsenal FC West Brom\n", "7 Aston Villa Spurs\n", "8 Liverpool Chelsea FC\n", "9 Leicester City Newcastle" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Download the league page and collect the upcoming fixtures\n", "# into two parallel lists (home, away) and then a DataFrame\n", "\n", "url = 'http://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1'\n", "\n", "# The request is sent through a session with browser-like headers\n", "s = requests.Session()\n", "s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'\n", "s.headers['Host'] = 'www.transfermarkt.com'\n", "r = s.get(url)\n", "\n", "soup = BeautifulSoup(r.text, 'html5lib') \n", "# Note: html5lib deals better with broken html than lxml\n", "\n", "# Find tab for the upcoming fixtures:\n", "# if the second tab already contains 'ergebnis-link' anchors\n", "# (presumably links to results), fall through to the third tab\n", "tab = 'spieltagtabs-2'\n", "div = soup.find('div', { 'id' : tab })\n", "tit = div.findAll('a', { 'class' : 'ergebnis-link' })\n", "if len(tit) > 0:\n", "    tab = 'spieltagtabs-3'\n", "\n", "# Get fixtures\n", "home = []\n", "away = []\n", "\n", "div = soup.find('div', { 'id' : tab })\n", "for t in div.findAll('td', { 'class' : 'text-right no-border-rechts no-border-links' }):\n", "    team = t.text.strip()\n", "    if team:\n", "        home.append(team)\n", "for t in div.findAll('td', { 'class' : 'no-border-links no-border-rechts' }):\n", "    team = t.text.strip()\n", "    if team:\n", "        away.append(team)\n", "\n", "\n", "df = pd.DataFrame(home, columns=['home'])\n", "df['away'] = away\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving Home and Away Teams to CSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.to_csv('../data/2014_epl_day_19/transfermarkt_20141227.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# premierleague.com" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import bs4\n", "import requests" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Spurs', 'Man Utd']\n", "['Southampton', 'Chelsea']\n", "['Aston Villa', 'Sunderland']\n", "['Hull', 'Leicester']\n", "['Man City', 'Burnley']\n", "['QPR', 'Crystal Palace']\n", "['Stoke', 'West Brom']\n", "['West Ham', 'Arsenal']\n", "['Newcastle', 'Everton']\n", "['Leicester', 'Newcastle']\n", "['Tranmere', 'Swansea']\n", "['West Brom', 'Gateshead']\n", "['Liverpool', 'Chelsea']\n", "['Paris SG', 'Chelsea']\n", "['BSC Young Boys', 'Everton']\n", "['Liverpool', 'Besiktas']\n", "['Spurs', 'Fiorentina']\n" ] } ], "source": [ "# Download the match-day page, print every 'Home v Away' pairing found\n", "# and collect the two sides into the home / away lists\n", "\n", "url = 'http://www.premierleague.com/en-gb/matchday.html'\n", "r = requests.get(url)\n", "soup = BeautifulSoup(r.text, 'html5lib') \n", "# Note: html5lib deals better with broken html than lxml\n", "\n", "home = []\n", "away = []\n", "\n", "for t in soup.findAll('td', { 'width' : '30%' }):\n", "    team = t.text.strip().split(' v ')\n", "    print(team)\n", "    # only a cell that splits into exactly two names is a fixture\n", "    if len(team) == 2:\n", "        home.append(team[0])\n", "        away.append(team[1])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# telegraph.co.uk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Current Week Points" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import bs4\n", "import requests" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(548, 6)\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameteamsalarypts/salaryweek_ptstotal_pts
Januzaj, A Januzaj, A Manchester United 3.9 3.8 1 15
Grealish, J Grealish, J Aston Villa 2.5 4.0 3 10
Anichebe, V Anichebe, V West Bromwich Albion 4.0 8.2 10 33
Hibbert, T Hibbert, T Everton 2.2 3.2 0 7
Coutinho, P Coutinho, P Liverpool 4.4 11.1 2 49
\n", "
" ], "text/plain": [ " name team salary pts/salary week_pts \\\n", "Januzaj, A Januzaj, A Manchester United 3.9 3.8 1 \n", "Grealish, J Grealish, J Aston Villa 2.5 4.0 3 \n", "Anichebe, V Anichebe, V West Bromwich Albion 4.0 8.2 10 \n", "Hibbert, T Hibbert, T Everton 2.2 3.2 0 \n", "Coutinho, P Coutinho, P Liverpool 4.4 11.1 2 \n", "\n", " total_pts \n", "Januzaj, A 15 \n", "Grealish, J 10 \n", "Anichebe, V 33 \n", "Hibbert, T 7 \n", "Coutinho, P 49 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url = 'https://fantasyfootball.telegraph.co.uk/premierleague/players/'\n", "r = requests.get(url)\n", "soup = BeautifulSoup(r.text, 'html5lib') \n", "# Note: html5lib deals better with broken html than lxml\n", "\n", "# player name -> texts of the tag siblings that follow the name cell\n", "player_dict = {}\n", "\n", "for t in soup.findAll('td', { 'class' : 'first' }):\n", "    player = t.text.strip()\n", "    player_dict[player] = [s.text for s in t.next_siblings if isinstance(s, bs4.Tag)]\n", "\n", "# one row per player, the scraped cells become the stat columns\n", "df = pd.DataFrame.from_dict(player_dict, orient='index')\n", "df.columns = ['team', 'salary', 'pts/salary', 'week_pts', 'total_pts']\n", "\n", "# the index (player name) is repeated as the leading 'name' column\n", "df.insert(0, 'name', df.index)\n", "\n", "# turn the scraped strings into numbers\n", "df['salary'] = df['salary'].map(lambda x: x.strip('£').strip(' m'))\n", "for col in ('salary', 'pts/salary'):\n", "    df[col] = df[col].astype(float)\n", "for col in ('week_pts', 'total_pts'):\n", "    df[col] = df[col].astype(int)\n", "\n", "print(df.shape)\n", "df.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting 6-Week Points" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameteamsalarypts/salaryweek_ptstotal_pts6week_pts
Januzaj, A Januzaj, A Manchester United 3.9 3.8 1 15 2
Grealish, J Grealish, J Aston Villa 2.5 4.0 3 10 7
Anichebe, V Anichebe, V West Bromwich Albion 4.0 8.2 10 33 16
Hibbert, T Hibbert, T Everton 2.2 3.2 0 7 4
Coutinho, P Coutinho, P Liverpool 4.4 11.1 2 49 27
\n", "
" ], "text/plain": [ " name team salary pts/salary week_pts \\\n", "Januzaj, A Januzaj, A Manchester United 3.9 3.8 1 \n", "Grealish, J Grealish, J Aston Villa 2.5 4.0 3 \n", "Anichebe, V Anichebe, V West Bromwich Albion 4.0 8.2 10 \n", "Hibbert, T Hibbert, T Everton 2.2 3.2 0 \n", "Coutinho, P Coutinho, P Liverpool 4.4 11.1 2 \n", "\n", " total_pts 6week_pts \n", "Januzaj, A 15 2 \n", "Grealish, J 10 7 \n", "Anichebe, V 33 16 \n", "Hibbert, T 7 4 \n", "Coutinho, P 49 27 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add a '6week_pts' column to the player frame built in the previous cell\n", "url = 'https://fantasyfootball.telegraph.co.uk/premierleague/formguide/'\n", "r = requests.get(url)\n", "soup = BeautifulSoup(r.text, 'html5lib') \n", "# Note: html5lib deals better with broken html than lxml\n", "\n", "# players that are not matched below keep the default of 0\n", "df['6week_pts'] = pd.Series(0, index=df.index)\n", "\n", "for t in soup.findAll('td', { 'class' : 'first' }):\n", "    player = t.text.strip()\n", "    if player:\n", "        week6 = t.parent.find('td', { 'class' : 'sixth last' })\n", "        # find() returns None when the row has no such cell\n", "        if week6 is not None:\n", "            # store a number rather than the cell text so the column\n", "            # stays numeric like week_pts and total_pts\n", "            df.loc[df['name'] == player, '6week_pts'] = int(week6.text)\n", "\n", "df.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving telegraph.co.uk to CSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.to_csv('../data/2014_epl_day_20/telegraph_20141229.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# m.premierleague.com" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Combined Form of Previous 6 Days" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import bs4\n", "import requests" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
position-last-6-gamesteam
West Brom 15 West Brom
Hull 12 Hull
Southampton 9 Southampton
Newcastle 11 Newcastle
Crystal Palace 18 Crystal Palace
\n", "
" ], "text/plain": [ " position-last-6-games team\n", "West Brom 15 West Brom\n", "Hull 12 Hull\n", "Southampton 9 Southampton\n", "Newcastle 11 Newcastle\n", "Crystal Palace 18 Crystal Palace" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Map each club to the text of its position cell in the form guide\n", "url = 'http://m.premierleague.com/en-gb/form-guide.html'\n", "r = requests.get(url)\n", "soup = BeautifulSoup(r.text, 'html5lib') \n", "# Note: html5lib deals better with broken html than lxml\n", "\n", "# stop reading once this many clubs have been collected\n", "n_teams = 20\n", "\n", "team_dict = {}\n", "\n", "for d in soup.findAll('td', { 'class' : 'col-pos' }):\n", "    # >= (not >): the check runs before the next club is added\n", "    if len(team_dict) >= n_teams:\n", "        break\n", "    pos = d.text\n", "    # the club name is the first following sibling tagged 'col-club'\n", "    for e in d.next_siblings:\n", "        if isinstance(e, bs4.Tag):\n", "            if 'class' in e.attrs and 'col-club' in e.attrs['class']:\n", "                club = e.text\n", "                team_dict[club] = pos\n", "                break\n", "\n", "df = pd.DataFrame.from_dict(team_dict, orient='index')\n", "\n", "df.columns = ['position-last-6-games']\n", "df['team'] = df.index\n", "df.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving m.premierleague.com to CSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.to_csv('../data/2014_epl_day_20/mpremierleague_20141230.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# fantasyfootballscout.co.uk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predicted Line-Ups" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import bs4\n", "import requests" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ArsenalAston VillaBurnleyChelseaCrystal PalaceEvertonHull CityLeicester CityLiverpoolManchester CityManchester UnitedNewcastle UnitedQueens Park RangersSouthamptonStoke CitySunderlandSwansea CityTottenham HotspurWest Bromwich AlbionWest Ham United
6 Flamini Sánchez Jones Matic McArthur Barry Meyler Mahrez Lucas Fernandinho Carrick Tioté Henry Wanyama Whelan Johnson Ki Bentaleb Gardner Nolan
7 Coquelin Cleverley Marney Willian Joe Ledley Barkley Livermore Drinkwater Moreno Y Touré Rooney Ayoze Pérez Barton Davis Mame Biram Diouf Larsson Dyer Lamela Morrison Noble
8 Oxlade-Chamberlain N'Zogbia Boyd Oscar Bolasie Naismith Brady James Coutinho Nasri Young Sissoko Fer Mane Walters Gómez Sigurdsson Eriksen Brunt Downing
9 Sánchez Benteke Barnes Hazard Zaha Mirallas Aluko Vardy Sterling Silva Falcao Gouffran Zamora Tadic Arnautovic Wickham Routledge Chadli Dorrans Sakho
10 Cazorla Agbonlahor Ings Diego Costa Campbell Lukaku Jelavic Ulloa Lallana Jovetic van Persie Armstrong Austin Pellè Crouch Fletcher Bony Kane Berahino Carroll
\n", "
" ], "text/plain": [ " Arsenal Aston Villa Burnley Chelsea Crystal Palace \\\n", "6 Flamini Sánchez Jones Matic McArthur \n", "7 Coquelin Cleverley Marney Willian Joe Ledley \n", "8 Oxlade-Chamberlain N'Zogbia Boyd Oscar Bolasie \n", "9 Sánchez Benteke Barnes Hazard Zaha \n", "10 Cazorla Agbonlahor Ings Diego Costa Campbell \n", "\n", " Everton Hull City Leicester City Liverpool Manchester City \\\n", "6 Barry Meyler Mahrez Lucas Fernandinho \n", "7 Barkley Livermore Drinkwater Moreno Y Touré \n", "8 Naismith Brady James Coutinho Nasri \n", "9 Mirallas Aluko Vardy Sterling Silva \n", "10 Lukaku Jelavic Ulloa Lallana Jovetic \n", "\n", " Manchester United Newcastle United Queens Park Rangers Southampton \\\n", "6 Carrick Tioté Henry Wanyama \n", "7 Rooney Ayoze Pérez Barton Davis \n", "8 Young Sissoko Fer Mane \n", "9 Falcao Gouffran Zamora Tadic \n", "10 van Persie Armstrong Austin Pellè \n", "\n", " Stoke City Sunderland Swansea City Tottenham Hotspur \\\n", "6 Whelan Johnson Ki Bentaleb \n", "7 Mame Biram Diouf Larsson Dyer Lamela \n", "8 Walters Gómez Sigurdsson Eriksen \n", "9 Arnautovic Wickham Routledge Chadli \n", "10 Crouch Fletcher Bony Kane \n", "\n", " West Bromwich Albion West Ham United \n", "6 Gardner Nolan \n", "7 Morrison Noble \n", "8 Brunt Downing \n", "9 Dorrans Sakho \n", "10 Berahino Carroll " ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Collect the player names listed under each team heading of the page\n", "url = 'http://www.fantasyfootballscout.co.uk/team-news/'\n", "r = requests.get(url)\n", "soup = BeautifulSoup(r.text, 'html5lib') \n", "# Note: html5lib deals better with broken html than lxml\n", "\n", "\n", "team_dict = {}\n", "\n", "for li in soup.findAll('li'):\n", "    for h2 in li.findAll('h2'):\n", "        team = h2.text\n", "        team_dict[team] = []\n", "        for p in li.findAll('span', { 'class' : 'player-name' }):\n", "            player = p.text\n", "            team_dict[team].append(player)\n", "\n", "# One column per team. Each list is wrapped in a Series so that teams with\n", "# different numbers of listed players still fit into one frame\n", "# (shorter columns are padded with NaN instead of raising an error)\n", "df = pd.DataFrame({name: pd.Series(players, dtype=object) for name, players in team_dict.items()})\n", "df.tail()" ] }, { "cell_type": "markdown",
"metadata": {}, "source": [ "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving fantasyfootballscout.co.uk to CSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.to_csv('../data/epl_1314_21/fantasyfootballscout.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 0 }