{ "metadata": { "name": "", "signature": "sha256:ec4d60d620054da9715a81f0cf0cefa1ce95822e86978f66d727bb384d3c7204" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "%load_ext watermark\n", "%watermark -a 'Sebastian Raschka' -v" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Sebastian Raschka \n", "\n", "CPython 3.4.2\n", "IPython 2.3.1\n" ] } ], "prompt_number": 1 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Parsing data from dreamteamfc.com" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Sections" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- [Getting General Player Statistics](#Getting-General-Player-Statistics)\n", "- [Getting Injuries and Cards Information](#Getting-Injuries-and-Cards-Information)\n", "- [Getting Player Form Information](#Getting-Player-Form-Information)\n", "- [Saving the Data to CSV](#Saving-the-Data-to-CSV)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] },
{ "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Getting General Player Statistics" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] },
{ "cell_type": "code", "collapsed": false, "input": [
 "import pandas as pd\n",
 "from bs4 import BeautifulSoup\n",
 "import bs4\n",
 "import requests"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Downloading and parsing the data into a Python dict\n",
 "# (player name -> [name, position, team, vfm, value, points])\n",
 "\n",
 "player_dict = {}\n",
 "\n",
 "url = 'https://www.dreamteamfc.com/statistics/players/ALL/'\n",
 "r = requests.get(url)\n",
 "r.raise_for_status()  # stop here rather than parse an HTTP error page\n",
 "soup = BeautifulSoup(r.text, 'html5lib')\n",
 "# Note: html5lib deals better with broken html than lxml\n",
 "\n",
 "name_list = []\n",
 "\n",
 "for td in soup.find_all('td', {'class': 'tabName'}):\n",
 "    # Keep only the text after the last 'Statistics' marker in the cell\n",
 "    name = td.text.split('Statistics')[-1].strip()\n",
 "    if name:\n",
 "        name_list.append(name)\n",
 "        # Text of the tag siblings that follow the name cell in this row\n",
 "        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]\n",
 "        position, team, vfm, value, points = res\n",
 "        value = value.strip('m')  # remove leading/trailing 'm' so it parses as a float\n",
 "        player_dict[name] = [name, position, team, vfm, value, points]\n",
 "\n",
 "print('Found: %s' % len(name_list))\n",
 "print(name_list[-1])"
 ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Found: 401\n", "O'Brien, Joey\n" ] } ], "prompt_number": 3 },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Reading the data into a pandas DataFrame\n",
 "# (indexed by player name; scraped strings are converted to numeric dtypes)\n",
 "\n",
 "df = pd.DataFrame.from_dict(player_dict, orient='index')\n",
 "df.columns = ['name', 'position', 'team', 'vfm', 'value', 'points']\n",
 "df[['vfm','value']] = df[['vfm','value']].astype(float)\n",
 "df[['points']] = df[['points']].astype(int)\n",
 "df.tail()"
 ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namepositionteamvfmvaluepoints
Odemwingie, Peter Odemwingie, Peter STR STO 1.20 2.5 3
Cisse, Papiss Cisse, Papiss STR NEW 17.67 3.0 53
Duff, Michael Duff, Michael DEF BUR 18.00 1.0 18
Speroni, Julian Speroni, Julian GK CRY 10.67 1.5 16
Flamini, Mathieu Flamini, Mathieu MID ARS 14.00 1.5 21
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ " name position team vfm value points\n", "Odemwingie, Peter Odemwingie, Peter STR STO 1.20 2.5 3\n", "Cisse, Papiss Cisse, Papiss STR NEW 17.67 3.0 53\n", "Duff, Michael Duff, Michael DEF BUR 18.00 1.0 18\n", "Speroni, Julian Speroni, Julian GK CRY 10.67 1.5 16\n", "Flamini, Mathieu Flamini, Mathieu MID ARS 14.00 1.5 21" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "df.describe()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
vfmvaluepoints
count 401.000000 401.000000 401.000000
mean 10.083416 2.770574 26.705736
std 9.518409 1.416327 25.338867
min -12.000000 1.000000 -12.000000
25% 2.670000 1.500000 7.000000
50% 8.500000 2.500000 21.000000
75% 14.000000 3.500000 37.000000
max 88.000000 7.500000 155.000000
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 5, "text": [ " vfm value points\n", "count 401.000000 401.000000 401.000000\n", "mean 10.083416 2.770574 26.705736\n", "std 9.518409 1.416327 25.338867\n", "min -12.000000 1.000000 -12.000000\n", "25% 2.670000 1.500000 7.000000\n", "50% 8.500000 2.500000 21.000000\n", "75% 14.000000 3.500000 37.000000\n", "max 88.000000 7.500000 155.000000" ] } ], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] },
{ "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Getting Injuries and Cards Information" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Empty-string defaults; the next cell fills them in for listed players\n",
 "df['status'] = ''\n",
 "df['description'] = ''\n",
 "df['returns'] = ''"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Downloading the injuries/cards table and copying it into df by player name\n",
 "\n",
 "url = 'https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/'\n",
 "r = requests.get(url)\n",
 "r.raise_for_status()  # stop here rather than parse an HTTP error page\n",
 "soup = BeautifulSoup(r.text, 'html5lib')\n",
 "\n",
 "name_list = []\n",
 "\n",
 "for td in soup.find_all('td', {'class': 'tabName2'}):\n",
 "    # Keep only the text after the last 'stats' marker in the cell\n",
 "    name = td.text.split('stats')[-1].strip()\n",
 "    if name:\n",
 "        name_list.append(name)\n",
 "        # Text of the tag siblings that follow the name cell in this row\n",
 "        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]\n",
 "        position, team, status, description, returns = res\n",
 "        # Boolean mask instead of df.loc[name, ...]: a label that is not in\n",
 "        # the index must not be appended as a new row\n",
 "        df.loc[df.index==name,['status', 'description', 'returns']] = status, description, returns\n",
 "\n",
 "print('Found: %s' % len(name_list))\n",
 "if name_list:  # the page may list nobody; avoid an IndexError in that case\n",
 "    print(name_list[-1])"
 ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Found: 84\n", "Tadic, Dusan\n" ] } ], "prompt_number": 7 },
{ "cell_type": "code", "collapsed": false, "input": [
 "df.tail()"
 ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namepositionteamvfmvaluepointsstatusdescriptionreturns
Odemwingie, Peter Odemwingie, Peter STR STO 1.20 2.5 3 Injured Forced off during 30/8 game against Man City. ... Unknown
Cisse, Papiss Cisse, Papiss STR NEW 17.67 3.0 53 Doubtful Missed the Capital One Cup tie with Tottenham ... 21/12/2014
Duff, Michael Duff, Michael DEF BUR 18.00 1.0 18
Speroni, Julian Speroni, Julian GK CRY 10.67 1.5 16
Flamini, Mathieu Flamini, Mathieu MID ARS 14.00 1.5 21
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ " name position team vfm value points \\\n", "Odemwingie, Peter Odemwingie, Peter STR STO 1.20 2.5 3 \n", "Cisse, Papiss Cisse, Papiss STR NEW 17.67 3.0 53 \n", "Duff, Michael Duff, Michael DEF BUR 18.00 1.0 18 \n", "Speroni, Julian Speroni, Julian GK CRY 10.67 1.5 16 \n", "Flamini, Mathieu Flamini, Mathieu MID ARS 14.00 1.5 21 \n", "\n", " status \\\n", "Odemwingie, Peter Injured \n", "Cisse, Papiss Doubtful \n", "Duff, Michael \n", "Speroni, Julian \n", "Flamini, Mathieu \n", "\n", " description \\\n", "Odemwingie, Peter Forced off during 30/8 game against Man City. ... \n", "Cisse, Papiss Missed the Capital One Cup tie with Tottenham ... \n", "Duff, Michael \n", "Speroni, Julian \n", "Flamini, Mathieu \n", "\n", " returns \n", "Odemwingie, Peter Unknown \n", "Cisse, Papiss 21/12/2014 \n", "Duff, Michael \n", "Speroni, Julian \n", "Flamini, Mathieu " ] } ], "prompt_number": 8 }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] },
{ "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Getting Player Form Information" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Zero defaults; the next cell overwrites them for every parsed player\n",
 "df['month_points'] = 0\n",
 "df['week_points'] = 0"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Downloading the form guide and copying month/week points into df by player name\n",
 "\n",
 "url = 'https://www.dreamteamfc.com/statistics/form-guide/all'\n",
 "r = requests.get(url)\n",
 "r.raise_for_status()  # stop here rather than parse an HTTP error page\n",
 "soup = BeautifulSoup(r.text, 'html5lib')\n",
 "\n",
 "name_list = []\n",
 "skipped = []  # players whose row raised ValueError while being processed\n",
 "\n",
 "for td in soup.find_all('td', {'class': 'tabName'}):\n",
 "    name = td.text.strip()\n",
 "    if name:\n",
 "        name_list.append(name)\n",
 "\n",
 "        # Text of the tag siblings that follow the name cell in this row\n",
 "        res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]\n",
 "        try:\n",
 "            # The last two cells are read as the month and week points\n",
 "            month_pts, week_pts = float(res[-2]), float(res[-1])\n",
 "            df.loc[df.index==name, ['month_points', 'week_points']] = month_pts, week_pts\n",
 "        except ValueError:\n",
 "            # Best effort: leave this player's values untouched, but keep a\n",
 "            # record instead of dropping the problem silently\n",
 "            skipped.append(name)\n",
 "\n",
 "print('Found: %s' % len(name_list))\n",
 "print(name_list[-1])\n",
 "if skipped:\n",
 "    print('Skipped (unparseable form values): %s' % len(skipped))"
 ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Found: 401\n", "O'Brien, Joey\n" ] } ], "prompt_number": 10 },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Reordering the columns\n",
 "# (.copy() makes the selection an independent frame, so the assignment\n",
 "# below does not write into a slice of the previous df)\n",
 "\n",
 "df = df[['name', 'position', 'team', 'vfm', 'value', 'points', 'month_points',\n",
 "         'week_points', 'status', 'description', 'returns']].copy()\n",
 "\n",
 "# \"Normalizing\" player names\n",
 "df['name'] = df['name'].str.lower()\n",
 "\n",
 "df.tail()"
 ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namepositionteamvfmvaluepointsmonth_pointsweek_pointsstatusdescriptionreturns
Odemwingie, Peter odemwingie, peter STR STO 1.20 2.5 3 0 0 Injured Forced off during 30/8 game against Man City. ... Unknown
Cisse, Papiss cisse, papiss STR NEW 17.67 3.0 53 23 0 Doubtful Missed the Capital One Cup tie with Tottenham ... 21/12/2014
Duff, Michael duff, michael DEF BUR 18.00 1.0 18 0 0
Speroni, Julian speroni, julian GK CRY 10.67 1.5 16 6-2
Flamini, Mathieu flamini, mathieu MID ARS 14.00 1.5 21 4 0
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 12, "text": [ " name position team vfm value points \\\n", "Odemwingie, Peter odemwingie, peter STR STO 1.20 2.5 3 \n", "Cisse, Papiss cisse, papiss STR NEW 17.67 3.0 53 \n", "Duff, Michael duff, michael DEF BUR 18.00 1.0 18 \n", "Speroni, Julian speroni, julian GK CRY 10.67 1.5 16 \n", "Flamini, Mathieu flamini, mathieu MID ARS 14.00 1.5 21 \n", "\n", " month_points week_points status \\\n", "Odemwingie, Peter 0 0 Injured \n", "Cisse, Papiss 23 0 Doubtful \n", "Duff, Michael 0 0 \n", "Speroni, Julian 6 -2 \n", "Flamini, Mathieu 4 0 \n", "\n", " description \\\n", "Odemwingie, Peter Forced off during 30/8 game against Man City. ... \n", "Cisse, Papiss Missed the Capital One Cup tie with Tottenham ... \n", "Duff, Michael \n", "Speroni, Julian \n", "Flamini, Mathieu \n", "\n", " returns \n", "Odemwingie, Peter Unknown \n", "Cisse, Papiss 21/12/2014 \n", "Duff, Michael \n", "Speroni, Julian \n", "Flamini, Mathieu " ] } ], "prompt_number": 12 }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "
" ] },
{ "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Saving the Data to CSV" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "[[back to top](#Sections)]" ] },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Getting the current time stamp for the data\n",
 "\n",
 "from datetime import datetime\n",
 "\n",
 "url = 'https://www.dreamteamfc.com/statistics/players/ALL/'\n",
 "r = requests.get(url)\n",
 "r.raise_for_status()  # stop here rather than parse an HTTP error page\n",
 "# Name the parser explicitly so the result does not depend on which\n",
 "# parsers happen to be installed\n",
 "soup = BeautifulSoup(r.text, 'html5lib')\n",
 "\n",
 "update_info = soup.find('li', {'class': 'pointsupdateinfo'})\n",
 "if update_info is None:\n",
 "    raise ValueError('Could not find the pointsupdateinfo element on %s' % url)\n",
 "\n",
 "# The last whitespace-separated token is parsed as a day/month/year date;\n",
 "# keeping the slashes lets non-zero-padded dates parse unambiguously\n",
 "raw_date = update_info.text.split()[-1]\n",
 "d = datetime.strptime(raw_date, '%d/%m/%Y').date()\n",
 "date = d.strftime('%Y%m%d')\n",
 "print(date)"
 ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "20141220\n" ] } ], "prompt_number": 13 },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Writing the table without the index (the name column already holds the player name)\n",
 "df.to_csv('../data/dreamteamfc_%s.csv' % date, index=False)"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 } ], "metadata": {} } ] }