{
"metadata": {
"name": "",
"signature": "sha256:ec4d60d620054da9715a81f0cf0cefa1ce95822e86978f66d727bb384d3c7204"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"%load_ext watermark\n",
"%watermark -a 'Sebastian Raschka' -v"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Sebastian Raschka \n",
"\n",
"CPython 3.4.2\n",
"IPython 2.3.1\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Parsing data from dreamteamfc.com"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Sections"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- [Getting General Player Statistics](#Getting-General-Player-Statistics)\n",
"- [Getting Injuries and Cards Information](#Getting-Injuries-and-Cards-Information)\n",
"- [Getting Player Form Information](#Getting-Player-Form-Information)\n",
"- [Saving the Data to CSV](#Saving-the-Data-to-CSV)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"
"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Getting General Player Statistics"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[[back to top](#Sections)]"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"from bs4 import BeautifulSoup\n",
"import bs4\n",
"import requests"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Downloading and parsing the data into a Python dict\n",
"\n",
"player_dict = {}\n",
"\n",
"url = 'https://www.dreamteamfc.com/statistics/players/ALL/'\n",
"r = requests.get(url)\n",
"soup = BeautifulSoup(r.text, 'html5lib') \n",
"# Note: html5lib deals better with broken html than lxml\n",
"\n",
"name_list = []\n",
"\n",
"for td in soup.findAll(\"td\", { \"class\" : \"tabName\" }):\n",
" name = td.text.split('Statistics')[-1].strip()\n",
" if name:\n",
" name_list.append(name)\n",
" res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]\n",
" position, team, vfm, value, points = res\n",
" value = value.strip('m')\n",
" player_dict[name] = [name, position, team, vfm, value, points]\n",
" \n",
"print('Found: %s' % len(name_list))\n",
"print(name_list[-1])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Found: 401\n",
"O'Brien, Joey\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Reading the data into a pandas DataFrame\n",
"\n",
"df = pd.DataFrame.from_dict(player_dict, orient='index')\n",
"df.columns = ['name', 'position', 'team', 'vfm', 'value', 'points']\n",
"df[['vfm','value']] = df[['vfm','value']].astype(float)\n",
"df[['points']] = df[['points']].astype(int)\n",
"df.tail()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" position | \n",
" team | \n",
" vfm | \n",
" value | \n",
" points | \n",
"
\n",
" \n",
" \n",
" \n",
" Odemwingie, Peter | \n",
" Odemwingie, Peter | \n",
" STR | \n",
" STO | \n",
" 1.20 | \n",
" 2.5 | \n",
" 3 | \n",
"
\n",
" \n",
" Cisse, Papiss | \n",
" Cisse, Papiss | \n",
" STR | \n",
" NEW | \n",
" 17.67 | \n",
" 3.0 | \n",
" 53 | \n",
"
\n",
" \n",
" Duff, Michael | \n",
" Duff, Michael | \n",
" DEF | \n",
" BUR | \n",
" 18.00 | \n",
" 1.0 | \n",
" 18 | \n",
"
\n",
" \n",
" Speroni, Julian | \n",
" Speroni, Julian | \n",
" GK | \n",
" CRY | \n",
" 10.67 | \n",
" 1.5 | \n",
" 16 | \n",
"
\n",
" \n",
" Flamini, Mathieu | \n",
" Flamini, Mathieu | \n",
" MID | \n",
" ARS | \n",
" 14.00 | \n",
" 1.5 | \n",
" 21 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
" name position team vfm value points\n",
"Odemwingie, Peter Odemwingie, Peter STR STO 1.20 2.5 3\n",
"Cisse, Papiss Cisse, Papiss STR NEW 17.67 3.0 53\n",
"Duff, Michael Duff, Michael DEF BUR 18.00 1.0 18\n",
"Speroni, Julian Speroni, Julian GK CRY 10.67 1.5 16\n",
"Flamini, Mathieu Flamini, Mathieu MID ARS 14.00 1.5 21"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" vfm | \n",
" value | \n",
" points | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 401.000000 | \n",
" 401.000000 | \n",
" 401.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 10.083416 | \n",
" 2.770574 | \n",
" 26.705736 | \n",
"
\n",
" \n",
" std | \n",
" 9.518409 | \n",
" 1.416327 | \n",
" 25.338867 | \n",
"
\n",
" \n",
" min | \n",
" -12.000000 | \n",
" 1.000000 | \n",
" -12.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 2.670000 | \n",
" 1.500000 | \n",
" 7.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 8.500000 | \n",
" 2.500000 | \n",
" 21.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 14.000000 | \n",
" 3.500000 | \n",
" 37.000000 | \n",
"
\n",
" \n",
" max | \n",
" 88.000000 | \n",
" 7.500000 | \n",
" 155.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
" vfm value points\n",
"count 401.000000 401.000000 401.000000\n",
"mean 10.083416 2.770574 26.705736\n",
"std 9.518409 1.416327 25.338867\n",
"min -12.000000 1.000000 -12.000000\n",
"25% 2.670000 1.500000 7.000000\n",
"50% 8.500000 2.500000 21.000000\n",
"75% 14.000000 3.500000 37.000000\n",
"max 88.000000 7.500000 155.000000"
]
}
],
"prompt_number": 5
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"
"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Getting Injuries and Cards Information"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[[back to top](#Sections)]"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df['status'] = pd.Series('', index=df.index)\n",
"df['description'] = pd.Series('', index=df.index)\n",
"df['returns'] = pd.Series('', index=df.index)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"url = 'https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/'\n",
"r = requests.get(url)\n",
"soup = BeautifulSoup(r.text, 'html5lib')\n",
"\n",
"name_list = []\n",
"\n",
"for td in soup.findAll(\"td\", { \"class\" : \"tabName2\" }):\n",
" name = td.text.split('stats')[-1].strip()\n",
" if name:\n",
" name_list.append(name)\n",
" res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]\n",
" position, team, status, description, returns = res\n",
" df.loc[df.index==name,['status', 'description', 'returns']] = status, description, returns\n",
" \n",
"print('Found: %s' % len(name_list))\n",
"print(name_list[-1])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Found: 84\n",
"Tadic, Dusan\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.tail()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" position | \n",
" team | \n",
" vfm | \n",
" value | \n",
" points | \n",
" status | \n",
" description | \n",
" returns | \n",
"
\n",
" \n",
" \n",
" \n",
" Odemwingie, Peter | \n",
" Odemwingie, Peter | \n",
" STR | \n",
" STO | \n",
" 1.20 | \n",
" 2.5 | \n",
" 3 | \n",
" Injured | \n",
" Forced off during 30/8 game against Man City. ... | \n",
" Unknown | \n",
"
\n",
" \n",
" Cisse, Papiss | \n",
" Cisse, Papiss | \n",
" STR | \n",
" NEW | \n",
" 17.67 | \n",
" 3.0 | \n",
" 53 | \n",
" Doubtful | \n",
" Missed the Capital One Cup tie with Tottenham ... | \n",
" 21/12/2014 | \n",
"
\n",
" \n",
" Duff, Michael | \n",
" Duff, Michael | \n",
" DEF | \n",
" BUR | \n",
" 18.00 | \n",
" 1.0 | \n",
" 18 | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" Speroni, Julian | \n",
" Speroni, Julian | \n",
" GK | \n",
" CRY | \n",
" 10.67 | \n",
" 1.5 | \n",
" 16 | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" Flamini, Mathieu | \n",
" Flamini, Mathieu | \n",
" MID | \n",
" ARS | \n",
" 14.00 | \n",
" 1.5 | \n",
" 21 | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
" name position team vfm value points \\\n",
"Odemwingie, Peter Odemwingie, Peter STR STO 1.20 2.5 3 \n",
"Cisse, Papiss Cisse, Papiss STR NEW 17.67 3.0 53 \n",
"Duff, Michael Duff, Michael DEF BUR 18.00 1.0 18 \n",
"Speroni, Julian Speroni, Julian GK CRY 10.67 1.5 16 \n",
"Flamini, Mathieu Flamini, Mathieu MID ARS 14.00 1.5 21 \n",
"\n",
" status \\\n",
"Odemwingie, Peter Injured \n",
"Cisse, Papiss Doubtful \n",
"Duff, Michael \n",
"Speroni, Julian \n",
"Flamini, Mathieu \n",
"\n",
" description \\\n",
"Odemwingie, Peter Forced off during 30/8 game against Man City. ... \n",
"Cisse, Papiss Missed the Capital One Cup tie with Tottenham ... \n",
"Duff, Michael \n",
"Speroni, Julian \n",
"Flamini, Mathieu \n",
"\n",
" returns \n",
"Odemwingie, Peter Unknown \n",
"Cisse, Papiss 21/12/2014 \n",
"Duff, Michael \n",
"Speroni, Julian \n",
"Flamini, Mathieu "
]
}
],
"prompt_number": 8
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"
"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Getting Player Form Information"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[[back to top](#Sections)]"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df['month_points'] = pd.Series(0, index=df.index)\n",
"df['week_points'] = pd.Series(0, index=df.index)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"url = 'https://www.dreamteamfc.com/statistics/form-guide/all'\n",
"r = requests.get(url)\n",
"soup = BeautifulSoup(r.text, 'html5lib')\n",
"\n",
"name_list = []\n",
"\n",
"for td in soup.findAll(\"td\", { \"class\" : \"tabName\" }):\n",
" name = td.text.strip()\n",
" if name:\n",
" name_list.append(name)\n",
" \n",
" res = [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]\n",
" try:\n",
" month_pts, week_pts = float(res[-2]), float(res[-1])\n",
" df.loc[df.index==name, ['month_points', 'week_points']] = month_pts, week_pts\n",
" except ValueError:\n",
" pass\n",
" \n",
"print('Found: %s' % len(name_list))\n",
"print(name_list[-1])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Found: 401\n",
"O'Brien, Joey\n"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Reordering the columns\n",
"\n",
"df = df[['name', 'position', 'team', 'vfm', 'value', 'points', 'month_points', \n",
" 'week_points', 'status', 'description', 'returns']]\n",
"\n",
"# \"Normalizing\" player names\n",
"df['name'] = df['name'].apply(lambda x: x.lower())\n",
"\n",
"df.tail()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" position | \n",
" team | \n",
" vfm | \n",
" value | \n",
" points | \n",
" month_points | \n",
" week_points | \n",
" status | \n",
" description | \n",
" returns | \n",
"
\n",
" \n",
" \n",
" \n",
" Odemwingie, Peter | \n",
" odemwingie, peter | \n",
" STR | \n",
" STO | \n",
" 1.20 | \n",
" 2.5 | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" Injured | \n",
" Forced off during 30/8 game against Man City. ... | \n",
" Unknown | \n",
"
\n",
" \n",
" Cisse, Papiss | \n",
" cisse, papiss | \n",
" STR | \n",
" NEW | \n",
" 17.67 | \n",
" 3.0 | \n",
" 53 | \n",
" 23 | \n",
" 0 | \n",
" Doubtful | \n",
" Missed the Capital One Cup tie with Tottenham ... | \n",
" 21/12/2014 | \n",
"
\n",
" \n",
" Duff, Michael | \n",
" duff, michael | \n",
" DEF | \n",
" BUR | \n",
" 18.00 | \n",
" 1.0 | \n",
" 18 | \n",
" 0 | \n",
" 0 | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" Speroni, Julian | \n",
" speroni, julian | \n",
" GK | \n",
" CRY | \n",
" 10.67 | \n",
" 1.5 | \n",
" 16 | \n",
" 6 | \n",
" -2 | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" Flamini, Mathieu | \n",
" flamini, mathieu | \n",
" MID | \n",
" ARS | \n",
" 14.00 | \n",
" 1.5 | \n",
" 21 | \n",
" 4 | \n",
" 0 | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
" name position team vfm value points \\\n",
"Odemwingie, Peter odemwingie, peter STR STO 1.20 2.5 3 \n",
"Cisse, Papiss cisse, papiss STR NEW 17.67 3.0 53 \n",
"Duff, Michael duff, michael DEF BUR 18.00 1.0 18 \n",
"Speroni, Julian speroni, julian GK CRY 10.67 1.5 16 \n",
"Flamini, Mathieu flamini, mathieu MID ARS 14.00 1.5 21 \n",
"\n",
" month_points week_points status \\\n",
"Odemwingie, Peter 0 0 Injured \n",
"Cisse, Papiss 23 0 Doubtful \n",
"Duff, Michael 0 0 \n",
"Speroni, Julian 6 -2 \n",
"Flamini, Mathieu 4 0 \n",
"\n",
" description \\\n",
"Odemwingie, Peter Forced off during 30/8 game against Man City. ... \n",
"Cisse, Papiss Missed the Capital One Cup tie with Tottenham ... \n",
"Duff, Michael \n",
"Speroni, Julian \n",
"Flamini, Mathieu \n",
"\n",
" returns \n",
"Odemwingie, Peter Unknown \n",
"Cisse, Papiss 21/12/2014 \n",
"Duff, Michael \n",
"Speroni, Julian \n",
"Flamini, Mathieu "
]
}
],
"prompt_number": 12
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"
"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Saving the Data to CSV"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[[back to top](#Sections)]"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Getting the current time stamp for the data\n",
"\n",
"from datetime import datetime\n",
"\n",
"url = 'https://www.dreamteamfc.com/statistics/players/ALL/'\n",
"r = requests.get(url)\n",
"data = r.text\n",
"soup = BeautifulSoup(data)\n",
"\n",
"raw_date = soup.find('li', {'class' : 'pointsupdateinfo' }).text\n",
"raw_date = raw_date.split()[-1].replace('/', '').strip()\n",
"d = datetime.strptime(raw_date, '%d%m%Y').date()\n",
"date = d.strftime('%Y%m%d')\n",
"print(date)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"20141220\n"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.to_csv('../data/dreamteamfc_%s.csv' % date, index=False)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 14
}
],
"metadata": {}
}
]
}