# %% [markdown]
# # Scraping all expert draft rankings
#
# Start by scraping the FP expert list, then use that list to scrape each expert's rankings.

# %%
import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

# %%
# Date stamp (YYYY_MM_DD) embedded in the output file names.
TODAYS_DATE = datetime.date.today().strftime("%Y_%m_%d")
# Scoring format to scrape; must be a key of CHEATSHEET_URLS below.
SCORING_TYPE = 'ppr'
# Seconds to wait for the expert-list page before giving up (requests has no default timeout).
REQUEST_TIMEOUT_SECONDS = 30

# %% [markdown]
# ## Scrape the FP expert list

# %%
# One cheat-sheet page per supported scoring format.
CHEATSHEET_URLS = {
    'ppr': 'https://www.fantasypros.com/nfl/rankings/ppr-cheatsheets.php',
    'standard': 'https://www.fantasypros.com/nfl/rankings/consensus-cheatsheets.php',
}
if SCORING_TYPE not in CHEATSHEET_URLS:
    # Fail with a clear message instead of a NameError on `fp_url` further down.
    raise ValueError(
        "Unsupported SCORING_TYPE {!r}; expected one of {}".format(
            SCORING_TYPE, sorted(CHEATSHEET_URLS)))
fp_url = CHEATSHEET_URLS[SCORING_TYPE]
r = requests.get(fp_url, timeout=REQUEST_TIMEOUT_SECONDS)
# Stop on an HTTP error rather than parsing an error page as if it were the expert list.
r.raise_for_status()

# %%
soup = BeautifulSoup(r.text, 'lxml')
expert_table = soup.find('table', {'id': 'experts'})
if expert_table is None:
    # soup.find returns None when nothing matches; report that here rather than
    # letting the row-parsing cell fail with an AttributeError on None.
    raise RuntimeError("No <table id='experts'> found at {}".format(fp_url))
# %%
def _clean_rank(text):
    """Return a rank cell's text without surrounding whitespace or '#' markers."""
    # Trim whitespace first: str.strip('#') only removes '#' at the very ends of
    # the string, so a padded value such as ' #12 ' would otherwise keep its '#'.
    return text.strip().strip('#').strip()


# Build one dict per expert from the rows of the experts table.
experts = []
rows = expert_table.find_all('tr')
for tr in rows:
    columns = tr.find_all('td')
    if not columns:
        # Rows without <td> cells carry no expert data.
        continue
    experts.append({
        # The first cell holds an <input> whose value is the expert id.
        'expert_id': columns[0].find('input').get('value'),
        'name': columns[1].text.strip(),
        'source': columns[2].text.strip(),
        'in_season_rank': _clean_rank(columns[3].text),
        'draft_rank': _clean_rank(columns[4].text),
        # Kept verbatim here; the date text is cleaned up once it is in a DataFrame.
        'date': columns[5].text.strip(),
    })
print(len(experts))
print(experts[:5])
# %%
expert_df = pd.DataFrame(experts)
print(expert_df.head())

# %% [markdown]
# The scraped date field has a short display date fused onto a timestamp
# (e.g. `2017-08-23 12:00:008/23`), so keep only the leading `YYYY-MM-DD` part.

# %%
# Everything before the first space is the calendar date.
expert_df['date'] = expert_df['date'].str.split(' ').str.get(0)
print(expert_df.head())

# %%
# Imported here so this cell is self-contained; the notebook's import cell does not provide `os`.
import os

expert_list_file = 'data/fp_experts_{}_{}.tsv'.format(SCORING_TYPE, TODAYS_DATE)
# to_csv does not create missing directories, so make sure the output folder exists
# (otherwise a fresh checkout fails on Restart & Run All).
os.makedirs(os.path.dirname(expert_list_file), exist_ok=True)
expert_df.to_csv(expert_list_file, sep='\t', index=False)

# %% [markdown]
# ## Scrape each individual FP expert ranking

# %%
expert_ids = expert_df['expert_id'].tolist()
print(len(expert_ids))

# %% [markdown]
# Here's the function we'll use to scrape each expert's ranking.
# %%
def get_expert_rankings(expert_id, score_type, year="2017", timeout=30):
    """Scrape one expert's rankings from the FantasyPros staff-rankings widget.

    Parameters
    ----------
    expert_id : str
        Expert identifier; sent as the widget's ``id`` query parameter and
        copied into every returned row.
    score_type : str
        Scoring format; sent as the ``scoring`` query parameter.
    year : str, optional
        Season sent as the ``year`` query parameter. Defaults to ``"2017"``.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``. Defaults to 30.

    Returns
    -------
    list of dict
        One dict per parsed table row with keys ``rank`` (int), ``player_name``,
        ``position``, ``team`` and ``expert_id``. The list is empty when the
        response contains no ``<table>``.
    """
    payload = {
        "source": '2',
        "id": expert_id,
        "year": year,
        "position": "ALL",
        # NOTE(review): the original comment here said "PPR or STD", while this
        # notebook passes SCORING_TYPE ('ppr') unchanged - confirm what the widget accepts.
        "scoring": score_type,
        "week": "0",
        "ajax": "true",
    }
    widget_url = "https://partners.fantasypros.com/external/widget/nfl-staff-rankings.php"
    # Without a timeout a stalled connection would block the whole scrape indefinitely.
    r = requests.get(widget_url, params=payload, timeout=timeout)

    soup = BeautifulSoup(r.content, 'lxml')
    ranking_table = soup.find('table')
    return_rows = []
    if not ranking_table:
        # No table in the response: nothing to report for this expert.
        return return_rows

    for tr in ranking_table.find_all('tr'):
        columns = tr.find_all('td')
        if len(columns) < 2:
            # Both the rank cell and the player cell are needed; skip anything shorter.
            continue
        rank = int(columns[0].text.strip())
        player_name = columns[1].find('a').text
        rest = columns[1].find('small').text
        # The text before the first comma is split on " - " into position and team.
        # Testing for that exact separator on that exact piece (rather than for a
        # bare "-" anywhere in the text) keeps the check and the split in agreement,
        # so a stray hyphen elsewhere can no longer break the unpacking.
        position, separator, team = rest.split(',')[0].partition(" - ")
        if not separator:
            # No "position - team" pair: treat the row as a team defense and use
            # the player name as the team.
            position, team = 'D/ST', player_name
        return_rows.append({
            'rank': rank,
            'player_name': player_name,
            'position': position,
            'team': team,
            'expert_id': expert_id,
        })
    return return_rows

# %%
# Smoke test on a single expert before scraping everyone.
test_ranks = get_expert_rankings('7', SCORING_TYPE)
print(test_ranks[:3])

# %%
from tqdm import tqdm_notebook

# %%
# Fetch every expert's rankings in turn and pool the rows into one flat list.
all_ranks = []
for expert_id in tqdm_notebook(expert_ids):
    all_ranks.extend(get_expert_rankings(expert_id, SCORING_TYPE))

# %%
rankings_df = pd.DataFrame(all_ranks)
print(rankings_df.head())
print(len(rankings_df))

# %% [markdown]
# Across the 87 expert rankings, we've managed to put together a dataset of
# 21,793 player/expert/rank observations.

# %%
expert_rankings_file = 'data/fp_rankings_{}_{}.tsv'.format(SCORING_TYPE, TODAYS_DATE)
rankings_df.to_csv(expert_rankings_file, sep='\t', index=False)