{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scraping all expert draft rankings\n",
    "\n",
    "Start out by first scraping the FP expert list, and then using that list to scrape each expert's rankings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "TODAYS_DATE = datetime.date.today().strftime(\"%Y_%m_%d\")\n",
    "SCORING_TYPE = 'ppr'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scrape the FP expert list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "if SCORING_TYPE == 'ppr':\n",
    "    fp_url = 'https://www.fantasypros.com/nfl/rankings/ppr-cheatsheets.php'\n",
    "elif SCORING_TYPE == 'standard':\n",
    "    fp_url = 'https://www.fantasypros.com/nfl/rankings/consensus-cheatsheets.php'\n",
    "r = requests.get(fp_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "soup = BeautifulSoup(r.text,'lxml')\n",
    "expert_table = soup.find('table',{'id':'experts'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "87\n",
      "[{'expert_id': '3', 'source': 'ESPN', 'draft_rank': '108', 'in_season_rank': '104', 'name': 'Eric Karabell', 'date': '2017-08-23 12:00:008/23'}, {'expert_id': '5', 'source': 'ESPN', 'draft_rank': '', 'in_season_rank': '', 'name': 'Staff Composite', 'date': '2017-08-23 12:00:008/23'}, {'expert_id': '7', 'source': 'Yahoo! Sports', 'draft_rank': '69', 'in_season_rank': '65', 'name': 'Andy Behrens', 'date': '2017-08-25 10:51:138/25'}, {'expert_id': '9', 'source': 'Yahoo! Sports', 'draft_rank': '28', 'in_season_rank': '66', 'name': 'Scott Pianowski', 'date': '2017-08-25 02:21:318/24'}, {'expert_id': '15', 'source': 'ScoutFantasy', 'draft_rank': '76', 'in_season_rank': '8', 'name': 'Staff Rankings', 'date': '2017-08-23 07:47:338/23'}]\n"
     ]
    }
   ],
   "source": [
    "experts = []\n",
    "rows = expert_table.findAll('tr')\n",
    "for tr in rows:\n",
    "    columns = tr.findAll('td')\n",
    "    if len(columns) > 0:\n",
    "        expert = {}\n",
    "        expert['expert_id'] = columns[0].find('input').get('value')\n",
    "        expert['name'] = columns[1].text.strip()\n",
    "        expert['source'] = columns[2].text.strip()\n",
    "        expert['in_season_rank'] = columns[3].text.strip('#').strip()\n",
    "        expert['draft_rank'] = columns[4].text.strip('#').strip()\n",
    "        expert['date'] = columns[5].text.strip()\n",
    "        experts.append(expert)\n",
    "print(len(experts))\n",
    "print(experts[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                      date draft_rank expert_id in_season_rank  \\\n",
      "0  2017-08-23 12:00:008/23        108         3            104   \n",
      "1  2017-08-23 12:00:008/23                    5                  \n",
      "2  2017-08-25 10:51:138/25         69         7             65   \n",
      "3  2017-08-25 02:21:318/24         28         9             66   \n",
      "4  2017-08-23 07:47:338/23         76        15              8   \n",
      "\n",
      "              name         source  \n",
      "0    Eric Karabell           ESPN  \n",
      "1  Staff Composite           ESPN  \n",
      "2     Andy Behrens  Yahoo! Sports  \n",
      "3  Scott Pianowski  Yahoo! Sports  \n",
      "4   Staff Rankings   ScoutFantasy  \n"
     ]
    }
   ],
   "source": [
    "expert_df = pd.DataFrame(experts)\n",
    "print(expert_df.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Need to clean up that messy date field"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         date draft_rank expert_id in_season_rank             name  \\\n",
      "0  2017-08-23        108         3            104    Eric Karabell   \n",
      "1  2017-08-23                    5                 Staff Composite   \n",
      "2  2017-08-25         69         7             65     Andy Behrens   \n",
      "3  2017-08-25         28         9             66  Scott Pianowski   \n",
      "4  2017-08-23         76        15              8   Staff Rankings   \n",
      "\n",
      "          source  \n",
      "0           ESPN  \n",
      "1           ESPN  \n",
      "2  Yahoo! Sports  \n",
      "3  Yahoo! Sports  \n",
      "4   ScoutFantasy  \n"
     ]
    }
   ],
   "source": [
    "expert_df['date'] = expert_df['date'].str.split(' ').str.get(0)\n",
    "print(expert_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "expert_list_file = 'data/fp_experts_{}_{}.tsv'.format(SCORING_TYPE, TODAYS_DATE)\n",
    "expert_df.to_csv(expert_list_file, sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scrape each individual FP expert ranking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "87\n"
     ]
    }
   ],
   "source": [
    "expert_ids = expert_df['expert_id'].tolist()\n",
    "print(len(expert_ids))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's the function we'll use to scrape each expert's ranking."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_expert_rankings(expert_id,score_type):\n",
    "    return_rows = []\n",
    "    \n",
    "    payload = {\"source\":'2',\n",
    "               \"id\":expert_id,\n",
    "              \"year\":\"2017\",\n",
    "               \"position\":\"ALL\",\n",
    "               \"scoring\":score_type, #PPR or STD\n",
    "              \"week\":\"0\",\n",
    "              \"ajax\":\"true\"}\n",
    "    widget_url = \"https://partners.fantasypros.com/external/widget/nfl-staff-rankings.php\"\n",
    "    r = requests.get(widget_url, params=payload)\n",
    "    \n",
    "    soup = BeautifulSoup(r.content,'lxml')\n",
    "    ranking_table = soup.find('table')\n",
    "    if ranking_table:\n",
    "        rows = ranking_table.findAll('tr')\n",
    "        for tr in rows:\n",
    "            columns = tr.findAll('td')\n",
    "            if len(columns) > 0:\n",
    "                return_row = {}\n",
    "                return_row['rank'] = int(columns[0].text.strip())\n",
    "                return_row['player_name'] = columns[1].find('a').text\n",
    "                rest = columns[1].find('small').text\n",
    "                if \"-\" in rest:\n",
    "                    rest_split = rest.split(',')\n",
    "                    return_row['position'], return_row['team'] = rest_split[0].split(\" - \")\n",
    "                else:\n",
    "                    return_row['position'] = 'D/ST'\n",
    "                    return_row['team'] = return_row['player_name']\n",
    "                return_row['expert_id'] = expert_id\n",
    "                return_rows.append(return_row)\n",
    "    return return_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'player_name': 'David Johnson', 'expert_id': '7', 'position': 'RB', 'rank': 1, 'team': 'ARI'}, {'player_name': \"Le'Veon Bell\", 'expert_id': '7', 'position': 'RB', 'rank': 2, 'team': 'PIT'}, {'player_name': 'Antonio Brown', 'expert_id': '7', 'position': 'WR', 'rank': 3, 'team': 'PIT'}]\n"
     ]
    }
   ],
   "source": [
    "test_ranks = get_expert_rankings('7',SCORING_TYPE)\n",
    "print(test_ranks[:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from tqdm import tqdm_notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "all_ranks = []\n",
    "for expert_id in tqdm_notebook(expert_ids):\n",
    "    all_ranks += get_expert_rankings(expert_id, SCORING_TYPE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  expert_id        player_name position  rank team\n",
      "0         3      David Johnson       RB     1  ARI\n",
      "1         3       Le'Veon Bell       RB     2  PIT\n",
      "2         3      Antonio Brown       WR     3  PIT\n",
      "3         3  Odell Beckham Jr.       WR     4  NYG\n",
      "4         3         Mike Evans       WR     5   TB\n",
      "21793\n"
     ]
    }
   ],
   "source": [
    "rankings_df = pd.DataFrame(all_ranks)\n",
    "print(rankings_df.head())\n",
    "print(len(rankings_df))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Across the 87 expert rankings, we've managed to put together a dataset of 21,793 player/expert/rank observations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "expert_rankings_file = 'data/fp_rankings_{}_{}.tsv'.format(SCORING_TYPE,TODAYS_DATE)\n",
    "rankings_df.to_csv(expert_rankings_file, sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}