{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Setup\n",
      "import pattern.web as web\n",
      "import pandas as pd\n",
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "\n",
      "from cs109style import customize_mpl, customize_css\n",
      "customize_mpl()\n",
      "customize_css()\n",
      "%pylab inline"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Example 2: extracting reddit titles, upvotes, downvotes, and submission time\n",
      "\n",
      "### We'll operate in two phases:\n",
      "* first, find all the URLs to comment pages on the first few front pages of reddit.\n",
      "* second, extract information from each comments page"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def get_links_from_front_pages(n):\n",
      "    'find  URLs of comments pages, linked from the n first few pages of reddit'\n",
      "    url = web.URL('http://www.reddit.com/')\n",
      "    comment_pages = []\n",
      "    for page_idx in range(n):\n",
      "        dom = web.DOM(url.download(cached=False))\n",
      "        \n",
      "        ### Extract comments pages\n",
      "        \n",
      "        ### find the next page link - reddit has 25 links per page\n",
      "\n",
      "    # use set() to remove repeated URLs\n",
      "    return list(set(comment_pages))\n",
      "\n",
      "            \n",
      "print len(get_links_from_front_pages(6))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def info_from_comments_pages(links):\n",
      "    'fetch title, upvotes, downvotes, time of submission from a sequence of links'\n",
      "    results = []\n",
      "    for urltext in links:\n",
      "        url = web.URL(urltext)\n",
      "        print \"fetching info for\", url\n",
      "        try:\n",
      "            dom = web.DOM(url.download(cached=False))\n",
      "            \n",
      "            ### Extract title, upvotes, downvotes, submission time\n",
      "            \n",
      "            results.append((title, upvotes, downvotes, pd.to_datetime(time)))\n",
      "        except KeyboardInterrupt:\n",
      "            # allow us to interrupt the kernel and still continue\n",
      "            break\n",
      "        except:\n",
      "            pass  # some things that look like comment pages don't have the information above\n",
      "    return results"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "comments_pages = get_links_from_front_pages(5)\n",
      "print \"Fetching info for\", len(comments_pages), \"pages\"\n",
      "pages = info_from_comments_pages(comments_pages)\n",
      "titles, upvotes, downvotes, dates = zip(*pages)  # zip(*seq) transposes a sequence of sequences.\n",
      "df = pd.DataFrame({'title' : titles, 'upvotes' : upvotes, 'downvotes' : downvotes, 'date' : dates}, index=dates)\n",
      "print df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df.sort('date', inplace=True)\n",
      "df['upvotes'].plot(c='g')\n",
      "df['downvotes'].plot(c='r')\n",
      "(df['upvotes'] - df['downvotes']).plot(c='k')\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}