{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Setup\n",
      "import pattern.web as web\n",
      "import pandas as pd\n",
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "\n",
      "from cs109style import customize_mpl, customize_css\n",
      "customize_mpl()\n",
      "customize_css()\n",
      "%pylab inline"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Example 2: extracting reddit titles, upvotes, downvotes, and submission time\n",
      "\n",
      "### We'll operate in two phases:\n",
      "* first, find all the URLs to comment pages on the first few front pages of reddit.\n",
      "* second, extract information from each comments page"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def get_links_from_front_pages(n):\n",
      "    '''Find URLs of comments pages linked from the first n front pages of reddit.\n",
      "\n",
      "    n -- number of front pages to download, starting at http://www.reddit.com/\n",
      "\n",
      "    Returns a list of unique URLs; duplicates are dropped with set(), so the\n",
      "    order of the result is not meaningful.\n",
      "\n",
      "    Exercise: the two ### placeholders below still have to be filled in. Until\n",
      "    they are, url is never advanced to the next page and nothing is appended to\n",
      "    comment_pages, so the same page is downloaded n times and [] is returned.\n",
      "    '''\n",
      "    url = web.URL('http://www.reddit.com/')\n",
      "    comment_pages = []\n",
      "    for page_idx in range(n):\n",
      "        dom = web.DOM(url.download(cached=False))\n",
      "\n",
      "        ### Extract comments pages\n",
      "\n",
      "        ### find the next page link - reddit has 25 links per page\n",
      "\n",
      "    # use set() to remove repeated URLs\n",
      "    return list(set(comment_pages))\n",
      "\n",
      "\n",
      "print len(get_links_from_front_pages(6))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def info_from_comments_pages(links):\n",
      "    '''Fetch title, upvotes, downvotes, time of submission from a sequence of links.\n",
      "\n",
      "    links -- sequence of comments-page URLs\n",
      "\n",
      "    Returns a list of (title, upvotes, downvotes, submission_time) tuples, one\n",
      "    per page that could be processed; the time is passed through pd.to_datetime.\n",
      "    Pages that raise an error are reported and skipped. Interrupting the kernel\n",
      "    stops the loop early and returns what has been collected so far.\n",
      "    '''\n",
      "    results = []\n",
      "    for urltext in links:\n",
      "        url = web.URL(urltext)\n",
      "        print \"fetching info for\", url\n",
      "        try:\n",
      "            dom = web.DOM(url.download(cached=False))\n",
      "\n",
      "            ### Extract title, upvotes, downvotes, submission time\n",
      "\n",
      "            results.append((title, upvotes, downvotes, pd.to_datetime(time)))\n",
      "        except KeyboardInterrupt:\n",
      "            # allow us to interrupt the kernel and still continue\n",
      "            break\n",
      "        except Exception as err:\n",
      "            # some things that look like comment pages don't have the information\n",
      "            # above; report the skip instead of hiding it, so that genuine bugs\n",
      "            # (e.g. a NameError while the placeholder is unfilled) stay visible\n",
      "            print 'skipping', url, '-', repr(err)\n",
      "    return results"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "comments_pages = get_links_from_front_pages(5)\n",
      "print \"Fetching info for\", len(comments_pages), \"pages\"\n",
      "pages = info_from_comments_pages(comments_pages)\n",
      "# with no results, zip(*pages) has nothing to unpack; fail with a readable message instead\n",
      "assert pages, 'no comments pages could be parsed - check the extraction code above'\n",
      "titles, upvotes, downvotes, dates = zip(*pages)  # zip(*seq) transposes a sequence of sequences.\n",
      "df = pd.DataFrame({'title' : titles, 'upvotes' : upvotes, 'downvotes' : downvotes, 'date' : dates}, index=dates)\n",
      "print df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df.sort('date', inplace=True)\n",
      "# label every line so the colours can be told apart in the legend\n",
      "df['upvotes'].plot(c='g', label='upvotes')\n",
      "df['downvotes'].plot(c='r', label='downvotes')\n",
      "(df['upvotes'] - df['downvotes']).plot(c='k', label='upvotes - downvotes')\n",
      "plt.xlabel('submission time')\n",
      "plt.ylabel('votes')\n",
      "plt.legend(loc='best');\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}