{
 "metadata": {
  "name": "Hacker News Hires Analysis"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import matplotlib.pyplot as plt\n",
      "import pandas as pd\n",
      "from IPython.core import display"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Can we scrape HN? https://news.ycombinator.com/item?id=1721105\n",
      " \n",
      "# A little setup before we get going"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Save-file names (posts_tdm is presumably a term-document matrix -- confirm where it is written).\n",
      "posts_savefile = 'posts.csv'\n",
      "tdm_savefile = 'posts_tdm.csv'\n",
      "\n",
      "# One (year, month, url) entry per month, January 2011 to May 2014.\n",
      "urls = (\n",
      "(2011, 1, 'https://news.ycombinator.com/item?id=2057704'),\n",
      "(2011, 2, 'https://news.ycombinator.com/item?id=2161360'),\n",
      "(2011, 3, 'https://news.ycombinator.com/item?id=2270790'),\n",
      "(2011, 4, 'https://news.ycombinator.com/item?id=2396027'),\n",
      "(2011, 5, 'https://news.ycombinator.com/item?id=2503204'),\n",
      "(2011, 6, 'https://news.ycombinator.com/item?id=2607052'),\n",
      "(2011, 7, 'https://news.ycombinator.com/item?id=2719028'),\n",
      "(2011, 8, 'https://news.ycombinator.com/item?id=2831646'),\n",
      "(2011, 9, 'https://news.ycombinator.com/item?id=2949787'),\n",
      "(2011, 10, 'https://news.ycombinator.com/item?id=3060221'),\n",
      "(2011, 11, 'https://news.ycombinator.com/item?id=3181796'),\n",
      "(2011, 12, 'https://news.ycombinator.com/item?id=3300290'),\n",
      "(2012, 1, 'https://news.ycombinator.com/item?id=3412900'),\n",
      "(2012, 2, 'https://news.ycombinator.com/item?id=3537881'),\n",
      "(2012, 3, 'https://news.ycombinator.com/item?id=3652041'),\n",
      "(2012, 4, 'https://news.ycombinator.com/item?id=3783657'),\n",
      "(2012, 5, 'https://news.ycombinator.com/item?id=3913997'),\n",
      "(2012, 6, 'https://news.ycombinator.com/item?id=4053076'),\n",
      "(2012, 7, 'https://news.ycombinator.com/item?id=4184755'),\n",
      "(2012, 8, 'https://news.ycombinator.com/item?id=4323597'),\n",
      "(2012, 9, 'https://news.ycombinator.com/item?id=4463689'),\n",
      "(2012, 10, 'https://news.ycombinator.com/item?id=4596375'),\n",
      "(2012, 11, 'https://news.ycombinator.com/item?id=4727241'),\n",
      "(2012, 12, 'https://news.ycombinator.com/item?id=4857714'),\n",
      "(2013, 1, 'https://news.ycombinator.com/item?id=4992617'),\n",
      "(2013, 2, 'https://news.ycombinator.com/item?id=5150834'),\n",
      "(2013, 3, 'https://news.ycombinator.com/item?id=5304169'),\n",
      "(2013, 4, 'https://news.ycombinator.com/item?id=5472746'),\n",
      "(2013, 5, 'https://news.ycombinator.com/item?id=5637663'),\n",
      "(2013, 6, 'https://news.ycombinator.com/item?id=5803764'),\n",
      "(2013, 7, 'https://news.ycombinator.com/item?id=5970187'),\n",
      "(2013, 8, 'https://news.ycombinator.com/item?id=6139927'),\n",
      "(2013, 9, 'https://news.ycombinator.com/item?id=6310234'),\n",
      "(2013, 10, 'https://news.ycombinator.com/item?id=6475879'),\n",
      "(2013, 11, 'https://news.ycombinator.com/item?id=6653437'),\n",
      "(2013, 12, 'https://news.ycombinator.com/item?id=6827554'),\n",
      "(2014, 1, 'https://news.ycombinator.com/item?id=6995020'),\n",
      "(2014, 2, 'https://news.ycombinator.com/item?id=7162197'),\n",
      "(2014, 3, 'https://news.ycombinator.com/item?id=7324236'),\n",
      "(2014, 4, 'https://news.ycombinator.com/item?id=7507765'),\n",
      "(2014, 5, 'https://news.ycombinator.com/item?id=7679431')\n",
      ")\n",
      "\n",
      "def filename(year, month, html_dir='html'):\n",
      "    \"\"\"Build the HTML file path for the given year and month.\n",
      "\n",
      "    year, month -- integers (formatted with %d)\n",
      "    html_dir    -- directory part of the path; defaults to 'html'\n",
      "\n",
      "    Example: filename(2011, 1) -> 'html/hn_2011_1.html'\n",
      "    \"\"\"\n",
      "    return '%s/hn_%d_%d.html' % (html_dir, year, month)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Put the urls into a DataFrame (convenient if we later want to save them to CSV).\n",
      "urlsdf = pd.DataFrame(list(urls), columns=['year', 'month', 'url'])\n",
      "urlsdf.head(3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [ "
\n", " | year | \n", "month | \n", "url | \n", "
---|---|---|---|
0 | \n", "2011 | \n", "1 | \n", "https://news.ycombinator.com/item?id=2057704 | \n", "
1 | \n", "2011 | \n", "2 | \n", "https://news.ycombinator.com/item?id=2161360 | \n", "
2 | \n", "2011 | \n", "3 | \n", "https://news.ycombinator.com/item?id=2270790 | \n", "