{ "metadata": { "name": "pageview-scraper" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "#!/usr/bin/env python\n", "\n", "'''\n", "get_pageview_by_day.py - batch download wiki pageview history\n", " Usage: python get_pageview_by_day.py -s -e -d \n", " E.g. python get_pageview_by_day.py -s 20130410 -e 20130417 -d /Data/\n", " The parameters will be set as default if not given.\n", " \n", "@author: Brian Keegan and Yu-Ru Lin \n", "@contact: bkeegan@gmail.com and yuruliny@gmail.com \n", "@date: October 22, 2013\n", "\n", "'''\n", "\n", "import os,urllib2,hashlib\n", "from bs4 import BeautifulSoup\n", "from dateutil.relativedelta import relativedelta\n", "from datetime import datetime, timedelta\n", "\n", "datapath = '/scratch/yuru/data/wiki/pagecounts_by_day'\n", "if not os.path.exists(datapath): os.system('mkdir -p %s'%datapath)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 187 }, { "cell_type": "code", "collapsed": false, "input": [ "# From http://stackoverflow.com/questions/4028697/how-do-i-download-a-zip-file-in-python-using-urllib2/4028728\n", "def get_file(url,filepath,md5_dict):\n", " # Open the url\n", " try:\n", " f = urllib2.urlopen(url)\n", " #print \"downloading \" + url\n", "\n", " # Open our local file for writing\n", " with open(filepath+os.path.basename(url), \"wb\") as local_file:\n", " read_file = f.read()\n", " file_md5 = hashlib.md5(read_file).hexdigest()\n", " if file_md5 == md5_dict[os.path.basename(url)]:\n", " local_file.write(read_file)\n", " else:\n", " raise ValueError('MD5 checksum on {0} does not match master.'.format(os.path.basename(url)))\n", "\n", " #handle errors\n", " except urllib2.HTTPError, e:\n", " print \"HTTP Error:\", e.code, url\n", " except urllib2.URLError, e:\n", " print \"URL Error:\", e.reason, url" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 180 }, { "cell_type": "code", "collapsed": false, "input": [ "def get_dumps(start,end,datapath=os.getcwd()+'/Data/'):\n", " start_dt = datetime.strptime(start,'%Y%m%d')\n", " end_dt = datetime.strptime(end,'%Y%m%d')\n", " url_base = 'http://dumps.wikimedia.org/other/pagecounts-raw/'\n", " \n", " if not os.path.exists(datapath): \n", " os.system('mkdir -p {0}'.format(datapath))\n", " \n", " # Enforce time boundaries\n", " if start_dt < datetime(2007,12,10,0,0,0):\n", " raise ValueError('Time range must be after 10 Dec 2007')\n", " elif end_dt > datetime.today():\n", " raise ValueError('Time range must end before today') \n", " \n", " index = start_dt\n", " while index < end_dt:\n", " year = datetime.strftime(index,'%Y')\n", " month = datetime.strftime(index,'%m')\n", " \n", " # Make md5 checksum dictionary that can be checked after downloading files below\n", " md5_url = url_base + '{0}/{0}-{1}/'.format(year,month) + 'md5sums.txt'\n", " md5s = urllib2.urlopen(md5_url).readlines()\n", " md5_dict = dict([i.strip().split(' ') for i in md5s if 'pagecounts' in i])\n", " md5_dict = {v:k for k,v in md5_dict.iteritems()}\n", " \n", " # Can't pass clean filenames since seconds field varies, scrape filenames instead\n", " url = url_base + '{0}/{0}-{1}/'.format(year,month)\n", " soup = BeautifulSoup(urllib2.urlopen(url))\n", " pagecount_links = [link.get('href') for link in soup.findAll('a') if 'pagecounts' in link.get('href')]\n", " \n", " # Return list of links in valid timespan\n", " valid_links = [datetime.strptime(i,'pagecounts-%Y%m%d-%H%M%S.gz') for i in pagecount_links]\n", " 
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def get_dumps(start, end, datapath=os.getcwd() + '/Data/'):\n",
      "    start_dt = datetime.strptime(start, '%Y%m%d')\n",
      "    end_dt = datetime.strptime(end, '%Y%m%d')\n",
      "    url_base = 'http://dumps.wikimedia.org/other/pagecounts-raw/'\n",
      "\n",
      "    if not os.path.exists(datapath):\n",
      "        os.system('mkdir -p {0}'.format(datapath))\n",
      "\n",
      "    # Enforce time boundaries\n",
      "    if start_dt < datetime(2007, 12, 10, 0, 0, 0):\n",
      "        raise ValueError('Time range must be after 10 Dec 2007')\n",
      "    elif end_dt > datetime.today():\n",
      "        raise ValueError('Time range must end before today')\n",
      "\n",
      "    index = start_dt\n",
      "    while index < end_dt:\n",
      "        year = datetime.strftime(index, '%Y')\n",
      "        month = datetime.strftime(index, '%m')\n",
      "\n",
      "        # Make md5 checksum dictionary that can be checked after downloading files below\n",
      "        md5_url = url_base + '{0}/{0}-{1}/'.format(year, month) + 'md5sums.txt'\n",
      "        md5s = urllib2.urlopen(md5_url).readlines()\n",
      "        # Each line is '<hash>  <filename>'; split on whitespace, then invert\n",
      "        # the mapping so filenames key the checksums\n",
      "        md5_dict = dict(i.split() for i in md5s if 'pagecounts' in i)\n",
      "        md5_dict = {v: k for k, v in md5_dict.iteritems()}\n",
      "\n",
      "        # Can't construct filenames directly since the seconds field varies,\n",
      "        # so scrape the filenames from the directory listing instead\n",
      "        url = url_base + '{0}/{0}-{1}/'.format(year, month)\n",
      "        soup = BeautifulSoup(urllib2.urlopen(url))\n",
      "        pagecount_links = [link.get('href') for link in soup.findAll('a') if 'pagecounts' in link.get('href')]\n",
      "\n",
      "        # Keep only the links that fall inside the valid timespan\n",
      "        valid_links = [datetime.strptime(i, 'pagecounts-%Y%m%d-%H%M%S.gz') for i in pagecount_links]\n",
      "        valid_links = [i for i in valid_links if i > start_dt and i < end_dt]\n",
      "        valid_links = [datetime.strftime(i, 'pagecounts-%Y%m%d-%H%M%S.gz') for i in valid_links]\n",
      "\n",
      "        # If the directory doesn't exist, create it\n",
      "        filepath = datapath + '{0}/{1}/'.format(year, month)\n",
      "        if not os.path.exists(filepath):\n",
      "            os.system('mkdir -p {0}'.format(filepath))\n",
      "\n",
      "        # Get the data\n",
      "        for link in valid_links:\n",
      "            url = url_base + '{0}/{0}-{1}/'.format(year, month) + link\n",
      "            # Skip files that already exist with non-zero length\n",
      "            if os.path.exists(filepath + link) and os.path.getsize(filepath + link) > 0:\n",
      "                print link + ' already exists!'\n",
      "                continue\n",
      "\n",
      "            try:\n",
      "                print \"Retrieving pageviews for \" + link\n",
      "                get_file(url, filepath, md5_dict)\n",
      "            except (KeyboardInterrupt, SystemExit):\n",
      "                sys.exit(0)\n",
      "\n",
      "        index += relativedelta(months=1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 181
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def main(argv):\n",
      "    # Defaults, used when a flag is absent: yesterday through today, ./Data/\n",
      "    start = datetime.strftime(datetime.today() - timedelta(days=1), '%Y%m%d')\n",
      "    end = datetime.strftime(datetime.today(), '%Y%m%d')\n",
      "    datapath = os.getcwd() + '/Data/'\n",
      "\n",
      "    for i, arg in enumerate(argv):\n",
      "        if arg in ['-h', '--help']:\n",
      "            print 'Usage: python pageview_scraper.py -s <start YYYYMMDD> -e <end YYYYMMDD> -d <datapath>'\n",
      "            return\n",
      "        if arg in ['-s', '--start'] and i + 1 < len(argv):\n",
      "            start = argv[i + 1]\n",
      "        if arg in ['-e', '--end'] and i + 1 < len(argv):\n",
      "            end = argv[i + 1]\n",
      "        if arg in ['-d', '--dir'] and i + 1 < len(argv):\n",
      "            datapath = argv[i + 1]\n",
      "\n",
      "    get_dumps(start, end, datapath)\n",
      "\n",
      "if __name__ == '__main__':\n",
      "    main(sys.argv[1:])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 189
    }
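,
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "A self-contained sketch of the link-scraping step in `get_dumps()`: BeautifulSoup pulls every `<a href>` out of a directory index, and the list comprehension keeps only the pagecount files. The HTML here is a made-up stand-in for a `dumps.wikimedia.org` listing, not a real page."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Fabricated directory listing in the style of a dumps.wikimedia.org index page\n",
      "listing = ('<a href=\"pagecounts-20130410-000000.gz\">pagecounts-20130410-000000.gz</a>'\n",
      "           '<a href=\"projectcounts-20130410-000000\">projectcounts-20130410-000000</a>')\n",
      "soup = BeautifulSoup(listing)\n",
      "# Keep only hrefs naming pagecount files, exactly as get_dumps() filters them\n",
      "links = [a.get('href') for a in soup.findAll('a') if 'pagecounts' in a.get('href')]\n",
      "print links"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "To drive the downloader from inside the notebook rather than the command line, a call like `get_dumps('20130410', '20130417', datapath=os.getcwd() + '/Data/')` should work; the date range is just the one from the docstring example, and the hourly dump files are large, so choose the span carefully."
     ]
    }
   ],
   "metadata": {}
  }
 ]
}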