{ "metadata": { "name": "wikipedia_scraping" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Wikipedia data scraping functions" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook contains a variety of functions primarily for accessing the MediaWiki API to extract data page revisions, user revisions, article hyperlinks, category membership, and pageview dynamics.\n", "\n", "These scripts invoke several non-standard libraries:\n", "\n", "* WikiTools - https://code.google.com/p/python-wikitools/\n", "\n", "* NetworkX - http://networkx.github.io/\n", "\n", "* Pandas - http://pandas.pydata.org/\n", "\n", "This code was primarily authored by Brian Keegan (bkeegan@gmail.com) in 2012 and 2013 with contributions from Nick Bennett (nick271828@gmail.com)." ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Basic functions" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from wikitools import wiki, api\n", "import networkx as nx\n", "from operator import itemgetter\n", "from collections import Counter\n", "import re, random, datetime, urlparse, urllib2, simplejson, copy\n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "def is_ip(ip_string, masked=False):\n", "\t# '''\n", "\t# Input:\n", "\t# ip_string - A string we'd like to check if it matches the pattern of a valid IP address.\n", "\t# Output:\n", "\t# A boolean value indicating whether the input was a valid IP address.\n", "\t# '''\n", "\tif not isinstance(ip_string, str) and not isinstance(ip_string, unicode):\n", "\t\treturn False\n", "\tif masked:\n", "\t\tip_pattern = re.compile('((([\\d]{1,3})|([Xx]{1,3}))\\.){3}(([\\d]{1,3})|([Xx]{1,3}))', re.UNICODE)\n", "\telse:\n", "\t\tip_pattern = re.compile('([\\d]{1,3}\\.){3}([\\d]{1,3})', re.UNICODE)\n", "\tif ip_pattern.match(ip_string):\n", "\t\treturn True\n", "\telse:\n", "\t\treturn False\n", "\n", "def convert_to_datetime(string):\n", " dt = datetime.datetime.strptime(string,'%Y-%m-%dT%H:%M:%SZ')\n", " return dt\n", " \n", "def convert_from_datetime(dt):\n", " string = dt.strftime('%Y%m%d%H%M%S')\n", " return string\n", "\n", "def convert_datetime_to_epoch(dt):\n", " epochtime = (dt - datetime.datetime(1970,1,1)).total_seconds()\n", " return epochtime\n", "\n", "def wikipedia_query(query_params,lang='en'):\n", "\tsite = wiki.Wiki(url='http://'+lang+'.wikipedia.org/w/api.php')\n", "\trequest = api.APIRequest(site, query_params)\n", "\tresult = request.query()\n", "\treturn result[query_params['action']]\n", "\n", "def short_wikipedia_query(query_params,lang='en'):\n", "\tsite = wiki.Wiki(url='http://'+lang+'.wikipedia.org/w/api.php')\n", "\trequest = api.APIRequest(site, query_params)\n", "\t# Don't do multiple requests\n", "\tresult = request.query(querycontinue=False)\n", "\treturn result[query_params['action']]\n", "\n", "def random_string(le, letters=True, numerals=False):\n", "\tdef rc():\n", "\t\tcharset = []\n", "\t\tcr = lambda x,y: range(ord(x), ord(y) + 1)\n", "\t\tif letters:\n", "\t\t\tcharset += cr('a', 'z')\n", "\t\tif numerals:\n", "\t\t\tcharset += cr('0', '9')\n", "\t\treturn chr(random.choice(charset))\n", "\tdef rcs(k):\n", "\t\treturn [rc() for i in range(k)]\n", "\treturn ''.join(rcs(le))\n", "\n", "def clean_revision(rev):\n", "\t# We must deal with some malformed user/userid values. 
Some \n", "\t# revisions have the following problems:\n", "\t# 1. no 'user' or 'userid' keys and the existence of the 'userhidden' key\n", "\t# 2. 'userid'=='0' and 'user'=='Conversion script' and 'anon'==''\n", "\t# 3. 'userid'=='0' and 'user'=='66.92.166.xxx' and 'anon'==''\n", "\t# 4. 'userid'=='0' and 'user'=='204.55.21.34' and 'anon'==''\n", "\t# In these cases, we must substitute a placeholder value\n", "\t# for 'userid' to uniquely identify the respective kind\n", "\t# of malformed revision as above. \n", "\trevision = rev.copy()\n", "\tif 'userhidden' in revision:\n", "\t\trevision['user'] = random_string(15, letters=False, numerals=True)\n", "\t\trevision['userid'] = revision['user']\n", "\telif 'anon' in revision:\n", "\t\tif revision['user']=='Conversion script':\n", "\t\t\trevision['user'] = random_string(14, letters=False, numerals=True)\n", "\t\t\trevision['userid'] = revision['user']\n", "\t\telif is_ip(revision['user']):\n", "\t\t\t# Just leaving this reflection in for consistency\n", "\t\t\trevision['user'] = revision['user']\n", "\t\t\t# The weird stuff about multiplying '0' by a number is to \n", "\t\t\t# make sure that IP addresses end up looking like this:\n", "\t\t\t# 192.168.1.1 -> 192168001001\n", "\t\t\t# This serves to prevent collisions if the numbers were\n", "\t\t\t# simply joined by removing the periods:\n", "\t\t\t# 215.1.67.240 -> 215167240\n", "\t\t\t# 21.51.67.240 -> 215167240\n", "\t\t\t# This also results in the number being exactly 12 decimal digits.\n", "\t\t\trevision['userid'] = ''.join(['0' * (3 - len(octet)) + octet \\\n", "\t\t\t\t\t\t\t\t\t\t\tfor octet in revision['user'].split('.')])\n", "\t\telif is_ip(revision['user'], masked=True):\n", "\t\t\t# Let's distinguish masked IP addresses, like\n", "\t\t\t# 192.168.1.xxx or 255.XXX.XXX.XXX, by setting \n", "\t\t\t# 'user'/'userid' both to a random 13 digit number\n", "\t\t\t# or 13 character string. \n", "\t\t\t# This will probably be unique and easily \n", "\t\t\t# distinguished from an IP address (with 12 digits\n", "\t\t\t# or characters). \n", "\t\t\trevision['user'] = random_string(13, letters=False, numerals=True)\n", "\t\t\trevision['userid'] = revision['user']\n", "\treturn revision\n", "\n", "def cast_to_unicode(string):\n", " if isinstance(string,str):\n", " try:\n", " string2 = string.decode('utf8')\n", " except:\n", " try:\n", " string2 = string.decode('latin1')\n", " except:\n", " print \"Some messed up encoding here\"\n", " elif isinstance(string,unicode):\n", " string2 = string\n", " return string2" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "User revisions" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def get_user_revisions(user,dt_end,lang):\n", " '''\n", " Input: \n", " user - The name of a wikipedia user with no \"User:\" prefix, e.g. 'Madcoverboy' \n", " dt_end - a datetime object indicating the maximum datetime to return for revisions\n", " lang - a string (typically two characters) indicating the language version of Wikipedia to crawl\n", "\n", " Output:\n", " revisions - A list of revisions for the given article, each given as a dictionary. This will\n", " include all properties as described by revision_properties, and will also include the\n", " title and id of the source article. 
\n", " '''\n", " user = cast_to_unicode(user)\n", " revisions = list()\n", " dt_end_string = convert_from_datetime(dt_end)\n", " result = wikipedia_query({'action':'query',\n", " 'list': 'usercontribs',\n", " 'ucuser': u\"User:\"+user,\n", " 'ucprop': 'ids|title|timestamp|sizediff',\n", " #'ucnamespace':'0',\n", " 'uclimit': '500',\n", " 'ucend':dt_end_string},lang)\n", " if result and 'usercontribs' in result.keys():\n", " r = result['usercontribs']\n", " r = sorted(r, key=lambda revision: revision['timestamp'])\n", " for revision in r:\n", " # Sometimes the size key is not present, so we'll set it to 0 in those cases\n", " revision['sizediff'] = revision.get('sizediff', 0)\n", " revision['timestamp'] = convert_to_datetime(revision['timestamp'])\n", " revisions.append(revision)\n", " return revisions\n", "\n", "def get_user_properties(user,lang):\n", " '''\n", " Input:\n", " user - a string with no \"User:\" prefix corresponding to the username (\"Madcoverboy\"\n", " lang - a string (usually two digits) for the language version of Wikipedia to query\n", "\n", " Output:\n", " result - a dictionary containing attrubutes about the user\n", " '''\n", " user = cast_to_unicode(user)\n", " result = wikipedia_query({'action':'query',\n", " 'list':'users',\n", " 'usprop':'blockinfo|groups|editcount|registration|gender',\n", " 'ususers':user},lang)\n", " return result\n", " \n", "def make_user_alters(revisions):\n", " '''\n", " Input:\n", " revisions - a list of revisions generated by get_user_revisions\n", "\n", " Output:\n", " alters - a dictionary keyed by page name that returns a dictionary containing\n", " the count of how many times the user edited the page, the timestamp of the user's\n", " earliest edit to the page, the timestamp the user's latest edit to the page, and \n", " the namespace of the page itself\n", " '''\n", " alters = dict()\n", " for rev in revisions:\n", " if rev['title'] not in alters.keys():\n", " alters[rev['title']] = dict()\n", " alters[rev['title']]['count'] = 1\n", " alters[rev['title']]['min_timestamp'] = rev['timestamp']\n", " alters[rev['title']]['max_timestamp'] = rev['timestamp']\n", " alters[rev['title']]['ns'] = rev['ns']\n", " else:\n", " alters[rev['title']]['count'] += 1\n", " alters[rev['title']]['max_timestamp'] = rev['timestamp']\n", " return alters" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Page revisions" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def rename_on_redirect(article_title,lang='en'):\n", " '''\n", " Input:\n", " article_title - a string with the name of the article or page that may be redirected to another title\n", " lang - a string (typically two characters) indicating the language version of Wikipedia to crawl\n", "\n", " Output:\n", " article_title - a string with the name of the article or page that the redirect resolves to\n", " '''\n", " result = wikipedia_query({'titles': article_title,\n", " 'prop': 'info',\n", " 'action': 'query',\n", " 'redirects': 'True'},lang)\n", " if 'redirects' in result.keys() and 'pages' in result.keys():\n", " article_title = result['redirects'][0]['to']\n", " return article_title\n", "\n", "def get_page_revisions(article_title,dt_start,dt_end,lang):\n", " '''\n", " Input: \n", " article - A string with the name of the article or page to crawl\n", " dt_start - A datetime object indicating the minimum datetime to return for revisions\n", " dt_end - a datetime object indicating the maximum 
{ "cell_type": "code", "collapsed": false, "input": [ "def get_page_content(page_title,lang):\n", "    '''\n", "    Input: \n", "    page_title - A string with the name of the article or page to crawl\n", "    lang - A string (typically two characters) indicating the language version of Wikipedia to crawl\n", "\n", "    Output:\n", "    revisions_dict - A dictionary of revisions for the given article keyed by revision ID returning a \n", "        dictionary of revision attributes. These attributes include all properties requested in \n", "        'rvprop', and will also include the title and id of the source article. \n", "    '''\n", "    page_title = rename_on_redirect(page_title,lang)\n", "    revisions_dict = dict()\n", "    result = wikipedia_query({'titles': page_title,\n", "                              'prop': 'revisions',\n", "                              'rvprop': 'ids|timestamp|user|userid|size|content',\n", "                              'rvlimit': '5000',\n", "                              'action': 'query'},lang)\n", "    if result and 'pages' in result.keys():\n", "        page_number = result['pages'].keys()[0]\n", "        revisions = result['pages'][page_number]['revisions']\n", "        for revision in revisions:\n", "            rev = dict()\n", "            rev['pageid'] = page_number\n", "            rev['title'] = result['pages'][page_number]['title']\n", "            rev['size'] = revision.get('size', 0) # Sometimes the size key is not present, so we'll set it to 0 in those cases\n", "            rev['timestamp'] = convert_to_datetime(revision['timestamp'])\n", "            rev['content'] = revision.get('*',unicode()) # Sometimes content is hidden; fall back to an empty unicode string\n", "            rev['links'] = link_finder(rev['content'])\n", "            rev['username'] = revision['user']\n", "            rev['userid'] = revision['userid']\n", "            rev['revid'] = revision['revid']\n", "            revisions_dict[revision['revid']] = rev\n", "    return revisions_dict\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 202 },
\n", " '''\n", " article_title = rename_on_redirect(page_title)\n", " revisions_dict = dict()\n", " result = wikipedia_query({'titles': page_title,\n", " 'prop': 'revisions',\n", " 'rvprop': 'ids|timestamp|user|userid|size|content',\n", " 'rvlimit': '5000',\n", " 'action': 'query'},lang)\n", " if result and 'pages' in result.keys():\n", " page_number = result['pages'].keys()[0]\n", " revisions = result['pages'][page_number]['revisions']\n", " for revision in revisions:\n", " rev = dict()\n", " rev['pageid'] = page_number\n", " rev['title'] = result['pages'][page_number]['title']\n", " rev['size'] = revision.get('size', 0) # Sometimes the size key is not present, so we'll set it to 0 in those cases\n", " rev['timestamp'] = convert_to_datetime(revision['timestamp'])\n", " rev['content'] = revision.get('*',unicode()) # Sometimes content hidden, return with empty unicode string\n", " rev['links'] = link_finder(rev['content'])\n", " rev['username'] = revision['user']\n", " rev['userid'] = revision['userid']\n", " rev['revid'] = revision['revid']\n", " revisions_dict[revision['revid']] = rev\n", " return revisions_dict\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 202 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Category members" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def get_category_members(category_name, depth, lang='en'):\n", " '''\n", " Input: \n", " category_name - The name of a Wikipedia(en) category, e.g. 'Category:2001_fires'. \n", " depth - An integer in the range [0,n) reflecting the number of sub-categories to crawl\n", " lang - A string (typically two-digits) corresponding to the language code for the Wikipedia to crawl\n", "\n", " Output:\n", " articles - A list of articles that are found within the given category or one of its\n", " subcategories, explored recursively. Each article will be a dictionary object with\n", " the keys 'title' and 'id' with the values of the individual article's title and \n", " page_id respectively. 
\n", " '''\n", " articles = []\n", " if depth < 0:\n", " return articles\n", " \n", " #Begin crawling articles in category\n", " results = wikipedia_query({'list': 'categorymembers',\n", " 'cmtitle': category_name,\n", " 'cmtype': 'page',\n", " 'cmlimit': '500',\n", " 'action': 'query'},lang) \n", " if 'categorymembers' in results.keys() and len(results['categorymembers']) > 0:\n", " for i, page in enumerate(results['categorymembers']):\n", " article = page['title']\n", " articles.append(article)\n", " \n", " # Begin crawling subcategories\n", " results = wikipedia_query({'list': 'categorymembers',\n", " 'cmtitle': category_name,\n", " 'cmtype': 'subcat',\n", " 'cmlimit': '500',\n", " 'action': 'query'},lang)\n", " subcategories = []\n", " if 'categorymembers' in results.keys() and len(results['categorymembers']) > 0:\n", " for i, category in enumerate(results['categorymembers']):\n", " cat_title = category['title']\n", " subcategories.append(cat_title)\n", " for category in subcategories:\n", " articles += get_category_members(category,depth-1) \n", " return articles\n", "\n", "def get_page_categories(page_title,lang='en'):\n", " '''\n", " Input:\n", " page_title - A string with the name of the article or page to crawl\n", " lang - A string (typically two-digits) corresponding to the language code for the Wikipedia to crawl\n", "\n", " Output:\n", " categories - A list of the names of the categories of which the page is a member\n", " '''\n", " page_title = rename_on_redirect(page_title)\n", " results = wikipedia_query({'prop': 'categories',\n", " 'titles': page_title,\n", " 'cllimit': '500',\n", " 'clshow':'!hidden',\n", " 'action': 'query'},lang)\n", " if 'pages' in results.keys():\n", " page_number = results['pages'].keys()[0]\n", " categories = results['pages'][page_number]['categories']\n", " categories = [i['title'] for i in categories]\n", " categories = [i for i in categories if i != u'Category:Living people']\n", " else:\n", " print u\"{0} not found in category results\".format(page_title)\n", " return categories" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Hyperlinks" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def get_page_outlinks(page_title,lang='en'):\n", " '''\n", " Input:\n", " page_title - A string with the name of the article or page to crawl\n", " lang - A string (typically two-digits) corresponding to the language code for the Wikipedia to crawl\n", "\n", " Output:\n", " outlinks - A list of all \"alter\" pages that link out from the current version of the \"ego\" page\n", "\n", " Notes:\n", " This uses API calls to return all [[links]] which may be slower and result in overlinking from templates\n", " '''\n", " # This approach is susceptible to 'overlinking' as it includes links from templates\n", " page_title = cast_to_unicode(page_title)\n", " page_title = rename_on_redirect(page_title)\n", " result = wikipedia_query({'titles': page_title,\n", " 'prop': 'links',\n", " 'pllimit': '500',\n", " 'plnamespace':'0',\n", " 'action': 'query'},lang)\n", " if 'pages' in result.keys():\n", " page_number = result['pages'].keys()[0]\n", " results = result['pages'][page_number]['links']\n", " outlinks = [l['title'] for l in results]\n", " else:\n", " print u\"Error: No links found in {0}\".format(page_title)\n", " return outlinks\n", "\n", "def get_page_inlinks(page_title,lang='en'):\n", " '''\n", " Input:\n", " page_title - A string with the name of the article or 
"# Links inside templates are included, which results in completely-connected components\n", "# Remove links from templates by getting a list of templates used across all pages\n", "def get_page_templates(page_title,lang='en'):\n", "    '''\n", "    Input:\n", "    page_title - A string with the name of the article or page to crawl\n", "    lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl\n", "\n", "    Output:\n", "    templates - A list of all the templates (which contain redundant links) in the current version\n", "    '''\n", "    page_title = cast_to_unicode(page_title)\n", "    page_title = rename_on_redirect(page_title)\n", "    result = wikipedia_query({'titles': page_title,\n", "                              'prop': 'templates',\n", "                              'tllimit': '500',\n", "                              'action': 'query'},lang)\n", "    templates = []\n", "    if 'pages' in result.keys():\n", "        page_id = result['pages'].keys()[0]\n", "        templates = [i['title'] for i in result['pages'][page_id]['templates']]\n", "    return templates\n", "\n", "def get_page_links(page_title,lang='en'):\n", "    '''\n", "    Input:\n", "    page_title - A string with the name of the article or page to crawl that is the \"ego\" page\n", "    lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl\n", "\n", "    Output:\n", "    links - A dictionary keyed by ['in','out'] of all \"alter\" pages that link in to and out from the \n", "        current version of the \"ego\" page\n", "    '''\n", "    links=dict()\n", "    links['in'] = get_page_inlinks(page_title,lang)\n", "    links['out'] = get_page_outlinks(page_title,lang)\n", "    return links\n", "\n", "# Identify links based on content of revisions\n", "def link_finder(content_string):\n", "    '''\n", "    Input:\n", "    content_string - A string containing the raw wiki-markup for a page\n", "\n", "    Output:\n", "    links - A list of all \"alter\" pages that link out from the current version of the \"ego\" page\n", "\n", "    Notes:\n", "    This uses regular expressions to coarsely parse the content for instances of [[links]] and likely returns messy data\n", "    '''\n", "    links = list()\n", "    for i,j in re.findall(r'\\[\\[([^|\\]]*\\|)?([^\\]]+)\\]\\]',content_string):\n", "        if len(i) == 0:\n", "            links.append(j)\n", "        elif u'#' not in i:\n", "            links.append(i[:-1])\n", "        else:\n", "            # Keep only the article title before a section anchor\n", "            links.append(i[:i.index(u'#')])\n", "    links = [l for l in links if u'|' not in l and u'Category:' not in l and u'File:' not in l]\n", "    return links\n", "\n", "def get_page_outlinks_from_content(page_title,lang='en'):\n", "    '''\n", "    Input:\n", "    page_title - A string with the name of the article or page to crawl that is the \"ego\" page\n", "    lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl\n", "\n", "    Output:\n", "    links - A list of all \"alter\" pages that link out from the current version of the \"ego\" page\n", "\n", "    Notes:\n", "    This uses regular expressions to coarsely parse the content for instances of [[links]] and may be messy\n", "    '''\n", "    page_title = cast_to_unicode(page_title)\n", "    page_title = rename_on_redirect(page_title)\n", "    \n", "    # Get content from most recent revision of an article\n", "    result = short_wikipedia_query({'titles': page_title,\n", "                                    'prop': 'revisions',\n", "                                    'rvlimit': '1',\n", "                                    'rvprop':'ids|timestamp|user|userid|content',\n", "                                    'action': 'query'},lang)\n", "    if 'pages' in result.keys():\n", "        page_id = result['pages'].keys()[0]\n", "        content = result['pages'][page_id]['revisions'][0]['*']\n", "        links = link_finder(content)\n", "    else:\n", "        print u'...Error in {0}'.format(page_title)\n", "        links = list()\n", "    \n", "    return links" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 },
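{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick check of link_finder against a made-up snippet of wiki-markup (editorial addition): piped links keep the target title, section anchors are stripped, and Category:/File: links are filtered out." ] }, { "cell_type": "code", "collapsed": false, "input": [ "sample = u\"See [[Boston Marathon|the marathon]] near [[Copley Square]] and [[Boston, Massachusetts#History|Boston]]. [[Category:Squares]] [[File:Map.png]]\"\n", "print link_finder(sample)\n", "# [u'Boston Marathon', u'Copley Square', u'Boston, Massachusetts']" ], "language": "python", "metadata": {}, "outputs": [] },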
{ "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Discussion" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def get_user_outdiscussion(user_name,dt_end,lang='en'):\n", "    '''\n", "    Input:\n", "    user_name - The name of an \"ego\" wikipedia user with no \"User:\" prefix, e.g. 'Madcoverboy' \n", "    dt_end - a datetime object indicating the maximum datetime to return for revisions\n", "    lang - a string (typically two characters) indicating the language version of Wikipedia to crawl\n", "\n", "    Output:\n", "    users - A dictionary keyed by the \"alter\" user talk pages that the ego has ever posted to,\n", "        with edit counts and first/last timestamps\n", "    '''\n", "    # User revision code in only the user talk namespace\n", "    user_name = cast_to_unicode(user_name)\n", "    users = dict()\n", "    dt_end_string = convert_from_datetime(dt_end)\n", "    result = wikipedia_query({'action':'query',\n", "                              'list': 'usercontribs',\n", "                              'ucuser': u\"User:\"+user_name,\n", "                              'ucprop': 'ids|title|timestamp|sizediff',\n", "                              'ucnamespace':'3',\n", "                              'uclimit': '500',\n", "                              'ucend':dt_end_string},lang)\n", "    if result and 'usercontribs' in result.keys():\n", "        r = result['usercontribs']\n", "        for rev in r:\n", "            alter = rev['title'][10:] # Strip the \"User talk:\" prefix\n", "            if alter not in users.keys():\n", "                users[alter] = dict()\n", "                users[alter]['count'] = 1\n", "                users[alter]['min_timestamp'] = rev['timestamp']\n", "                users[alter]['max_timestamp'] = rev['timestamp']\n", "            else:\n", "                users[alter]['count'] += 1\n", "                users[alter]['max_timestamp'] = rev['timestamp']\n", "    return users\n", "\n", "def get_user_indiscussion(user_name,dt_end,lang='en'):\n", "    '''\n", "    Input:\n", "    user_name - The name of an \"ego\" wikipedia user with no \"User:\" prefix, e.g. 'Madcoverboy' \n", "    dt_end - a datetime object indicating the maximum datetime to return for revisions\n", "    lang - a string (typically two characters) indicating the language version of Wikipedia to crawl\n", "\n", "    Output:\n", "    users - A dictionary keyed by the \"alter\" users who have ever posted to the ego's talk page,\n", "        with edit counts and first/last timestamps\n", "    '''\n", "    # Article revision code in only the user's talk page\n", "    user_name = cast_to_unicode(user_name)\n", "    users = dict()\n", "    dt_end_string = convert_from_datetime(dt_end)\n", "    result = wikipedia_query({'titles': u'User talk:'+user_name,\n", "                              'prop': 'revisions',\n", "                              'rvprop': 'ids|timestamp|user|userid|size',\n", "                              'rvlimit': '5000',\n", "                              'rvend': dt_end_string,\n", "                              'action': 'query'},lang)\n", "    if result and 'pages' in result.keys():\n", "        page_number = result['pages'].keys()[0]\n", "        try:\n", "            r = result['pages'][page_number]['revisions']\n", "            for rev in r:\n", "                if rev['user'] not in users.keys():\n", "                    users[rev['user']] = dict()\n", "                    users[rev['user']]['count'] = 1\n", "                    users[rev['user']]['min_timestamp'] = rev['timestamp']\n", "                    users[rev['user']]['max_timestamp'] = rev['timestamp']\n", "                else:\n", "                    users[rev['user']]['count'] += 1\n", "                    users[rev['user']]['max_timestamp'] = rev['timestamp']\n", "        except KeyError:\n", "            pass\n", "    return users\n", "\n", "def get_user_discussion(user_name,dt_end,lang='en'):\n", "    '''\n", "    Input:\n", "    user_name - The name of an \"ego\" wikipedia user with no \"User:\" prefix, e.g. 'Madcoverboy' \n", "    dt_end - a datetime object indicating the maximum datetime to return for revisions\n", "    lang - a string (typically two characters) indicating the language version of Wikipedia to crawl\n", "\n", "    Output:\n", "    users - A dictionary keyed by the values ['in','out'] that combines both get_user_outdiscussion and\n", "        get_user_indiscussion\n", "    '''\n", "    users=dict()\n", "    users['out'] = get_user_outdiscussion(user_name,dt_end,lang)\n", "    users['in'] = get_user_indiscussion(user_name,dt_end,lang)\n", "    return users" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 },
{ "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Trajectories" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def make_article_trajectory(revisions):\n", "    '''\n", "    Input:\n", "    revisions - A list of revisions generated by get_page_revisions\n", "\n", "    Output:\n", "    g - A NetworkX DiGraph object corresponding to the trajectory of an article moving between users\n", "        Nodes are users and links from i to j exist when user i made a revision immediately following user j\n", "    '''\n", "    g = nx.DiGraph()\n", "    # Sort revisions on ascending timestamp\n", "    sorted_revisions = sorted(revisions,key=lambda k:k['timestamp'])\n", "\n", "    # Don't use the last revision\n", "    for num,rev in enumerate(sorted_revisions[:-1]):\n", "        # Edge exists between user and the user in the next revision\n", "        edge = (rev['user'],sorted_revisions[num+1]['user'])\n", "        if g.has_edge(*edge):\n", "            g[edge[0]][edge[1]]['weight'] += 1\n", "        else:\n", "            g.add_edge(*edge,weight=1)\n", "    return g\n", "\n", "def make_editor_trajectory(revisions):\n", "    '''\n", "    Input:\n", "    revisions - A list of revisions generated by get_user_revisions\n", "\n", "    Output:\n", "    g - A NetworkX DiGraph object corresponding to the trajectory of a user moving between articles\n", "        Nodes are pages and links from i to j exist when page i was edited by the user immediately following page j\n", "    '''\n", "    g = nx.DiGraph()\n", "    # Sort revisions on ascending timestamp\n", "    sorted_revisions = sorted(revisions,key=lambda k:k['timestamp'])\n", "\n", "    # Don't use the last revision\n", "    for num,rev in enumerate(sorted_revisions[:-1]):\n", "        # Edge exists between page and the page in the next revision\n", "        edge = (rev['title'],sorted_revisions[num+1]['title'])\n", "        if g.has_edge(*edge):\n", "            g[edge[0]][edge[1]]['weight'] += 1\n", "        else:\n", "            g.add_edge(*edge,weight=1)\n", "    return g" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 },
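{ "cell_type": "markdown", "metadata": {}, "source": [ "A usage sketch (editorial addition; the article and dates are only illustrative, reusing get_page_revisions from above):" ] }, { "cell_type": "code", "collapsed": false, "input": [ "revs = get_page_revisions('Chelsea Manning', datetime.datetime(2013,8,1), datetime.datetime(2013,9,1), 'en')\n", "traj = make_article_trajectory(revs)\n", "print '{0} editors, {1} hand-offs'.format(traj.number_of_nodes(), traj.number_of_edges())\n", "# The heaviest hand-off between consecutive editors\n", "print max(traj.edges(data=True), key=lambda e: e[2]['weight'])" ], "language": "python", "metadata": {}, "outputs": [] },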
{ "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Pageviews" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def fixurl(url):\n", "    # turn string into unicode\n", "    if not isinstance(url,unicode):\n", "        url = url.decode('utf8')\n", "\n", "    # parse it\n", "    parsed = urlparse.urlsplit(url)\n", "\n", "    # divide the netloc further\n", "    userpass,at,hostport = parsed.netloc.rpartition('@')\n", "    user,colon1,pass_ = userpass.partition(':')\n", "    host,colon2,port = hostport.partition(':')\n", "\n", "    # encode each component\n", "    scheme = parsed.scheme.encode('utf8')\n", "    user = urllib2.quote(user.encode('utf8'))\n", "    colon1 = colon1.encode('utf8')\n", "    pass_ = urllib2.quote(pass_.encode('utf8'))\n", "    at = at.encode('utf8')\n", "    host = host.encode('idna')\n", "    colon2 = colon2.encode('utf8')\n", "    port = port.encode('utf8')\n", "    path = '/'.join(  # could be encoded slashes!\n", "        urllib2.quote(urllib2.unquote(pce).encode('utf8'),'')\n", "        for pce in parsed.path.split('/')\n", "    )\n", "    query = urllib2.quote(urllib2.unquote(parsed.query).encode('utf8'),'=&?/')\n", "    fragment = urllib2.quote(urllib2.unquote(parsed.fragment).encode('utf8'))\n", "\n", "    # put it back together\n", "    netloc = ''.join((user,colon1,pass_,at,host,colon2,port))\n", "    return urlparse.urlunsplit((scheme,netloc,path,query,fragment))\n", "\n", "def convert_months_to_strings(m):\n", "\tif len(str(m)) > 1:\n", "\t\tnew_m = unicode(m)\n", "\telse:\n", "\t\tnew_m = u'0'+unicode(m)\n", "\treturn new_m\n", "\n", "def get_url(article_name,lang,month,year):\n", "    url = u\"http://stats.grok.se/json/\" + lang + u\"/\" + unicode(year) + convert_months_to_strings(month) + u\"/\" + article_name\n", "    fixed_url = fixurl(url)\n", "    return fixed_url\n", "\n", "def requester(url):\n", "    opener = urllib2.build_opener()\n", "    req = urllib2.Request(url)\n", "    f = opener.open(req)\n", "    r = simplejson.load(f)\n", "    result = pd.Series(r['daily_views'])\n", "    return result\n", "\n", "def clean_timestamps(df):\n", "    to_drop = list()\n", "    for d in df.index:\n", "        try:\n", "            datetime.date(int(d[0:4]),int(d[5:7]),int(d[8:10]))\n", "        except ValueError:\n", "            to_drop.append(d)\n", "    df2 = df.drop(to_drop,axis=0)\n", "    df2.index = pd.to_datetime(df2.index)\n", "    return df2\n", "\n", "def get_pageviews(article,lang,min_date,max_date):\n", "    rng = pd.date_range(min_date,max_date,freq='M')\n", "    rng2 = [(i.month,i.year) for i in rng]\n", "    ts = pd.Series()\n", "    for i in rng2:\n", "        url = get_url(article,lang,i[0],i[1])\n", "        result = requester(url)\n", "        ts = pd.Series.append(result,ts)\n", "    ts = ts.sort_index()\n", "    ts = clean_timestamps(ts)\n", "    ts = ts.asfreq('D')\n", "    return ts\n", "\n", "def make_pageview_df(article_list,lang,min_date,max_date):\n", "    df = pd.DataFrame(index=pd.date_range(start=min_date,end=max_date))\n", "    l = len(article_list)\n", "    for num,a in enumerate(article_list):\n", "        try:\n", "            print \"{0} / {1} : {2}\".format(num+1,l,a)\n", "            ts = get_pageviews(a,lang,min_date,max_date)\n", "            df[a] = ts\n", "        except:\n", "            print u'Something happened to {0}'.format(unicode(a))\n", "    return df" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 },
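{ "cell_type": "markdown", "metadata": {}, "source": [ "A usage sketch (editorial addition; stats.grok.se was the pageview-count service available when this notebook was written, and the article and date range are only illustrative):" ] }, { "cell_type": "code", "collapsed": false, "input": [ "ts = get_pageviews(u'Chelsea Manning','en',datetime.datetime(2013,8,1),datetime.datetime(2013,10,31))\n", "print ts.describe()" ], "language": "python", "metadata": {}, "outputs": [] },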
{ "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Make networks" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def editors_other_activity(article_title,dt_start,dt_end,ignorelist,lang):\n", "    revisions = get_page_revisions(article_title,dt_start,dt_end,lang)\n", "    revision_alters = make_page_alters(revisions)\n", "    revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}\n", "    \n", "    alter_contributions = dict()\n", "    for num,editor_alter in enumerate(revision_alters2.keys()):\n", "        print u\"{0} / {1}: {2}\".format(num+1,len(revision_alters2.keys()),editor_alter)\n", "        alter_contributions[editor_alter] = get_user_revisions(editor_alter,dt_start,lang)\n", "    \n", "    #el = directed_dict_to_edgelist(alter_discussions)\n", "    return revisions,alter_contributions\n", "\n", "def editing_primary_discussion_secondary(article_title,dt_start,dt_end,ignorelist,lang):\n", "    revisions = get_page_revisions(article_title,dt_start,dt_end,lang)\n", "    revision_alters = make_page_alters(revisions)\n", "    revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}\n", "    \n", "    alter_discussions = dict()\n", "    for num,editor_alter in enumerate(revision_alters2.keys()):\n", "        print u\"{0} / {1}: {2}\".format(num+1,len(revision_alters2.keys()),editor_alter)\n", "        alter_discussions[editor_alter] = get_user_discussion(editor_alter,dt_end,lang)\n", "    \n", "    #el = directed_dict_to_edgelist(alter_discussions)\n", "    return revisions,alter_discussions" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "# alter_contribs maps each editor to their revisions, e.g. the\n", "# alter_contributions returned by editors_other_activity above\n", "g = nx.DiGraph()\n", "for user,revisions in alter_contribs.iteritems():\n", "    #print user\n", "    for rev in revisions:\n", "        article = rev['title']\n", "        \n", "        # If edge already exists, iterate weight\n", "        if g.has_edge(user,article):\n", "            g[user][article]['weight'] += 1\n", "        \n", "        # Otherwise create editor node and properties then add new edge\n", "        else:\n", "            # If editor node is not invalid or an IP, do a bunch of stuff\n", "            if 'invalid' not in user_props[user]['users'][0].keys():\n", "                ns = rev['ns']\n", "                gen = user_props[user]['users'][0]['gender']\n", "                edits = user_props[user]['users'][0]['editcount']\n", "                \n", "                # Registration returns None sometimes\n", "                start = user_props[user]['users'][0]['registration']\n", "                if start is not None:\n", "                    start = convert_datetime_to_epoch(convert_to_datetime(start))\n", "                else:\n", "                    start = u'unknown'\n", "                \n", "                # Add node\n", "                g.add_node(user, gender = gen, startdate = start, edits = edits, nodetype = 'user', ns='user')\n", "                g.add_node(article, gender = 'page', startdate = 'page', edits = 'page', sysop = 'page', autoconfirmed = 'page', nodetype = 'page',namespace=ns)\n", "                \n", "                if 'sysop' in user_props[user]['users'][0]['groups']:\n", "                    g.node[user]['sysop'] = 1\n", "                else:\n", "                    g.node[user]['sysop'] = 0\n", "                \n", "                if 'autoconfirmed' in user_props[user]['users'][0]['groups']:\n", "                    g.node[user]['autoconfirmed'] = 1\n", "                else:\n", "                    g.node[user]['autoconfirmed'] = 0\n", "                \n", "                g.add_edge(user,article,weight=1)\n", "            \n", "            # If editor node is invalid or an IP, populate fields with placeholders\n", "            else:\n", "                g.add_node(user,gender=u'unknown',startdate=u'unknown',edits=u'unknown',sysop=0,autoconfirmed=0,nodetype='user')\n", "\n", "# Remove Talk:Chelsea_Manning because it's connected to everything\n", "g.remove_node('Talk:Chelsea Manning')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 },
{ "cell_type": "code", "collapsed": false, "input": [ "editors = [title for title,attribs in g.nodes(data=True) if attribs['nodetype'] == 'user']\n", "#pages = [title for title,attribs in g.nodes(data=True) if attribs['nodetype'] == 'page']\n", "g2 = g.to_undirected()\n", "g3 = nx.bipartite.weighted_projected_graph(g2,editors)\n", "#g4 = nx.bipartite.weighted_projected_graph(g2,pages)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "nx.write_graphml(g,'Manning_talk_coauthorship.graphml')\n", "nx.write_gexf(g,'Manning_talk_coauthorship.gexf')\n", "nx.write_graphml(g3,'Manning_talk_coediting.graphml')\n", "nx.write_gexf(g3,'Manning_talk_coediting.gexf')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "def editing_primary_hyperlink_secondary(article_title,dt_start,dt_end,ignorelist,lang):\n", "    revisions = get_page_revisions(article_title,dt_start,dt_end,lang)\n", "    revision_alters = make_page_alters(revisions)\n", "    revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}\n", "    \n", "    alter_hyperlinks = dict()\n", "    for num,editor_alter in enumerate(revision_alters2.keys()):\n", "        print u\"{0} / {1}: {2}\".format(num+1,len(revision_alters2.keys()),editor_alter)\n", "        alter_hyperlinks[editor_alter] = get_page_outlinks(editor_alter,lang)\n", "    \n", "    el = directed_dict_to_edgelist(alter_hyperlinks)\n", "    return revisions,alter_hyperlinks,el" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "def two_step_editing(article_title,dt_start,dt_end,ignorelist,lang):\n", "    revisions = get_page_revisions(article_title,dt_start,dt_end,lang)\n", "    revision_alters = make_page_alters(revisions)\n", "    revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}\n", "    \n", "    alter_revisions = dict()\n", "    for num,editor_alter in enumerate(revision_alters2.keys()):\n", "        print u\"{0} / {1}: {2}\".format(num+1,len(revision_alters2.keys()),editor_alter)\n", "        alter_revisions[editor_alter] = get_user_revisions(editor_alter,dt_end,lang)\n", "    return revisions, alter_revisions" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "def two_step_outlinks(page_title):\n", "    page_alters = dict()\n", "    templates_dict = dict()\n", "    \n", "    links = get_page_outlinks(page_title)\n", "    page_alters[unicode(page_title)] = links\n", "    \n", "    templates = get_page_templates(page_title)\n", "    templates_dict[page_title] = templates\n", "    \n", "    l = len(links)\n", "    for num,link in enumerate(links):\n", "        print u\"{0} / {1} : {2}\".format(num+1,l,link)\n", "        try:\n", "            page_alters[link] = get_page_outlinks(link)\n", "            templates_dict[link] = get_page_templates(link)\n", "        except:\n", "            print u\"...{0} doesn't exist\".format(link)\n", "    return page_alters,templates_dict" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "def two_step_outlinks_from_content(page_title):\n", "    page_alters = dict()\n", "    \n", "    links = get_page_outlinks_from_content(page_title)\n", "    unique_links = list(set(links))\n", "    page_alters[unicode(page_title)] = unique_links\n", "    \n", "    l = len(unique_links)\n", "    for num,link in enumerate(unique_links):\n", "        print u\"{0} / {1} : {2}\".format(num+1,l,link)\n", "        try:\n", "            page_alters[link] = get_page_outlinks_from_content(link)\n", "        except:\n", "            print u\"...{0} doesn't exist\".format(link)\n", "    return page_alters" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 },
{ "cell_type": "code", "collapsed": false, "input": [ "def make_hyperlink_network(hyperlink_dict):\n", "    hyperlink_g = nx.DiGraph()\n", "    for page,links in hyperlink_dict.iteritems():\n", "        for link in links:\n", "            # Only include links to 1-step alter pages, not 2-step alters' alters\n", "            if link in hyperlink_dict.keys():\n", "                hyperlink_g.add_edge(page,link)\n", "    return hyperlink_g" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "def make_shared_user_editing_network(alter_revisions_dict,threshold):\n", "    \n", "    # Make the graph\n", "    net = nx.DiGraph()\n", "    for editor,revisions in alter_revisions_dict.iteritems():\n", "        articles = [r['title'] for r in revisions]\n", "        for num,article in enumerate(articles[:-1]):\n", "            if net.has_edge(article,articles[num+1]):\n", "                net[article][articles[num+1]]['weight'] += 1\n", "            else:\n", "                net.add_edge(article,articles[num+1],weight=1)\n", "    \n", "    # Remove below-threshold edges and self-loops\n", "    # (collect the edges first: removing edges while iterating over them is unsafe)\n", "    to_remove = [(i,j) for i,j,d in net.edges_iter(data=True) if d['weight'] < threshold or i == j]\n", "    for i,j in to_remove:\n", "        net.remove_edge(i,j)\n", "    \n", "    # Remove resulting isolates\n", "    isolates = nx.isolates(net)\n", "    for isolate in isolates:\n", "        net.remove_node(isolate)\n", "    \n", "    return net" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "# Take the alter_revisions_dict keyed by user with a list of revisions\n", "# And return an inverted alter_pages keyed by page with a dictionary of users\n", "def invert_alter_revisions(alter_revisions_dict):\n", "    alter_pages = dict()\n", "    for user,revisions in alter_revisions_dict.iteritems():\n", "        temp_list = list()\n", "        for revision in revisions:\n", "            temp_list.append(revision['title'])\n", "        alter_pages[user] = dict(Counter(temp_list))\n", "\n", "    inverted_alter_pages = dict()\n", "    for user,counts in alter_pages.iteritems():\n", "        for article,count in counts.iteritems():\n", "            try:\n", "                inverted_alter_pages[article][user] = count\n", "            except KeyError:\n", "                inverted_alter_pages[article] = dict()\n", "                inverted_alter_pages[article][user] = count\n", "    \n", "    return inverted_alter_pages" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "def make_shared_page_editing_network(alter_revisions_dict,threshold):\n", "    \n", "    inverted_alter_revisions_dict = invert_alter_revisions(alter_revisions_dict)\n", "    \n", "    # Make the graph\n", "    g = nx.DiGraph()\n", "    for page,users in inverted_alter_revisions_dict.iteritems():\n", "        user_list = users.keys()\n", "        for num,user in enumerate(user_list[:-1]):\n", "            next_user = user_list[num+1]\n", "            if g.has_edge(user,next_user):\n", "                g[user][next_user]['weight'] += 1\n", "            else:\n", "                g.add_edge(user,next_user,weight=1)\n", "    \n", "    # Remove below-threshold edges and self-loops\n", "    # (collect the edges first: removing edges while iterating over them is unsafe)\n", "    to_remove = [(i,j) for i,j,d in g.edges_iter(data=True) if d['weight'] < threshold or i == j]\n", "    for i,j in to_remove:\n", "        g.remove_edge(i,j)\n", "    \n", "    # Remove resulting isolates\n", "    isolates = nx.isolates(g)\n", "    for isolate in isolates:\n", "        g.remove_node(isolate)\n", "    \n", "    return g" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 18 },
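{ "cell_type": "markdown", "metadata": {}, "source": [ "A tiny synthetic check of invert_alter_revisions and the shared-editing projection (editorial addition; the two-editor revision data is made up):" ] }, { "cell_type": "code", "collapsed": false, "input": [ "fake_revisions = {u'EditorA': [{'title': u'Page1'}, {'title': u'Page1'}, {'title': u'Page2'}],\n", "                  u'EditorB': [{'title': u'Page2'}, {'title': u'Page3'}]}\n", "print invert_alter_revisions(fake_revisions)\n", "# e.g. {u'Page1': {u'EditorA': 2}, u'Page2': {u'EditorA': 1, u'EditorB': 1}, u'Page3': {u'EditorB': 1}}\n", "net = make_shared_user_editing_network(fake_revisions,threshold=0)\n", "print net.edges(data=True)" ], "language": "python", "metadata": {}, "outputs": [] },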
{ "cell_type": "code", "collapsed": false, "input": [ "def make_category_network(categories_dict):\n", "    '''Takes a dictionary keyed by page name with a list of categories as values\n", "    Returns a two-mode page-category network; the DiGraph enforces the\n", "    page -> category direction of the bipartite structure\n", "    '''\n", "    g_categories=nx.DiGraph()\n", "\n", "    for page,categories in categories_dict.iteritems():\n", "        for category in categories:\n", "            g_categories.add_node(page,node_type='page')\n", "            g_categories.add_node(category,node_type='category')\n", "            g_categories.add_edge(page,category)\n", "\n", "    return g_categories" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 19 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Example" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Select articles from [the 2012 Mexican elections category on the Spanish Wikipedia](http://es.wikipedia.org/wiki/Categor%C3%ADa:Elecciones_de_M%C3%A9xico_de_2012), keeping articles in which more than one gubernatorial candidate has an existing article (\"blue links\")." ] }, { "cell_type": "code", "collapsed": false, "input": [ "articles = ['Elecciones estatales de 2012 en Yucat\u00e1n','Elecciones estatales en Tabasco de 2012','Elecciones estatales en San Luis Potos\u00ed de 2012','Elecciones estatales de Morelos de 2012','Elecciones estatales en Jalisco de 2012','Elecciones estatales en Guanajuato de 2012','Elecciones en el Distrito Federal (M\u00e9xico) de 2012','Elecciones estatales en Chiapas de 2012']\n", "articles = [i.decode('utf8') for i in articles]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "category_members = get_category_members('Categor\u00eda:Elecciones_de_M\u00e9xico_de_2012',1,'es')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 47 }, { "cell_type": "code", "collapsed": false, "input": [ "category_members" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 48, "text": [ "[u'Elecciones federales en M\\xe9xico de 2012',\n", " u'Elecciones estatales de Campeche de 2012',\n", " u'Elecciones estatales en Chiapas de 2012',\n", " u'Elecciones estatales de Colima de 2012',\n", " u'Elecciones en el Distrito Federal (M\\xe9xico) de 2012',\n", " u'Elecciones estatales del Estado de M\\xe9xico de 2012',\n", " u'Elecciones estatales en Guanajuato de 2012',\n", " u'Elecciones estatales de Guerrero de 2012',\n", " u'Elecciones estatales extraordinarias de Hidalgo de 2012',\n", " u'Elecciones estatales en Jalisco de 2012',\n", " u'Elecciones estatales extraordinarias de Michoac\\xe1n de 2012',\n", " u'Elecciones estatales de Morelos de 2012',\n", " u'Elecciones estatales de Nuevo Le\\xf3n de 2012',\n", " u'Elecciones estatales de Quer\\xe9taro de 2012',\n", " u'Elecciones estatales en San Luis Potos\\xed de 2012',\n", " u'Elecciones estatales en Tabasco de 2012',\n", " u'Elecciones estatales de 2012 en Yucat\\xe1n',\n", " u'Elecciones estatales extraordinarias de Yucat\\xe1n de 2012']" ] } ], "prompt_number": 48 }, { "cell_type": 
"code", "collapsed": false, "input": [ "bots = get_category_members('Category:All Wikipedia bots',3,'en')\n", "bots = [b[5:] for b in bots]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "user_props = dict()\n", "for i,user in enumerate(alter_contribs.keys()):\n", " print u\"{0} / {1}: {2}\".format(i+1,len(alter_contribs.keys()),user)\n", " user_props[user] = get_user_properties(user,'en')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1 / 507: Edison\n", "2 / 507: Richard BB" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "3 / 507: 156.98.4.11" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "4 / 507: Vobedd" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "5 / 507: Qcomplex5" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "6 / 507: Skyraider" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "7 / 507: Adjwilley" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "8 / 507: Wbm1058" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "9 / 507: Roscelese" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "10 / 507: Bernarddb" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "11 / 507: Solarguy17" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "12 / 507: It Is Me Here" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "13 / 507: Degen Earthfast" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "14 / 507: Tony Webster" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "15 / 507: Guerillero" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "16 / 507: Coffeepusher" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "17 / 507: Vexorian" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "18 / 507: Rhialto" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "19 / 507: Sodaant" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "20 / 507: Jfhutson" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "21 / 507: Marcus Qwertyus" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "22 / 507: Carolmooredc" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "23 / 507: Cullen328" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "24 / 507: Benlisquare" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "25 / 507: Rcsprinter123" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "26 / 507: EvergreenFir" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "27 / 507: Wslack" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "28 / 507: BrownHairedGirl" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "29 / 507: Thechungling" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "30 / 507: Two kinds of pork" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "31 / 507: CaseyPenk" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "32 / 507: Casey.Grim85" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "33 / 507: Pudeo" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "34 / 507: KoshVorlon" ] }, { 
"output_type": "stream", "stream": "stdout", "text": [ "\n", "35 / 507: NE Ent" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "36 / 507: Miranche" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "37 / 507: Wctaiwan" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "38 / 507: Rlendog" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "39 / 507: FT2" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "40 / 507: Wallie" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "41 / 507: Livitup" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "42 / 507: 190.235.87.27" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "43 / 507: Param Mudgal" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "44 / 507: Pass a Method" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "45 / 507: David Gerard" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "46 / 507: Pawyilee" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "47 / 507: Trinitresque" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "48 / 507: Daffydavid" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "49 / 507: Scott Martin" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "50 / 507: 117.199.7.24" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "51 / 507: Jenssey" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "52 / 507: Zzyzx11" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "53 / 507: GorillaWarfare" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "54 / 507: Necrothesp" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "55 / 507: Hullaballoo Wolfowitz" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "56 / 507: Brettalan" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "57 / 507: 97.84.222.198" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "58 / 507: Scottywong" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "59 / 507: Themfromspace" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "60 / 507: Shrigley" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "61 / 507: LtGen" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "62 / 507: Nick" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "63 / 507: Steeletrap" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "64 / 507: Michael Dorosh" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "65 / 507: Yourself In Person" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "66 / 507: Fs" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "67 / 507: Juno" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "68 / 507: Me and" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "69 / 507: Sophie means wisdom" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "70 / 507: Ericloewe" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "71 / 507: Toyokuni3" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "72 / 507: AnonNep" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "73 / 507: Ileanadu" ] }, { "output_type": 
"stream", "stream": "stdout", "text": [ "\n", "74 / 507: Jeude54cartes" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "75 / 507: Zoe Brain" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "76 / 507: Vinithehat" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "77 / 507: Cengime" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "78 / 507: Abeg92" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "79 / 507: Born2cycle" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "80 / 507: Kevin W." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "81 / 507: Sovetus" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "82 / 507: Sj" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "83 / 507: 91.153.87.155" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "84 / 507: Wadewitz" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "85 / 507: Katana geldar" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "86 / 507: Vigyani" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "87 / 507: Solomonfromfinland" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "88 / 507: Mareklug" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "89 / 507: DrCruse" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "90 / 507: Eopsid" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "91 / 507: Scray" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "92 / 507: Theodolite" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "93 / 507: Dralwik" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "94 / 507: Snappy" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "95 / 507: PublicAmpersand" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "96 / 507: Zaphody3k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "97 / 507: Agmonaco" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "98 / 507: Liz" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "99 / 507: SqueakBox" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "100 / 507: Crumpled Fire" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "101 / 507: A Thousand Doors" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "102 / 507: AzureCitizen" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "103 / 507: Hitmonchan" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "104 / 507: Hamiltonstone" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "105 / 507: 83.128.147.107" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "106 / 507: Miraculouschaos" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "107 / 507: Dyrnych" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "108 / 507: Hobit" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "109 / 507: DanHakimi" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "110 / 507: Wikipeterproject" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "111 / 507: Cameron Scott" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "112 / 507: PikkoroDaimao" ] }, { "output_type": 
"stream", "stream": "stdout", "text": [ "\n", "113 / 507: GiantSnowman" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "114 / 507: Kelly" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "115 / 507: Cimon Avaro" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "116 / 507: 86.16.146.123" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "117 / 507: ThinkEnemies" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "118 / 507: KTC" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "119 / 507: Shii" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "120 / 507: BHC" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "121 / 507: Thegreatdr" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "122 / 507: Joefromrandb" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "123 / 507: Milkunderwood" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "124 / 507: Maximilian Sch\u00f6nherr" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "125 / 507: Kaldari" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "126 / 507: DHeyward" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "127 / 507: Byposted" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "128 / 507: Almonroth" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "129 / 507: Srlevine1" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "130 / 507: BlueSalix" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "131 / 507: Vanisaac" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "132 / 507: FutureTrillionaire" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "133 / 507: John Cline" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "134 / 507: Pointillist" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "135 / 507: Raeven0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "136 / 507: Psychologicaloric" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "137 / 507: Tennenrishin" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "138 / 507: Atshal" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "139 / 507: Modest Genius" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "140 / 507: 5minutes" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "141 / 507: Josepharari" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "142 / 507: Tbhotch" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "143 / 507: 70.89.234.49" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "144 / 507: TParis" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "145 / 507: JamesAM" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "146 / 507: Golbez" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "147 / 507: 208.163.239.119" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "148 / 507: FormerIP" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "149 / 507: StAnselm" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "150 / 507: Cyclopia" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "151 / 507: 
HiB2Bornot2B" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "152 / 507: Jayron32" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "153 / 507: Iselilja" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "154 / 507: Jojhutton" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "155 / 507: BFWB" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "156 / 507: Talmage" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "157 / 507: 24.22.47.95" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "158 / 507: K7L" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "159 / 507: Azirus" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "160 / 507: Smyth" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "161 / 507: Cavarrone" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "162 / 507: OtterSmith" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "163 / 507: Anthonyhcole" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "164 / 507: R. fiend" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "165 / 507: Michael Glass" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "166 / 507: Soerfm" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "167 / 507: Loadmaster" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "168 / 507: Daira Hopwood" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "169 / 507: 85.65.68.209" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "170 / 507: 99.192.64.222" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "171 / 507: Kiralexis" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "172 / 507: DPRoberts534" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "173 / 507: 98.157.156.137" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "174 / 507: Insulam Simia" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "175 / 507: U-Mos" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "176 / 507: 2001:5C0:1000:A:0:0:0:49D" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "177 / 507: Jburman" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "178 / 507: Malerooster" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "179 / 507: Thehistorian10" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "180 / 507: Fightin' Phillie" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "181 / 507: Safiel" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "182 / 507: Coemgenus" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "183 / 507: Jackmcbarn" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "184 / 507: Archaeo" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "185 / 507: AlexTiefling" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "186 / 507: NativeForeigner" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "187 / 507: Belorn" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "188 / 507: LukeSurl" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "189 / 507: 86.173.69.123" ] }, { "output_type": "stream", "stream": 
"stdout", "text": [ "\n", "190 / 507: Eregli bob" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "191 / 507: Nicholas Perkins" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "192 / 507: Amatulic" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "193 / 507: Gtadood" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "194 / 507: Torquemama007" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "195 / 507: Casiotone" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "196 / 507: Jean-Jacques Georges" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "197 / 507: Dainamo" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "198 / 507: Labattblueboy" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "199 / 507: Phil Sandifer" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "200 / 507: Pez Dispens3r" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "201 / 507: Bob bobato" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "202 / 507: DragonflySixtyseven" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "203 / 507: Bright Darkness" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "204 / 507: Psychonaut" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "205 / 507: Sbingner" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "206 / 507: Thebirdlover" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "207 / 507: Ukrained2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "208 / 507: AutomaticStrikeout" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "209 / 507: Maproom" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "210 / 507: GeorgeLouis" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "211 / 507: 69.244.220.253" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "212 / 507: 71.231.186.92" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "213 / 507: Synchronism" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "214 / 507: JCO312" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "215 / 507: Tariqabjotu" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "216 / 507: 71.90.172.117" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "217 / 507: Chris G" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "218 / 507: Obiwankenobi" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "219 / 507: Mr. 
Stradivarius" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "220 / 507: GenericBob" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "221 / 507: TheCatalyst31" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "222 / 507: 71.116.34.80" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "223 / 507: A.amitkumar" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "224 / 507: Sluffs" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "225 / 507: Vegaswikian" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "226 / 507: Tombomp" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "227 / 507: KathrynBrooks1" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "228 / 507: Canoe1967" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "229 / 507: 71.179.167.242" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "230 / 507: 184.152.74.159" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "231 / 507: Lacarids" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "232 / 507: Gymnophoria" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "233 / 507: Miranda1989" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "234 / 507: Robin Lionheart" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "235 / 507: GrimmC" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "236 / 507: 7daysahead" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "237 / 507: Richard75" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "238 / 507: GregorB" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "239 / 507: 97.123.210.252" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "240 / 507: Agnosticaphid" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "241 / 507: MONGO" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "242 / 507: Mpgviolist" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "243 / 507: Hebel" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "244 / 507: NinjaRobotPirate" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "245 / 507: Silver seren" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "246 / 507: Giants27" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "247 / 507: Brandmeister" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "248 / 507: Surfer43" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "249 / 507: Tarc" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "250 / 507: BrianJ34" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "251 / 507: Blueboar" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "252 / 507: Fighter1stClass" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "253 / 507: Maunus" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "254 / 507: Walterego" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "255 / 507: LlywelynII" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "256 / 507: QuackCD" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "257 / 507: BabbaQ" ] }, { "output_type": "stream", "stream": 
"stdout", "text": [ "\n", "258 / 507: Sandstein" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "259 / 507: BD2412" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "260 / 507: 74.138.45.132" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "261 / 507: 88.66.37.221" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "262 / 507: Alaric" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "263 / 507: Theodore!" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "264 / 507: Penwhale" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "265 / 507: Blackbird 4" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "266 / 507: JDiala" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "267 / 507: Cls14" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "268 / 507: Dicklyon" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "269 / 507: Guy Macon" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "270 / 507: Dorsal Axe" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "271 / 507: Count Iblis" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "272 / 507: Cymru.lass" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "273 / 507: Fritzendugan" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "274 / 507: Muboshgu" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "275 / 507: PauAmma" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "276 / 507: TripleU" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "277 / 507: Ajfweb" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "278 / 507: Taylor Trescott" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "279 / 507: S\u00f8ren" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "280 / 507: Helixdq" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "281 / 507: Gobonobo" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "282 / 507: Alanscottwalker" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "283 / 507: 84.18.241.143" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "284 / 507: Mike Rosoft" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "285 / 507: Netcrusher88" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "286 / 507: 2001:558:6024:12:10BB:B8E3:A9F3:C3C3" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "287 / 507: White whirlwind" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "288 / 507: Andrewman327" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "289 / 507: Sportfan5000" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "290 / 507: Tivanir2" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "291 / 507: ItsZippy" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "292 / 507: A Quest For Knowledge" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "293 / 507: Yintan" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "294 / 507: Another Believer" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "295 / 507: AjaxSmack" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "296 
/ 507: 151.230.243.44" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "297 / 507: Berean Hunter" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "298 / 507: Tryptofish" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "299 / 507: XMattingly" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "300 / 507: Jonie148" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "301 / 507: \u1f49 \u03bf\u1f36\u03c3\u03c4\u03c1\u03bf\u03c2" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "302 / 507: Jonathandeamer" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "303 / 507: Emarsee" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "304 / 507: JasonCNJ" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "305 / 507: MightySaiyan" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "306 / 507: 108.247.32.232" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "307 / 507: Writegeist" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "308 / 507: And Adoil Descended" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "309 / 507: 71.68.234.176" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "310 / 507: TheScootz" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "311 / 507: Risker" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "312 / 507: Sam Blacketer" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "313 / 507: SlimVirgin" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "314 / 507: JASpencer" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "315 / 507: Woody" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "316 / 507: Bdell555" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "317 / 507: Phoebe" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "318 / 507: 168.12.253.66" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "319 / 507: Hot Stop" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "320 / 507: Srich32977" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "321 / 507: 86.153.186.25" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "322 / 507: 181.179.58.111" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "323 / 507: Count Truthstein" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "324 / 507: Alex Hortman" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "325 / 507: Thatbox" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "326 / 507: George Ho" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "327 / 507: InedibleHulk" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "328 / 507: Isaidnoway" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "329 / 507: My very best wishes" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "330 / 507: Gaurav" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "331 / 507: Saxman1984" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "332 / 507: Mohamed CJ" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "333 / 507: 65.51.209.126" ] }, { "output_type": "stream", "stream": "stdout", "text": [ 
"\n", "334 / 507: Cindamuse" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "335 / 507: MaxHarmony" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "336 / 507: HandsomeFella" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "337 / 507: Yonskii" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "338 / 507: 198.161.2.241" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "339 / 507: Wnt" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "340 / 507: Hbdragon88" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "341 / 507: Martylunsford" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "342 / 507: Wikid77" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "343 / 507: Shemp Howard, Jr." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "344 / 507: 173.178.34.11" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "345 / 507: Gaijin42" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "346 / 507: Eclecticology" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "347 / 507: Red Slash" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "348 / 507: 76.65.128.222" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "349 / 507: Baseball Bugs" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "350 / 507: Redrose64" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "351 / 507: 82.42.38.252" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "352 / 507: IFreedom1212" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "353 / 507: Jehochman" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "354 / 507: Ken Arromdee" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "355 / 507: Trystan" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "356 / 507: Grolltech" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "357 / 507: NewAccount4Me" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "358 / 507: Totorotroll" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "359 / 507: Moncrief" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "360 / 507: Numaz\u0130s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "361 / 507: LudicrousTripe" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "362 / 507: Toddy1" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "363 / 507: Soranoch" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "364 / 507: M.thoriyan" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "365 / 507: Welshsocialist" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "366 / 507: Eddpayne" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "367 / 507: Jayen466" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "368 / 507: Cowcharge" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "369 / 507: Nil Einne" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "370 / 507: Jbower47" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "371 / 507: 159.83.196.1" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "372 / 507: Foofbun" ] }, { 
"output_type": "stream", "stream": "stdout", "text": [ "\n", "373 / 507: Countered" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "374 / 507: McGeddon" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "375 / 507: Fyunck(click)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "376 / 507: Iamcuriousblue" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "377 / 507: NickCT" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "378 / 507: 88.73.34.231" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "379 / 507: Haxwell" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "380 / 507: 23 editor" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "381 / 507: 92.29.51.58" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "382 / 507: Edge3" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "383 / 507: SarekOfVulcan" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "384 / 507: Smowton" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "385 / 507: 190.103.67.169" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "386 / 507: Timrollpickering" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "387 / 507: Cjarbo2" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "388 / 507: Norden1990" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "389 / 507: Kairi Izumi" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "390 / 507: FoxyOrange" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "391 / 507: Mark Arsten" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "392 / 507: 2.80.208.56" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "393 / 507: Bearcat" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "394 / 507: Labellementeuse" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "395 / 507: Surtsicna" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "396 / 507: I JethroBT" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "397 / 507: Anagogist" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "398 / 507: DracoEssentialis" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "399 / 507: Njardarlogar" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "400 / 507: ColonelHenry" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "401 / 507: Floydian" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "402 / 507: Mattgirling" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "403 / 507: 69.155.81.253" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "404 / 507: Jaakko Sivonen" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "405 / 507: IRWolfie-" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "406 / 507: KumiokoCleanStart" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "407 / 507: Aoidh" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "408 / 507: 142.161.97.237" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "409 / 507: PenguiN42" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "410 / 507: Collect" ] }, { "output_type": "stream", "stream": 
"stdout", "text": [ "\n", "411 / 507: MrDolomite" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "412 / 507: Oren0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "413 / 507: McPhail" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "414 / 507: OohBunnies!" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "415 / 507: Sailsbystars" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "416 / 507: Joseph A. Spadaro" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "417 / 507: Wester" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "418 / 507: 68.81.192.33" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "419 / 507: Randy2063" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "420 / 507: Lyo" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "421 / 507: StuartH" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "422 / 507: OSborn" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "423 / 507: Niemti" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "424 / 507: Haipa Doragon" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "425 / 507: Steven Zhang" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "426 / 507: Wasmachien" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "427 / 507: 71.184.71.199" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "428 / 507: GregJackP" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "429 / 507: Deep Purple Dreams" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "430 / 507: Robofish" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "431 / 507: Longsight" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "432 / 507: Ginsengbomb" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "433 / 507: PiMaster3" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "434 / 507: AndyTheGrump" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "435 / 507: Mark Miller" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "436 / 507: PBS" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "437 / 507: Rannph\u00e1irt\u00ed anaithnid" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "438 / 507: Thryduulf" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "439 / 507: Space simian" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "440 / 507: Morwen" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "441 / 507: SchreiberBike" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "442 / 507: CFynn" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "443 / 507: Badanagram" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "444 / 507: -sche" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "445 / 507: Yetisyny" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "446 / 507: Carrite" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "447 / 507: Dmarquard" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "448 / 507: VictusB" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "449 / 507: Sca" ] }, { "output_type": 
"stream", "stream": "stdout", "text": [ "\n", "450 / 507: Dirac66" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "451 / 507: LionMans Account" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "452 / 507: Scs" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "453 / 507: Bwmoll3" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "454 / 507: Bluerasberry" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "455 / 507: April Arcus" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "456 / 507: Antonio Hazard" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "457 / 507: Thinking of England" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "458 / 507: 94.31.32.30" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "459 / 507: Dee Earley" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "460 / 507: 108.226.20.130" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "461 / 507: JohnValeron" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "462 / 507: Tocino" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "463 / 507: Stryn" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "464 / 507: 97.90.153.202" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "465 / 507: General Staal" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "466 / 507: Josh Gorand" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "467 / 507: Rinnenadtrosc" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "468 / 507: Adrian" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "469 / 507: JasonJack" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "470 / 507: Alandeus" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "471 / 507: Abductive" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "472 / 507: Ross Hill" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "473 / 507: Cerejota" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "474 / 507: LFaraone" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "475 / 507: Lawsonstu" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "476 / 507: DebashisM" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "477 / 507: Crisis" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "478 / 507: An Editor With a Self-Referential Name" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "479 / 507: WeldNeck" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "480 / 507: Shoeless Ho" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "481 / 507: Somchai Sun" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "482 / 507: Paul Erik" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "483 / 507: CombatWombat42" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "484 / 507: Neutron" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "485 / 507: Amitabho" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "486 / 507: Bob K31416" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "487 / 507: 202.174.184.14" ] }, { "output_type": "stream", "stream": "stdout", 
"text": [ "\n", "488 / 507: Andy Dingley" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "489 / 507: 91.125.230.213" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "490 / 507: Uvaduck" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "491 / 507: Daniel32708" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "492 / 507: FeydHuxtable" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "493 / 507: Mjb" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "494 / 507: Ishmael reis" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "495 / 507: Mispy" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "496 / 507: NorthBySouthBaranof" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "497 / 507: Prototime" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "498 / 507: Alex Bakharev" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "499 / 507: Stephan Schulz" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "500 / 507: Hurtsmyears" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "501 / 507: Pigsonthewing" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "502 / 507: Rgrasmus" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "503 / 507: Sue Gardner" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "504 / 507: Knowledgekid87" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "505 / 507: Tazerdadog" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "506 / 507: Wing gundam" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "507 / 507: 90.210.192.246" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 12 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get all of the links from each of these pages." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "hyperlink_dict = dict()\n", "for i,a in enumerate(category_members):\n", " print u'{0} / {1} : {2}'.format(i+1,len(category_members),a)\n", " hyperlink_dict[a] = get_page_outlinks_from_content(a,'es')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1 / 18 : Elecciones federales en M\u00e9xico de 2012\n", "2 / 18 : Elecciones estatales de Campeche de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "3 / 18 : Elecciones estatales en Chiapas de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "4 / 18 : Elecciones estatales de Colima de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "5 / 18 : Elecciones en el Distrito Federal (M\u00e9xico) de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "6 / 18 : Elecciones estatales del Estado de M\u00e9xico de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "7 / 18 : Elecciones estatales en Guanajuato de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "8 / 18 : Elecciones estatales de Guerrero de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "9 / 18 : Elecciones estatales extraordinarias de Hidalgo de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "10 / 18 : Elecciones estatales en Jalisco de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "11 / 18 : Elecciones estatales extraordinarias de Michoac\u00e1n de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "12 / 18 : Elecciones estatales de Morelos de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "13 / 18 : Elecciones estatales de Nuevo Le\u00f3n de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "14 / 18 : Elecciones estatales de Quer\u00e9taro de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "15 / 18 : Elecciones estatales en San Luis Potos\u00ed de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "16 / 18 : Elecciones estatales en Tabasco de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "17 / 18 : Elecciones estatales de 2012 en Yucat\u00e1n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "18 / 18 : Elecciones estatales extraordinarias de Yucat\u00e1n de 2012" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 50 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a set of alters to crawl in turn, excluding links to categories, files, and archives." ] }, { "cell_type": "code", "collapsed": false, "input": [ "hyperlink_alters = list()\n", "for ego,alters in hyperlink_dict.iteritems():\n", " alters = list(set(alters))\n", " for alter in alters:\n", " if u'Categor\\xeda:' not in alter and u'Anexo:' not in alter and u'Archivo:' not in alter:\n", " hyperlink_alters.append(alter)\n", "\n", "hyperlink_alters = list(set(hyperlink_alters))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 56 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Crawl these alters and add their alters to the hyperlink dictionary. Some pages may not exist, in which case ignore them." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "for i,a in enumerate(hyperlink_alters):\n", " print u'{0} / {1} : {2}'.format(i+1,len(hyperlink_alters),a)\n", " try:\n", " hyperlink_dict[a] = get_page_outlinks_from_content(a,'es')\n", " except KeyError:\n", " print u\"...{0} doesn't exist\".format(a)\n", " pass" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34mu'{0} / {1} : {2}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhyperlink_alters\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mhyperlink_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_page_outlinks_from_content\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'es'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34mu\"...{0} doesn't exist\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mget_page_outlinks_from_content\u001b[0;34m(page_title, lang)\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_page_outlinks_from_content\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpage_title\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mlang\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mpage_title\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcast_to_unicode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpage_title\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m \u001b[0mpage_title\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrename_on_redirect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpage_title\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;31m# Get content from most recent revision of an article\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mrename_on_redirect\u001b[0;34m(article_title)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m'prop'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'info'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m'action'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'query'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m 'redirects': 'True'})\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'redirects'\u001b[0m 
\u001b[0;32min\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m'pages'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0marticle_title\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'redirects'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'to'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mwikipedia_query\u001b[0;34m(query_params, lang)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0msite\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwiki\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWiki\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'http://'\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mlang\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'.wikipedia.org/w/api.php'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0mrequest\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mapi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mAPIRequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msite\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 39\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 40\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mquery_params\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'action'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/wikitools/api.pyc\u001b[0m in \u001b[0;36mquery\u001b[0;34m(self, querycontinue)\u001b[0m\n\u001b[1;32m 137\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 139\u001b[0;31m \u001b[0mrawdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getRaw\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 140\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__parseJSON\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrawdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[0;31m#Certain errors should probably be handled here...\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/wikitools/api.pyc\u001b[0m in \u001b[0;36m__getRaw\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 212\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0mcatcherror\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mException\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 214\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopener\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 215\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mgzip\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc\u001b[0m in \u001b[0;36mopen\u001b[0;34m(self, fullurl, data, timeout)\u001b[0m\n\u001b[1;32m 398\u001b[0m \u001b[0mreq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmeth\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 399\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 400\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 401\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 402\u001b[0m \u001b[0;31m# post-process response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc\u001b[0m in \u001b[0;36m_open\u001b[0;34m(self, req, data)\u001b[0m\n\u001b[1;32m 416\u001b[0m \u001b[0mprotocol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 417\u001b[0m result = self._call_chain(self.handle_open, protocol, protocol +\n\u001b[0;32m--> 418\u001b[0;31m '_open', req)\n\u001b[0m\u001b[1;32m 419\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 420\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc\u001b[0m in \u001b[0;36m_call_chain\u001b[0;34m(self, chain, kind, meth_name, *args)\u001b[0m\n\u001b[1;32m 376\u001b[0m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhandler\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 377\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 378\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 379\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 380\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc\u001b[0m in \u001b[0;36mhttp_open\u001b[0;34m(self, req)\u001b[0m\n\u001b[1;32m 1205\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1206\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mhttp_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1207\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhttplib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mHTTPConnection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1209\u001b[0m \u001b[0mhttp_request\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAbstractHTTPHandler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_request_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc\u001b[0m in \u001b[0;36mdo_open\u001b[0;34m(self, http_class, req)\u001b[0m\n\u001b[1;32m 1178\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1179\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1180\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1181\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# buffering kw not supported\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1182\u001b[0m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc\u001b[0m in \u001b[0;36mgetresponse\u001b[0;34m(self, buffering)\u001b[0m\n\u001b[1;32m 1028\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresponse_class\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1029\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1030\u001b[0;31m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbegin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1031\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwill_close\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0m_UNKNOWN\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1032\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__state\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_CS_IDLE\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc\u001b[0m in \u001b[0;36mbegin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 405\u001b[0m \u001b[0;31m# read until we get a non-100 
response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 406\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 407\u001b[0;31m \u001b[0mversion\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreason\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 408\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mCONTINUE\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 409\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc\u001b[0m in \u001b[0;36m_read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;31m# Initialize with Simple-Response defaults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 365\u001b[0;31m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebuglevel\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34m\"reply:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrepr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.pyc\u001b[0m in \u001b[0;36mreadline\u001b[0;34m(self, size)\u001b[0m\n\u001b[1;32m 445\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 446\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 447\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_rbufsize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 448\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 449\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mEINTR\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "1 / 847 : \n" ] } ], "prompt_number": 57 }, { "cell_type": "code", "collapsed": false, "input": [ "hyperlink_graph = nx.DiGraph()\n", "for ego,alters in hyperlink_dict.iteritems():\n", " for alter in alters:\n", " if alter in 
hyperlink_dict.keys():\n", "            hyperlink_graph.add_edge(ego,alter)\n", "nx.write_graphml(hyperlink_graph,'hyperlinks.graphml')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 45 }, { "cell_type": "code", "collapsed": false, "input": [ "# Build a bipartite user-article network from the alters' revision histories.\n", "# Note: this is a crude bot filter; it drops any username containing 'bot'.\n", "net = nx.DiGraph()\n", "for article,revisions in alter_revs.iteritems():\n", "    for revision in revisions:\n", "        if 'user' in revision.keys() and 'bot' not in revision['user']:\n", "            try:\n", "                net[revision['user']][revision['title']]['weight'] += 1\n", "            except KeyError:\n", "                net.add_node(revision['user'],node_type='user')\n", "                net.add_node(revision['title'],node_type='article')\n", "                net.add_edge(revision['user'],revision['title'],weight=1)\n", "\n", "net_articles = [i for i,j in net.nodes(data=True) if j['node_type'] == 'article']\n", "net_users = [i for i,j in net.nodes(data=True) if j['node_type'] == 'user']\n", "\n", "len(net_users)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 168, "text": [ "2443" ] } ], "prompt_number": 168 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Load from Pickle" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import cPickle # Needed here; not among the imports at the top of the notebook\n", "\n", "result = cPickle.load(open('Boston_Marathon_bombings.p','rb'))\n", "revisions_dict = dict()\n", "page_number = result['pages'].keys()[0]\n", "revisions = result['pages'][page_number]['revisions']\n", "for revision in revisions:\n", "    rev = dict()\n", "    rev['pageid'] = page_number\n", "    rev['title'] = result['pages'][page_number]['title']\n", "    rev['size'] = revision.get('size', 0) # The size key is sometimes absent; default to 0\n", "    rev['timestamp'] = convert_to_datetime(revision['timestamp'])\n", "    rev['content'] = revision.get('*',unicode()) # Content is sometimes hidden; default to an empty unicode string\n", "    rev['links'] = link_finder(rev['content'])\n", "    rev['username'] = revision['user']\n", "    rev['userid'] = revision['userid']\n", "    rev['revid'] = revision['revid']\n", "    revisions_dict[revision['revid']] = rev" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "def adjacency_calcs(revisions):\n", "    # Sort chronologically within each page, then accumulate running statistics\n", "    revisions = sorted(revisions,key=itemgetter('pageid','timestamp'))\n", "    revisions[0]['position'] = 0\n", "    revisions[0]['edit_lag'] = datetime.timedelta(0)\n", "    revisions[0]['bytes_added'] = revisions[0]['size']\n", "    revisions[0]['unique_users'] = [revisions[0]['username']]\n", "    revisions[0]['unique_users_count'] = 1\n", "    revisions[0]['article_age'] = datetime.timedelta(0) # timedelta, for consistency with later revisions\n", "    for num,rev in enumerate(revisions[:-1]):\n", "        revisions[num+1]['position'] = rev['position'] + 1\n", "        revisions[num+1]['edit_lag'] = revisions[num+1]['timestamp'] - rev['timestamp']\n", "        revisions[num+1]['bytes_added'] = revisions[num+1]['size'] - rev['size']\n", "\n", "        # Copy the running user list before adding to it so we don't\n", "        # mutate the previous revision's 'unique_users' in place\n", "        revisions[num+1]['unique_users'] = list(set(rev['unique_users'] + [revisions[num+1]['username']]))\n", "\n", "        revisions[num+1]['unique_users_count'] = len(revisions[num+1]['unique_users'])\n", "        revisions[num+1]['article_age'] = revisions[num+1]['timestamp'] - revisions[0]['timestamp']\n", "    return revisions" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 42 }
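, { "cell_type": "markdown", "metadata": {}, "source": [ "A brief usage sketch, added for illustration rather than taken from the original run: apply adjacency_calcs to the revisions loaded above and tabulate a few of the derived fields with pandas. The chosen columns are an arbitrary assumption." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Hypothetical usage: revisions_dict maps revid -> revision dict (built above)\n", "adjacent_revisions = adjacency_calcs(revisions_dict.values())\n", "\n", "# Tabulate some of the derived fields with pandas (imported above as pd)\n", "df = pd.DataFrame(adjacent_revisions)\n", "print df[['position','username','bytes_added','unique_users_count']].head()" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }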