{ "metadata": { "name": "", "signature": "sha256:03e0033c320d99b62371b58129e8f62562c0b738c3376866576168d0cf67389f" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This script gets a lot of random pages from a Wikipedia if their Talk_page is in a category that describes their \"Class\" e.g. \"Stub\", \"Featured Article\" etc." ] }, { "cell_type": "code", "collapsed": false, "input": [ "import re\n", "from collections import defaultdict\n", "\n", "import pywikibot\n", "from pywikibot import pagegenerators\n", "\n", "enwp = pywikibot.Site('en','wikipedia')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Starting 1 threads...\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "def page_class(page):\n", "    \"\"\"Return the assessment class of `page`, or None if none is found.\n", "\n", "    Scans the categories of the page's talk page and returns the word that\n", "    precedes '-Class' in the first category title containing one.\n", "    \"\"\"\n", "    talk = page.toggleTalkPage()\n", "    for cat in talk.categories():\n", "        # Search the whole title: indexing split('Category:')[1] raised\n", "        # IndexError for any title without that exact prefix, and the prefix\n", "        # cannot be captured by \\w+ anyway because ':' is not a word character.\n", "        match = re.search(r'(\\w+)\\-Class', cat.title())\n", "        if match:\n", "            return match.group(1)\n", "    return None" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "N_BATCHES = 2001      # number of randompages() requests to issue\n", "#currently there is a bug in pywikibot that only allows 25 random pages at a time\n", "BATCH_SIZE = 25\n", "# the loop stops after the 24th page of each request; reason not recorded - TODO confirm\n", "PAGES_PER_BATCH = 24\n", "\n", "classed_pages = defaultdict(list)  # class name -> list of page texts\n", "skipped_titles = []                # titles of pages whose text could not be fetched\n", "\n", "for batch_no in range(N_BATCHES):\n", "    random_pages = enwp.randompages(namespaces=[0], step=BATCH_SIZE, total=BATCH_SIZE)\n", "    for count, page in enumerate(random_pages, 1):\n", "        wikiclass = page_class(page)\n", "        if wikiclass:\n", "            try:\n", "                text = page.get()\n", "            except pywikibot.exceptions.Error:\n", "                # One unreadable page must not abort a crawl of this size;\n", "                # record its title and carry on.\n", "                skipped_titles.append(page.title())\n", "            else:\n", "                # Test membership on the dict itself; .keys() built a new list every time.\n", "                if wikiclass not in classed_pages:\n", "                    print wikiclass\n", "                classed_pages[wikiclass].append(text)\n", "        if count == PAGES_PER_BATCH:\n", "            break\n", "print \"done\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en 
processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Stub\n", "Start" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "B" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "C" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "List" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "GA" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "FA" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Disambig" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "FL" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "A" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "NA" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Redirect" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { 
"output_type": "stream", "stream": "stdout", "text": [ "\n", "unassessed" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Http response status 503\n", "WARNING:pywiki:Http response status 503\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Non-JSON response received from server wikipedia:en; the server may be down.\n", "WARNING:pywiki:Non-JSON response received from server wikipedia:en; the server may be down.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "Set gcllimit = 2500\n", "INFO:pywiki:Set gcllimit = 2500\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Waiting 5 seconds before retrying.\n", "WARNING:pywiki:Waiting 5 seconds before retrying.\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Current" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Unassessed" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 
wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Future" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Needed" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en 
processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Http response status 503\n", "WARNING:pywiki:Http response status 503\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Non-JSON response received from server wikipedia:en; the server may be down.\n", "WARNING:pywiki:Non-JSON response received from server wikipedia:en; the server may be down.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "Set gcllimit = 2500\n", "INFO:pywiki:Set gcllimit = 2500\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Waiting 5 seconds before retrying.\n", "WARNING:pywiki:Waiting 5 seconds before retrying.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { 
"output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", 
"stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Http response status 503\n", "WARNING:pywiki:Http response status 503\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Non-JSON response received from server wikipedia:en; the server may be down.\n", "WARNING:pywiki:Non-JSON response received from server wikipedia:en; the server may be down.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "Set gcllimit = 2500\n", "INFO:pywiki:Set gcllimit = 2500\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Waiting 5 seconds before retrying.\n", "WARNING:pywiki:Waiting 5 seconds before retrying.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Deferred" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Http response status 503\n", "WARNING:pywiki:Http response status 503\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Non-JSON response received from server wikipedia:en; the server may be down.\n", "WARNING:pywiki:Non-JSON response received from server wikipedia:en; the server may be down.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "Set gcllimit = 2500\n", "INFO:pywiki:Set gcllimit = 2500\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: Waiting 5 seconds before retrying.\n", "WARNING:pywiki:Waiting 5 seconds before retrying.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": 
"stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": 
"stderr", "text": [ "VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "import json\n", "json.dump(classed_pages, open('test_class_data.json','w'))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "sum([len(l) for l in classed_pages.itervalues()])\n", "\n" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 19, "text": [ "38959" ] } ], "prompt_number": 19 } ], "metadata": {} } ] }