# %run "libraries.ipynb"  -- in the notebook this supplies codecs, json, nx (networkx), ...
# Explicit imports kept here so the cell also runs on a fresh kernel without the helper notebook.
import codecs
import json

import networkx as nx
from IPython.display import display, HTML
from bs4 import BeautifulSoup

# ## loading the original namespace
#
# The controlled namespace: one wikipedia page title per line, extracted from
# the "List of geometry topics" page.
# NOTE: the original used `map(...)` without materializing -- fine on py2 (list),
# broken on py3 (single-use iterator); a list comprehension works on both.
with codecs.open("data/pagenames.txt", "r", "utf-8-sig") as f:
    pages = [line.strip() for line in f]

# ## finding wikipedia links in the page content

hyperlinks_graph = nx.DiGraph()


def get_content(page):
    """Return the raw revision text of `page` from its cached MediaWiki API dump.

    Expected JSON layout (MediaWiki API response, seen in the original code):
    query -> pages -> {<page id>: {"revisions": [{"*": <content>}, ...]}}.
    """
    with codecs.open("data/pages/%s.json" % (page,), "r", "utf-8-sig") as f:
        j = json.load(f)
    # py2's `.keys()[0]` is a TypeError on py3 dict views; next(iter(...)) is
    # equivalent on both (the dump holds a single page entry).
    page_id = next(iter(j["query"]["pages"]))
    return j["query"]["pages"][page_id]["revisions"][0]["*"]


def find_hyperlinks(page):
    """Return every <a> tag found in the content of `page`."""
    soup = BeautifulSoup(get_content(page), "html.parser")
    return soup.find_all("a")


for p in pages:
    occurences_link = {}
    occurences_named_entity = {}

    anchors = find_hyperlinks(p)
    content = u"" + get_content(p)

    # keep only the title of each link; some hyperlinks have no title
    hyperlinks = [a.get("title") for a in anchors if a.get("title") is not None]

    # sort hyperlinks by decreasing title length to get a more precise n-graming:
    # otherwise terms like "triangle" are over-evaluated because of longer terms
    # like "equilateral triangle"
    hyperlinks = sorted(hyperlinks, key=len, reverse=True)

    gruyere = content

    for k in hyperlinks:
        # count link occurrences (duplicate titles increment the tally)
        occurences_link[k] = occurences_link.get(k, 0) + 1

        # count term occurrences -- only on first sight of k: the original
        # re-assigned on every duplicate, overwriting the real count with 0
        # because k had already been carved out of `gruyere` below
        if k not in occurences_named_entity:
            occurences_named_entity[k] = gruyere.count(k)

        # carve the matched term out so shorter titles are not recounted inside it
        gruyere = gruyere.replace(k, "")

    # reduce to a list of unique items
    hyperlinks = list(set(hyperlinks))

    # keep only linked pages that are inside the initial domain
    intradomain_pages = set(hyperlinks) & set(pages)

    for target in intradomain_pages:
        edge_info = {"link occurence": occurences_link[target],
                     "term occurence": occurences_named_entity[target]}
        hyperlinks_graph.add_edge(p, target, attr_dict=edge_info)

print("nodes: %s" % len(hyperlinks_graph.nodes()))
print("edges: %s" % len(hyperlinks_graph.edges()))

# ## storing the result graph
nx.write_gexf(hyperlinks_graph, "data/hyperlinks.gexf")
# ## community detection (louvain)
# python-louvain's best_partition works on undirected graphs only, hence the
# conversion; result maps each node to its community id.
import community

undirected_graph = hyperlinks_graph.to_undirected()
partitions = community.best_partition(undirected_graph)
group %s | \" % (c)\n", " html += \", \".join(map(lambda x: u\"{0}\".format(x), ps))\n", " html += \" |