{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# hyperlinks network" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%run \"libraries.ipynb\"\n", "\n", "from IPython.display import display, HTML\n", "\n", "from bs4 import BeautifulSoup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## loading the original namespace\n", "\n", "In order to control our namespace, we are importing [a list of wikipedia pages](https://github.com/WeKeyPedia/notebooks/blob/master/geometry/data/pagenames.txt) that we extracted from the [List of geometry topics](http://en.wikipedia.org/wiki/list_of_geometry_topics)." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "pages = codecs.open(\"data/pagenames.txt\",\"r\", \"utf-8-sig\").readlines()\n", "pages = map(lambda x: x.strip(), pages)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## finding wikipedia links in the page content" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "nodes: 300\n", "edges: 3781\n" ] } ], "source": [ "hyperlinks_graph = nx.DiGraph()\n", "\n", "def get_content(page):\n", " with codecs.open(\"data/pages/%s.json\" % (page), \"r\", \"utf-8-sig\") as f:\n", " j = json.load(f)\n", " content = j[\"query\"][\"pages\"][j[\"query\"][\"pages\"].keys()[0]]\n", " content = content[\"revisions\"][0][\"*\"]\n", "\n", " return content\n", "\n", "def find_hyperlinks(page):\n", " hyperlinks = []\n", "\n", " content = get_content(page)\n", " content = BeautifulSoup(content, 'html.parser') \n", "\n", " hyperlinks = content.find_all('a')\n", " \n", " return hyperlinks\n", "\n", "for p in pages:\n", " occurences_link = {}\n", " occurences_named_entity = {}\n", "\n", " hyperlinks = find_hyperlinks(p)\n", " content = u\"\" + get_content(p)\n", " # keep only title of the link\n", " hyperlinks = map(lambda x: x.get(\"title\"), hyperlinks)\n", "\n", " # some hyperlinks have no title\n", " hyperlinks = [ x for x in hyperlinks if x != None ]\n", " \n", " # sorted hyperlinks by title length in order to get a more precise n-graming\n", " # otherwise terms like \"triangle\" are over-evaluated because of other terms\n", " # like \"equilateral triangle\"\n", " hyperlinks = sorted(hyperlinks, key=lambda k: -len(k))\n", " \n", " gruyere = content\n", " \n", " for k in hyperlinks:\n", " # count occurences links\n", " occurences_link.setdefault(k, 0)\n", " occurences_link[k] += 1\n", " \n", " # count occurences terms\n", " occurences_named_entity.setdefault(k, 0)\n", " occurences_named_entity[k] = unicode(gruyere).count(unicode(k))\n", " \n", " gruyere = gruyere.replace(k, \"\")\n", "\n", " # print \"coverage: %s/%s (%s%%)\" % (len(gruyere), len(content), float(len(gruyere))*100/float(len(content)))\n", " # reduce to a list of unique items\n", " \n", " hyperlinks = list(set(hyperlinks))\n", "\n", " # keep only linked pages that are inside the initial domain\n", " intradomain_pages = set(hyperlinks) & set(pages)\n", " extradomain_pages = set(hyperlinks) - set(pages)\n", "\n", "# print len( hyperlinks )\n", "# print len( intradomain_pages )\n", "# print len( extradomain_pages )\n", " \n", " for target in intradomain_pages:\n", " edge_info = { \"link occurence\": occurences_link[target], \"term occurence\": occurences_named_entity[target] }\n", " hyperlinks_graph.add_edge(p, target, attr_dict=edge_info)\n", "\n", "print \"nodes: %s\" % len(hyperlinks_graph.nodes())\n", "print \"edges: %s\" % len(hyperlinks_graph.edges())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## storing the result graph" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "nx.write_gexf(hyperlinks_graph, \"data/hyperlinks.gexf\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## community detection (louvain)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import community\n", "partitions = community.best_partition(hyperlinks_graph.to_undirected())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "

group 0

Bézier curve, Integral geometry, Discrete geometry, Polygon triangulation, Ehrhart polynomial, Point in polygon, 3D computer graphics, Convex hull, Digital geometry, Spline (mathematics), Geometry of numbers, Non-uniform rational B-spline, Euclidean shortest path, Minkowski's theorem, Image analysis, Delaunay triangulation, Pick's theorem, Computational geometry, B-spline, Hidden line removal, Binary space partitioning, Computer graphics, Minkowski addition, Graham scan, Constructive solid geometry, Convex geometry, Ray tracing (graphics), Point location

group 1

Elliptic geometry, Geometrization conjecture, Absolute geometry, Shear mapping, Invariant (mathematics), Ordered geometry, Congruence (geometry), Annulus (mathematics), Klein geometry, Symplectic geometry, Root system, Reflection (mathematics), Erlangen program, 2D computer graphics, Contact geometry, Parallel (geometry), Minkowski space, Similarity (geometry), Riemannian geometry, Non-Euclidean geometry, Information geometry, Homothetic transformation, Coordinate rotations and reflections, Systolic geometry, Line (geometry), Pseudosphere, Ruppeiner geometry, Euclidean geometry, Isometry, Affine transformation, Rotation (mathematics), Point (geometry), Hyperbolic geometry, Euclidean distance, Affine geometry, Sangaku, Translation (geometry), Pythagorean theorem, Four-dimensional space, Transformation geometry, Spherical geometry, Hadwiger's theorem, Differential geometry, Strähle construction, Parallel postulate, Hilbert's axioms

group 2

Triangle inequality, Spherical trigonometry, Symmedian, Squaring the circle, Isosceles trapezoid, Equilateral triangle, List of circle topics, Polar sine, Circle, Pi, Incircle and excircles of a triangle, Poncelet–Steiner theorem, Holditch's theorem, Golden angle, Tangential quadrilateral, Rectangle, Heron's formula, Circumscribed circle, Brahmagupta's formula, List of triangle inequalities, Ptolemy's theorem, Van Hiele model, Distance geometry, Power center (geometry), Nine-point circle, Angle trisection, Inscribed angle, Altitude (triangle), Dividing a circle into areas, List of interactive geometry software, Trigonometry, Quadrilateral, Curve of constant width, Kite (geometry), Isoperimetric inequality, Orthodiagonal quadrilateral, Bicentric quadrilateral, Central angle, Equidiagonal quadrilateral, Trapezoid, Pons asinorum, Sphericon, Orthocentric system, Euler line, Right triangle, Isosceles triangle, Concurrent lines, Astronomy, Taxicab geometry, Homothetic center, Integer triangle, Heronian triangle, Straightedge, Mrs. Miniver's problem, Reuleaux triangle, Ball (mathematics), Bretschneider's formula, Pedal triangle, Compass-and-straightedge construction, List of triangle topics, Parallelogram law, Thales' theorem, List of trigonometry topics, Pedoe's inequality, Angle, Acute and obtuse triangles, Pythagorean triple, Triangle, Concyclic points, Cyclic quadrilateral, Rhombus

group 3

Prismatoid, Kepler–Poinsot polyhedron, 2D geometric model, Point groups in three dimensions, Tetrahedron, Regular polytope, Polytope compound, Star polygon, Wallpaper group, Square, Penrose tiling, Space group, Convex uniform honeycomb, Polygon, Relative direction, Prototile, Parallelepiped, Aperiodic tiling, Honeycomb (geometry), Frieze group, Crystal, Internal and external angle, Deltahedron, Zonohedron, Regular Polytopes (book), Pyramid (geometry), Quasicrystal, Fractal, Polyhedron, Polytope, Lattice (group), Symmetry, Prism (geometry), Voronoi diagram, Wallace–Bolyai–Gerwien theorem, Glide reflection, Uniform tessellation, Pattern, Wang tile, Platonic solid, Roman surface, Dissection problem, Dihedral angle, Coxeter group, Point groups in two dimensions, Angular defect, Tessellation, Uniform polyhedron, Hilbert's third problem, Symmetry group, Chirality (mathematics), Schläfli symbol, Translational symmetry, Mirror image, Handedness, Johnson solid, Heronian tetrahedron, Archimedean solid, Crystal system, Point group

group 4

Steiner chain, Girard Desargues, Kissing number problem, Napkin ring problem, Eccentricity (mathematics), Dandelin spheres, Semi-major axis, Paraboloid, Pappus's centroid theorem, Descriptive geometry, Parametric surface, Mathematical morphology, Cone, Focus (geometry), Leech lattice, Cavalieri's principle, Parabolic reflector, Soddy's hexlet, Kepler conjecture, Geometry, Quadric, Analytic geometry, Conic section, Infinitesimal transformation, Coordinate-free, Sphere, Spheroid, Matrix representation of conic sections, Hyperbola, Mathematics and fiber arts, Torus, Solid geometry, Parabolic microphone, Shape, Ellipsoid, Parabola, Sphere packing, Normal (geometry), Parametric equation, Geometric shape, Locus (mathematics), The Method of Mechanical Theorems, Cross section (geometry), Hyperboloid, Ellipse

group 5

Incidence (geometry), Complex geometry, Oval (projective plane), Affine space, Enumerative geometry, Duality (projective geometry), Riemann sphere, Algebraic geometry, Birational geometry, Line at infinity, Group action, Topology, Inversive geometry, Parabolic geometry (differential geometry), Complex projective plane, Projective geometry, Borromean rings, Pascal's theorem, Hyperplane at infinity, Cross-ratio, Projective line over a ring, Finite geometry, Homogeneous coordinates, Projective line, Plane at infinity, N-sphere, Desargues' theorem, Lie sphere geometry, Point at infinity, 3-sphere, Quantum geometry, Monge's theorem, Incidence geometry, Toric variety, Tropical geometry, Pappus's hexagon theorem, Synthetic geometry, 3D projection, Arc (projective geometry), Projective plane, Möbius transformation, Mathematics, Noncommutative geometry, Conformal geometry, Stereographic projection, Homography, Hermite spline, Projective space, Hyperplane

group 6

Epipolar geometry
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def print_groups(communities):\n", " html = \"\"\n", "\n", " for c, ps in communities.iteritems():\n", " html += \"\"\n", " \n", " html += \"

group %s

\" % (c)\n", " html += \", \".join(map(lambda x: u\"{0}\".format(x), ps))\n", " html += \"
\"\n", "\n", " display(HTML(html))\n", " \n", "communities = {}\n", "for k, v in partitions.iteritems():\n", " communities.setdefault(v, []).append(k)\n", "\n", "print_groups(communities)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## explore local subgraphes" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":0: FutureWarning: IPython widgets are experimental and may change in the future.\n" ] } ], "source": [ "from IPython.html import widgets\n", "import matplotlib.patheffects as path_effects" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%config InlineBackend.figure_formats=['svg']\n", "\n", "def display_local_graph(page):\n", " nbunch = [ page ]\n", " nbunch.extend( list(hyperlinks_graph.to_undirected()[page]))\n", " g = hyperlinks_graph.subgraph(nbunch)\n", "\n", " #nx.draw_spring(g2)\n", " plt\n", " pos = nx.spring_layout(g,iterations=150)\n", " \n", " plt.figure(figsize=(10,10))\n", "\n", " nodes = nx.draw_networkx_nodes(g, pos, alpha=0.7)\n", " nodes.set_edgecolor('w')\n", " nx.draw_networkx_edges(g, pos, alpha=0.3, width=1)\n", " labels = nx.draw_networkx_labels(g, pos, font_size=8, font_family=\"Bitstream Vera Sans\")\n", " map(lambda x: labels[x].set_path_effects([path_effects.Stroke(linewidth=1, foreground='white'),\n", " path_effects.Normal()]),labels)\n", " \n", " plt.axis('off')\n", " plt.show()\n", "\n", "\n", "w = widgets.Select(description=\"page\", options=pages)\n", "widgets.interact(display_local_graph, page=w);" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.8" } }, "nbformat": 4, "nbformat_minor": 0 }