{
 "metadata": {
  "name": "",
  "signature": "sha256:9a606ffda65f7f3f2aabfad8caca6f72a9ce68e461e4250a1a643804b9cd9860"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pprint\n",
      "from difflib import SequenceMatcher\n",
      "\n",
      "# http://python-cluster.sourceforge.net/\n",
      "from cluster import HierarchicalClustering\n",
      "\n",
      "# Distance threshold passed to HierarchicalClustering.getlevel().\n",
      "DISTANCE_THRESHOLD = 0.2\n",
      "\n",
      "# Input urls to be clustered.\n",
      "# NOTE: every entry needs its trailing comma -- adjacent string literals\n",
      "# are silently concatenated by Python into a single (bogus) url.\n",
      "urls = [\n",
      "    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814385',\n",
      "    '#articles',\n",
      "    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814335',\n",
      "    'http://yro.slashdot.org/~drDugan/',\n",
      "    'http://web.sourceforge.com/privacy.php',\n",
      "    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815123',\n",
      "    'http://slashdot.org//slashdot.org/~Darkness404',\n",
      "    'http://slashdot.org//radio.slashdot.org',\n",
      "    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814429',\n",
      "    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814457',\n",
      "    'http://slashdot.org//slashdot.org/article.pl?sid=09/07/24/1545238',\n",
      "    'http://slashdot.org//slashdot.org/comments.pl?sid=09/07/24/1545238&cid=28810581',\n",
      "    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815269',\n",
      "    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814657',\n",
      "    'http://web.sourceforge.com/terms.php',\n",
      "    'http://slashdot.org//it.slashdot.org/search',\n",
      "    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814581',\n",
      "    'http://xkcd.com/612/',\n",
      "    'http://web.sourceforge.com/advertising',\n",
      "    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814785',\n",
      "]\n",
      "\n",
      "\n",
      "def distance(url1, url2):\n",
      "    \"\"\"Return the dissimilarity of two urls as ``1.0 - ratio``.\n",
      "\n",
      "    ``ratio`` is computed by SequenceMatcher from the standard module\n",
      "    difflib, so identical strings give 0.0 and strings with nothing in\n",
      "    common give 1.0.\n",
      "    \"\"\"\n",
      "    ratio = SequenceMatcher(None, url1, url2).ratio()\n",
      "    return 1.0 - ratio\n",
      "\n",
      "\n",
      "# Perform clustering\n",
      "hc = HierarchicalClustering(urls, distance)\n",
      "clusters = hc.getlevel(DISTANCE_THRESHOLD)\n",
      "\n",
      "pprint.pprint(clusters)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "type(urls)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "list"
       ]
      }
     ],
     "prompt_number": 4
    }
   ],
   "metadata": {}
  }
 ]
}