{
 "metadata": {
  "name": "",
  "signature": "sha256:725eb4cf4f4eeb216571343ab7a3f13921d5890e98ef0eaa1d5ca6323018e207"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#coded by Agung, PhD\n",
      "import pandas as pd\n",
      "import numpy as np\n",
      "import os, sqlite3\n",
      "import json\n",
      "from contextlib import closing\n",
      "\n",
      "\n",
      "# Directory holding the SQLite files to scan; every entry in it is tried.\n",
      "directory = \"D:/DATA/example\"\n",
      "# Selects the values stored under the bookmarks probe name.\n",
      "PROBE_QUERY = \"SELECT value FROM data WHERE name='BrowserBookmarksProbe'\"\n",
      "\n",
      "\n",
      "def load_probe_rows(database):\n",
      "    \"\"\"Return the JSON-decoded bookmark probe values stored in one database.\n",
      "\n",
      "    database -- path of the SQLite file to open.\n",
      "\n",
      "    Raises sqlite3.Error when the file cannot be opened or queried, and\n",
      "    ValueError when a stored value is not valid JSON.\n",
      "    \"\"\"\n",
      "    # closing() is needed because a sqlite3 connection used directly as a\n",
      "    # context manager only commits or rolls back; it never closes the file.\n",
      "    with closing(sqlite3.connect(database)) as conn:\n",
      "        # Query through sqlite3 itself so failures surface as sqlite3.Error,\n",
      "        # which is what the caller's except clause handles.\n",
      "        values = [value for (value,) in conn.execute(PROBE_QUERY)]\n",
      "    return [json.loads(value) for value in values]\n",
      "\n",
      "\n",
      "# Full path of every entry found in the data directory.\n",
      "databases = [os.path.join(directory, filename)\n",
      "             for filename in os.listdir(directory)]\n",
      "\n",
      "# Decoded probe records collected from all databases.\n",
      "dataku = []\n",
      "for database in databases:\n",
      "    try:\n",
      "        dataku.extend(load_probe_rows(database))\n",
      "    except sqlite3.Error as err:\n",
      "        # Report the failure and carry on with the next file.\n",
      "        print(\"[INFO] %s\" % err)\n",
      "\n",
      "url = [record['url'] for record in dataku]\n",
      "visits = [record['visits'] for record in dataku]\n",
      "# list() keeps the pairs materialized on Python 3, where zip() is lazy.\n",
      "datazip = list(zip(url, visits))\n",
      "frame = pd.DataFrame(data=datazip, columns=['url','visits'])\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# URL column of the collected records; this is what gets clustered below.\n",
      "urls = frame['url']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pprint\n",
      "from difflib import SequenceMatcher\n",
      "\n",
      "# http://python-cluster.sourceforge.net/\n",
      "from cluster import HierarchicalClustering\n",
      "\n",
      "# Distance threshold handed to getlevel() when cutting the hierarchy.\n",
      "CLUSTER_THRESHOLD = 0.2\n",
      "\n",
      "\n",
      "def distance(url1, url2):\n",
      "    \"\"\"Return the dissimilarity of two URL strings, from 0.0 to 1.0.\n",
      "\n",
      "    The value is one minus difflib's SequenceMatcher ratio, so identical\n",
      "    strings give 0.0.\n",
      "    \"\"\"\n",
      "    ratio = SequenceMatcher(None, url1, url2).ratio()\n",
      "    return 1.0 - ratio\n",
      "\n",
      "\n",
      "# Perform clustering\n",
      "# Hand over a plain list: a pandas Series is indexed by label, not by\n",
      "# position, which is not what a generic sequence consumer expects.\n",
      "hc = HierarchicalClustering(list(urls), distance)\n",
      "clusters = hc.getlevel(CLUSTER_THRESHOLD)\n",
      "\n",
      "pprint.pprint(clusters)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}