{ "metadata": { "name": "", "signature": "sha256:bbd40871454de4bfea1a6e165ce883033b31bf3aab8c3438c8a38f1c1a6f74af" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy as np\n", "import os, sqlite3\n", "import json\n", "from urllib import urlencode\n", "from urlparse import urlparse, parse_qs, urlunparse\n", "import urlparse\n", "import pprint\n", "from difflib import SequenceMatcher\n", "\n", "\n", "\n", "def parsed(x):\n", " parse = urlparse(x)\n", " return parse.netloc\n", "\n", "\n", "lines = []\n", "\n", "databases = []\n", "dataku = []\n", "\n", "\n", "directory = \"D:/DATA/example\"\n", "for filename in os.listdir(directory):\n", " flname = os.path.join(directory, filename)\n", " databases.append(flname)\n", "\n", "for database in databases:\n", "\n", " try:\n", " with sqlite3.connect(database) as conn:\n", " cur = conn.cursor()\n", " sqlqry = pd.read_sql(\"SELECT value FROM data WHERE name='BrowserBookmarksProbe'\",conn)\n", " a = sqlqry['value']\n", " #b = sqlqry['timestamp']\n", " records = [json.loads(line) for line in a]\n", " for row in records:\n", " dataku.append(row)\n", " \n", "\n", " except sqlite3.Error, err:\n", " print \"[INFO] %s\" % err\n", "\n", "url = [url['url'] for url in dataku]\n", "visits = [visits['visits'] for visits in dataku]\n", "datazip = zip(url,visits)\n", "\n", "df = pd.DataFrame(datazip, columns=['url','visits'])\n", "\n", "dfnew= df['url'].apply(parsed)\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "clusterdata = dfnew.head(100)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "clusterdata.head(10)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 22, "text": [ "0 m.blog.naver.com\n", "1 m.facebook.com\n", "2 details\n", "3 story-api.kakao.com\n", "4 m.cafe.daum.net\n", "5 details\n", "6 cafe.daum.net\n", "7 m.facebook.com\n", "8 j.exit-ad.com\n", "9 asked.kr\n", "Name: url, dtype: object" ] } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "strdata = clusterdata.apply(str)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "strdata.head(10)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 27, "text": [ "0 m.blog.naver.com\n", "1 m.facebook.com\n", "2 details\n", "3 story-api.kakao.com\n", "4 m.cafe.daum.net\n", "5 details\n", "6 cafe.daum.net\n", "7 m.facebook.com\n", "8 j.exit-ad.com\n", "9 asked.kr\n", "Name: url, dtype: object" ] } ], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [ "import pprint\n", "from difflib import SequenceMatcher\n", "\n", "from cluster import HierarchicalClustering\n", "\n", "def distance(url1, url2):\n", " ratio = SequenceMatcher(None, url1, url2).ratio()\n", " return 1.0 - ratio\n", "\n", "# Perform clustering\n", "hc = HierarchicalClustering(strdata, distance)\n", "clusters = hc.getlevel(0.2)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [ "clusters" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 29, "text": [ "[['details', 'details'],\n", " ['auth.waffle.at',\n", " 'auth.waffle.at',\n", " 'auth.waffle.at',\n", " 'auth.waffle.at',\n", " 'auth.waffle.at',\n", " 'auth.waffle.at',\n", " 'auth.waffle.at',\n", " 'auth.waffle.at',\n", " 'auth.waffle.at'],\n", " ['bit.ly'],\n", " ['asked.kr', 'asked.kr', 'asked.kr', 'asked.kr'],\n", " ['cyber.kepco.co.kr',\n", " 'cyber.kepco.co.kr',\n", " 'cyber.kepco.co.kr',\n", " 'cyber.kepco.co.kr',\n", " 'cyber.kepco.co.kr',\n", " 'cyber.kepco.co.kr',\n", " 'cyber.kepco.co.kr',\n", " 'cyber.kepco.co.kr'],\n", " ['jesuside.com.ne.kr'],\n", " ['t.co', 't.co'],\n", " ['j.exit-ad.com'],\n", " ['m.event.toast.com'],\n", " ['twitpic.com'],\n", " ['i1.media.daumcdn.net'],\n", " ['cafe.daum.net', 'm.cafe.daum.net', 'm.cafe.daum.net'],\n", " ['eugenejulia.tistory.com'],\n", " ['m.imbc.com'],\n", " ['m.winixcorp.com'],\n", " ['m.humoruniv.com', 'web.humoruniv.com'],\n", " ['ggstory.com'],\n", " ['m.gsshop.com', 'm.gsshop.com'],\n", " ['m.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com',\n", " 'm.facebook.com'],\n", " ['instagram.com'],\n", " ['m.universaltoy.co.kr'],\n", " ['m.insight.co.kr'],\n", " ['www.shescoming.co.kr',\n", " 'www.shescoming.co.kr',\n", " 'www.shescoming.co.kr',\n", " 'www.shescoming.co.kr',\n", " 'www.shescoming.co.kr'],\n", " ['www.lotteconf.co.kr'],\n", " ['www.skt-lte.co.kr'],\n", " ['www.ebs.co.kr'],\n", " ['m.11am.co.kr'],\n", " ['www.annpiona.co.kr', 'm.hani.co.kr', 'www.hani.co.kr'],\n", " ['www.pulmuoneamio.com'],\n", " ['www.threemusic.com'],\n", " ['www.youtube.com'],\n", " ['www.gobalnews.com'],\n", " ['www.google.com'],\n", " ['www.100tap.com'],\n", " ['story-api.kakao.com'],\n", " ['www.korea-ps.com'],\n", " ['m.blog.naver.com',\n", " 'm.blog.naver.com',\n", " 'm.blog.naver.com',\n", " 'm.blog.naver.com',\n", " 'm.blog.naver.com'],\n", " ['m.cafe.naver.com',\n", " 'm.cafe.naver.com',\n", " 'm.cafe.naver.com',\n", " 'nid.naver.com',\n", " 'nid.naver.com',\n", " 'www.naver.com',\n", " 'm.news.naver.com',\n", " 'news.naver.com'],\n", " ['plus.kakao.com'],\n", " ['www.dalkomm.com'],\n", " ['www.kakao.com']]" ] } ], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": [ "for row in clusters:\n", " print row" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['details', 'details']\n", "['auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at']\n", "['bit.ly']\n", "['asked.kr', 'asked.kr', 'asked.kr', 'asked.kr']\n", "['cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr']\n", "['jesuside.com.ne.kr']\n", "['t.co', 't.co']\n", "['j.exit-ad.com']\n", "['m.event.toast.com']\n", "['twitpic.com']\n", "['i1.media.daumcdn.net']\n", "['cafe.daum.net', 'm.cafe.daum.net', 'm.cafe.daum.net']\n", "['eugenejulia.tistory.com']\n", "['m.imbc.com']\n", "['m.winixcorp.com']\n", "['m.humoruniv.com', 'web.humoruniv.com']\n", "['ggstory.com']\n", "['m.gsshop.com', 'm.gsshop.com']\n", "['m.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com']\n", "['instagram.com']\n", "['m.universaltoy.co.kr']\n", "['m.insight.co.kr']\n", "['www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr']\n", "['www.lotteconf.co.kr']\n", "['www.skt-lte.co.kr']\n", "['www.ebs.co.kr']\n", "['m.11am.co.kr']\n", "['www.annpiona.co.kr', 'm.hani.co.kr', 'www.hani.co.kr']\n", "['www.pulmuoneamio.com']\n", "['www.threemusic.com']\n", "['www.youtube.com']\n", "['www.gobalnews.com']\n", "['www.google.com']\n", "['www.100tap.com']\n", "['story-api.kakao.com']\n", "['www.korea-ps.com']\n", "['m.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com']\n", "['m.cafe.naver.com', 'm.cafe.naver.com', 'm.cafe.naver.com', 'nid.naver.com', 'nid.naver.com', 'www.naver.com', 'm.news.naver.com', 'news.naver.com']\n", "['plus.kakao.com']\n", "['www.dalkomm.com']\n", "['www.kakao.com']\n" ] } ], "prompt_number": 30 } ], "metadata": {} } ] }