{ "metadata": { "name": "", "signature": "sha256:b04a5628f8bccb432904e5231ba01716c9a0718df099ad83e97aaaf6eff8e63e" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [
"#\n",
"# Gianluca Demartini, 2015\n",
"# http://gianlucademartini.net\n",
"# Processing of WikiR data https://wikireverse.org/data (version 2014-23-data) to see which\n",
"# top-level domains (e.g., .com, .net) link to each Wikipedia edition.\n",
"# The final data goes to a tab-separated file which is loaded into Tableau Public to generate this visualisation: http:\n",
"#\n",
"\n",
"import pandas as pd\n",
"\n",
"# Directory holding the four part files; change this one value to relocate the data.\n",
"DATA_DIR = './Desktop/2014-23-data'\n",
"part_files = [DATA_DIR + '/part-' + str(i) for i in range(4)]\n",
"header_row = ['lan', 'wiki', 'url', 'WebTitle', 'time']\n",
"\n",
"# Load data from files. Malformed lines are skipped (error_bad_lines=False). Any other\n",
"# parser error is deliberately left to propagate: printing it and carrying on would only\n",
"# postpone the failure to an unrelated NameError in the statements below.\n",
"# NOTE(review): the lookup log further down shows timestamps in the url column of some rows;\n",
"# quote characters inside titles may be shifting fields (quoting=csv.QUOTE_NONE?) - confirm\n",
"# against the raw files before relying on those rows.\n",
"parts = [pd.read_csv(path, error_bad_lines=False, header=None, names=header_row, sep='\\t')\n",
"         for path in part_files]\n",
"\n",
"# Put all in one dataframe with a single concat; chained DataFrame.append calls re-copy\n",
"# the rows accumulated so far at every step.\n",
"wikiR = pd.concat(parts, ignore_index=True)\n",
"del parts  # the per-file frames are no longer needed once combined\n",
"\n",
"wikiR.head()\n"
], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lanwikiurlWebTitletime
0 ab ???????? http://mail-archives.apache.org/mod_mbox/openo... NaN 2014-07-13 08:10:00
1 ab 1939 http://rarplayer.appspot.com/wiki/1939 1939 \u2013 Wikipedia, wolna encyklopedia 2014-08-02 06:30:00
2 ab 1994 http://rarplayer.appspot.com/wiki/1994 1994 \u2013 Wikipedia, wolna encyklopedia 2014-08-01 00:02:00
3 ab 2004 http://instapedia.com/m/2004 2004 - iPhone/Mobile Wikipedia 2014-07-13 16:18:00
4 ab 2007 http://rarplayer.appspot.com/wiki/2007 2007 \u2013 Wikipedia, wolna encyklopedia 2014-07-31 09:37:00
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ " lan wiki url \\\n", "0 ab ???????? http://mail-archives.apache.org/mod_mbox/openo... \n", "1 ab 1939 http://rarplayer.appspot.com/wiki/1939 \n", "2 ab 1994 http://rarplayer.appspot.com/wiki/1994 \n", "3 ab 2004 http://instapedia.com/m/2004 \n", "4 ab 2007 http://rarplayer.appspot.com/wiki/2007 \n", "\n", " WebTitle time \n", "0 NaN 2014-07-13 08:10:00 \n", "1 1939 \u2013 Wikipedia, wolna encyklopedia 2014-08-02 06:30:00 \n", "2 1994 \u2013 Wikipedia, wolna encyklopedia 2014-08-01 00:02:00 \n", "3 2004 - iPhone/Mobile Wikipedia 2014-07-13 16:18:00 \n", "4 2007 \u2013 Wikipedia, wolna encyklopedia 2014-07-31 09:37:00 " ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "#how much data?\n", "len(wikiR.index)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ "36301008" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [
"from __future__ import with_statement\n",
"from contextlib import closing\n",
"from urlparse import urlparse\n",
"import urllib2\n",
"\n",
"# Public Suffix List: one rule per line, comment lines start with \"//\"\n",
"PSL_URL = \"https://publicsuffix.org/list/effective_tld_names.dat\"\n",
"\n",
"def _ascii_rule(raw_line):\n",
"    \"\"\"Return one Public Suffix List rule in ASCII (punycode) form.\n",
"\n",
"    The list spells internationalised suffixes in UTF-8 while hosts inside URLs\n",
"    use punycode (e.g. \"xn--p1ai\"), so each rule is converted once at load time.\n",
"    \"\"\"\n",
"    rule = raw_line.decode(\"utf-8\").strip()\n",
"    try:\n",
"        return rule.encode(\"idna\").decode(\"ascii\")\n",
"    except UnicodeError:\n",
"        return rule  # keep a rule the idna codec rejects exactly as written\n",
"\n",
"# load tlds, ignore comments and empty lines; closing() releases the connection and\n",
"# a set keeps each of the many membership tests O(1) instead of a scan of the whole list\n",
"with closing(urllib2.urlopen(PSL_URL)) as filehandle:\n",
"    tlds = set(_ascii_rule(line) for line in filehandle if line[0] not in \"/\\n\")\n",
"\n",
"#function to get the top level domain from the url given the list of tlds\n",
"def get_TLdomain(url, tlds):\n",
"    \"\"\"Return the public suffix of the host in url (e.g. \"com\", \"co.uk\") or \"null\".\n",
"\n",
"    url  -- URL to inspect; a value without a host name (plain text, NaN, ...) gives \"null\".\n",
"    tlds -- Public Suffix List rules in ASCII form (\"com\", \"*.ck\", \"!www.ck\");\n",
"            pass a set, it is probed up to three times per host label.\n",
"\n",
"    Host suffixes are tried from the longest to the shortest, so the most specific\n",
"    rule wins. When nothing matches (IP addresses, unknown suffixes) a diagnostic\n",
"    line is printed and the string \"null\" is returned.\n",
"    \"\"\"\n",
"    try:\n",
"        # .hostname drops credentials and the port (\"host:8080\") and lower-cases the name\n",
"        host = urlparse(url).hostname or \"\"\n",
"    except (AttributeError, TypeError, ValueError):\n",
"        host = \"\"  # not a parsable string, e.g. NaN in the url column\n",
"    # a fully-qualified name may end with the root dot: \"www.example.com.\"\n",
"    host = host.rstrip(\".\")\n",
"    url_elements = host.split(\".\") if host else []\n",
"    # url_elements = [\"abcde\",\"co\",\"uk\"]\n",
"\n",
"    for i in range(-len(url_elements), 0):\n",
"        last_i_elements = url_elements[i:]\n",
"        # i=-3: [\"abcde\",\"co\",\"uk\"]\n",
"        # i=-2: [\"co\",\"uk\"]\n",
"        # i=-1: [\"uk\"] etc\n",
"\n",
"        candidate = \".\".join(last_i_elements)  # abcde.co.uk, co.uk, uk\n",
"        wildcard_candidate = \".\".join([\"*\"] + last_i_elements[1:])  # *.co.uk, *.uk, *\n",
"        exception_candidate = \"!\" + candidate\n",
"\n",
"        # match tlds:\n",
"        if exception_candidate in tlds:\n",
"            # an exception rule cancels the wildcard one label up, so the suffix is\n",
"            # the rule without its leftmost label (\"!www.ck\" -> \"ck\")\n",
"            return \".\".join(last_i_elements[1:])\n",
"        if candidate in tlds or wildcard_candidate in tlds:\n",
"            return candidate  # the matched suffix itself, e.g. \"co.uk\"\n",
"    # %-formatting (not \"+\") so that a non-string url cannot break the message\n",
"    print(\"Domain not in global list of TLDs %s\" % (url,))\n",
"    return \"null\"\n",
"\n",
"print(get_TLdomain(\"http://io.abcde.com\", tlds))\n",
"print(wikiR.url[3])\n",
"print(get_TLdomain(wikiR.url[3], tlds))"
], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "com\n", "http://instapedia.com/m/2004\n", "com\n" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [
"# extract the top level domain from the url for all entries of wikiR. This takes time.\n",
"# Series.map calls the function once per url value; DataFrame.apply(axis=1) would hand it\n",
"# a whole row object for every one of the rows just to read a single column.\n",
"wikiR['tld'] = wikiR['url'].map(lambda url: get_TLdomain(url, tlds))\n",
"wikiR.head()"
], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Domain not in global list of TLDs http://62.143.88.190/devalco/\n", "Domain not in global list of TLDs 2014-07-28 10:19:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 2014-07-31 17:36:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 2014-08-01 01:43:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://62.143.88.190/devalco/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://62.143.88.190/devalco/\n", "Domain not in global list of TLDs 2014-07-28 22:32:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 2014-07-31 17:32:00" ] }, { "output_type": "stream", 
"stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.visualnews.com./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 2014-07-10 10:18:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 2014-07-31 09:47:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.geni.com./people/Anna-Paquin/6000000005350378009" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.ativanonline.net./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://195.84.101.101/~goranl/shack/ant_deltaloop40/index.html" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx?pageid=FD30F09484F44DE4\n", "Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx?postid=F6B7D3412CD04432\n", "Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/CamelCase" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 
http://rubygarage.org./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://gps.nju.edu.cn:88/mediawiki/index.php/Compiler_Crafting" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://173.8.135.113/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=Dark_Frame" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://blog.bluehost.com./blog/bluehost/clean-up-online-business-listings-with-yext-1664/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=The_Floating_Point_Engine" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=Dark_Frame" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://216.119.148.216/update/deleting-online-predators-act/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://198.102.103.39/community/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://198.102.103.39/community/index.php\n", "Domain not in global list of TLDs http://62.143.88.190/devalco/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://viper.infotech.monash.edu:4277/" ] }, { 
"output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.visualnews.com./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0518.htm" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://173.8.135.113/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.visualnews.com./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.visualnews.com./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=The_Floating_Point_Engine" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.visualnews.com./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://rubygarage.org./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 2014-07-30 21:17:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://173.8.135.113/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 
http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://rubygarage.org./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.klonopinonline.org./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://204.14.213.185/Laptops-Notebooks/SubCategory/ID-32?Category=223" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.visualnews.com./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0514.htm" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 2014-07-30 21:16:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://gps.nju.edu.cn:88/mediawiki/index.php/Compiler_Crafting" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 2014-07-29 16:49:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.scotxblog.com./" ] }, { 
"output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://62.143.88.190/devalco/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://62.143.88.190/devalco/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://62.143.88.190/devalco/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://144.76.109.211/redirect.php?dst=" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.thegatewaypundit.com./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.visualnews.com./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://look.gvsu.edu:8000/opc" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://198.100.46.202/2009/09/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0514.htm" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://198.100.46.202/2009/09/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://173.8.135.113/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.russianseason.net./" ] }, { "output_type": "stream", "stream": 
"stdout", "text": [ "\n", "Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0531.htm" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://62.143.88.190/devalco/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.buy-online-viagra.us./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.visualnews.com./" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://viper.infotech.monash.edu:4277/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://173.8.135.113/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://195.84.101.101/~goranl/shack/ant_deltaloop40/index.html" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 2014-07-29 06:44:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs 2014-08-01 15:54:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://test2.mafiachat.net:8800/" ] }, { 
"output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.xn--e1adkpj5f.xn--p1ai/%D0%B8%D0%B2%D0%B0%D0%BD-%D0%BF%D0%BE%D0%BB%D0%BE%D0%BD%D1%81%D0%BA%D0%B8%D0%B9-%D1%80%D0%B0%D0%B7%D0%B2%D0%BE%D0%B4-%D0%B4%D0%BB%D1%8F-%D0%BD%D0%BE%D0%B2%D0%B8%D1%87%D0%BA%D0%BE%D0%B2/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/games-for-xbox/1838-skachat-besplatno-igru-betmen-arkhem-origins-na-iksboks-360-batman-arkham-origins-xbox-360-2013-god-russkaya-licenzionnaya-versiya.html" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1713-skachat-besplatno-igru-pro-zombi-na-kompyuter-dead-island-2-dead-island-riptide-russkaya-versiya-repak.html" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://xn--p1af1b.xn--p1ai/%D0%91%D0%B0%D1%88%D0%B0%D1%80_%D0%90%D1%81%D0%B0%D0%B4" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.xn----dtbikagememahdgab5aia4a3b3k.xn--p1ai/" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1713-skachat-besplatno-igru-pro-zombi-na-kompyuter-dead-island-2-dead-island-riptide-russkaya-versiya-repak.html" ] }, { "output_type": "stream", "stream": "stdout", 
"text": [ "\n", "Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1804-skachat-besplatno-igru-ekspediciya-konkistadorov-na-kompyuter-expeditions-conquistador-2013-god-russkaya-versiya-repak.html" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://xn----7sbiew6aadnema7p.xn--p1ai/sity_id.php?id=3" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://xn----7sbb5ahj4aiadq2m.xn--p1ai/guide/army/ta/t10.shtml" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Domain not in global list of TLDs http://xn----7sbiew6aadnema7p.xn--p1ai/sity_id.php?id=25" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] }, { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lanwikiurlWebTitletimetld
0 ab ???????? http://mail-archives.apache.org/mod_mbox/openo... NaN 2014-07-13 08:10:00 org
1 ab 1939 http://rarplayer.appspot.com/wiki/1939 1939 \u2013 Wikipedia, wolna encyklopedia 2014-08-02 06:30:00 appspot.com
2 ab 1994 http://rarplayer.appspot.com/wiki/1994 1994 \u2013 Wikipedia, wolna encyklopedia 2014-08-01 00:02:00 appspot.com
3 ab 2004 http://instapedia.com/m/2004 2004 - iPhone/Mobile Wikipedia 2014-07-13 16:18:00 com
4 ab 2007 http://rarplayer.appspot.com/wiki/2007 2007 \u2013 Wikipedia, wolna encyklopedia 2014-07-31 09:37:00 appspot.com
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ " lan wiki url \\\n", "0 ab ???????? http://mail-archives.apache.org/mod_mbox/openo... \n", "1 ab 1939 http://rarplayer.appspot.com/wiki/1939 \n", "2 ab 1994 http://rarplayer.appspot.com/wiki/1994 \n", "3 ab 2004 http://instapedia.com/m/2004 \n", "4 ab 2007 http://rarplayer.appspot.com/wiki/2007 \n", "\n", " WebTitle time tld \n", "0 NaN 2014-07-13 08:10:00 org \n", "1 1939 \u2013 Wikipedia, wolna encyklopedia 2014-08-02 06:30:00 appspot.com \n", "2 1994 \u2013 Wikipedia, wolna encyklopedia 2014-08-01 00:02:00 appspot.com \n", "3 2004 - iPhone/Mobile Wikipedia 2014-07-13 16:18:00 com \n", "4 2007 \u2013 Wikipedia, wolna encyklopedia 2014-07-31 09:37:00 appspot.com " ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [
"import os\n",
"\n",
"# serialize the dataframe now, so the slow tld extraction does not have to be repeated;\n",
"# the file goes to the current user's home directory instead of one hard-coded account\n",
"PICKLE_PATH = os.path.join(os.path.expanduser('~'), 'wikiRall.pickle')\n",
"wikiR.to_pickle(PICKLE_PATH)\n",
"\n",
"# group by and count inlinks for each tld and wikipedia edition\n",
"# (count() reports the non-null values of every remaining column separately)\n",
"tldCount = wikiR.groupby(['lan', 'tld']).count()\n",
"\n",
"# save as tab separated. reset_index() returns a new frame rather than changing tldCount,\n",
"# so its result is written directly: 'lan' and 'tld' become ordinary columns and no\n",
"# separate index column is emitted\n",
"tldCount.reset_index().to_csv('tldCount.tsv', sep='\\t', index=False)"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }