{
"metadata": {
"name": "",
"signature": "sha256:b04a5628f8bccb432904e5231ba01716c9a0718df099ad83e97aaaf6eff8e63e"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"#\n",
"# Gianluca Demartini, 2015\n",
"# http://gianlucademartini.net\n",
"# Processing of WikiR data https://wikireverse.org/data (version 2014-23-data) to see which top-level domains (e.g., .com, .net) link to each Wikipedia edition\n",
"# The final data goes to a tab-separated file which is loaded into Tableau Public to generate this visualisation: http:\n",
"#\n",
"\n",
"import pandas as pd\n",
"# Input files: the four tab-separated parts of the 2014-23 data set, given\n",
"# relative to the directory the notebook is started from.\n",
"file0='./Desktop/2014-23-data/part-0'\n",
"file1='./Desktop/2014-23-data/part-1'\n",
"file2='./Desktop/2014-23-data/part-2'\n",
"file3='./Desktop/2014-23-data/part-3'\n",
"# The files are read with header=None, so the column names are supplied here.\n",
"header_row=['lan','wiki','url','WebTitle', 'time']\n",
"\n",
"#load data from files\n",
"# error_bad_lines=False makes pandas skip malformed rows instead of aborting.\n",
"try:\n",
"    wikiR_parts = [pd.read_csv(part, error_bad_lines=False, header=None, names=header_row, sep='\\t')\n",
"                   for part in (file0, file1, file2, file3)]\n",
"except pd.parser.CParserError as e:\n",
"    # Report the parser error, then re-raise: carrying on would only fail a few\n",
"    # lines later with a NameError because the frames were never created.\n",
"    print(e)\n",
"    raise\n",
"\n",
"#put all in one dataframe\n",
"# A single concat copies the data once; chained DataFrame.append calls copy the\n",
"# growing frame again on every step.\n",
"wikiR = pd.concat(wikiR_parts, ignore_index=True)\n",
"\n",
"wikiR.head()\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" lan | \n",
" wiki | \n",
" url | \n",
" WebTitle | \n",
" time | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ab | \n",
" ???????? | \n",
" http://mail-archives.apache.org/mod_mbox/openo... | \n",
" NaN | \n",
" 2014-07-13 08:10:00 | \n",
"
\n",
" \n",
" 1 | \n",
" ab | \n",
" 1939 | \n",
" http://rarplayer.appspot.com/wiki/1939 | \n",
" 1939 \u2013 Wikipedia, wolna encyklopedia | \n",
" 2014-08-02 06:30:00 | \n",
"
\n",
" \n",
" 2 | \n",
" ab | \n",
" 1994 | \n",
" http://rarplayer.appspot.com/wiki/1994 | \n",
" 1994 \u2013 Wikipedia, wolna encyklopedia | \n",
" 2014-08-01 00:02:00 | \n",
"
\n",
" \n",
" 3 | \n",
" ab | \n",
" 2004 | \n",
" http://instapedia.com/m/2004 | \n",
" 2004 - iPhone/Mobile Wikipedia | \n",
" 2014-07-13 16:18:00 | \n",
"
\n",
" \n",
" 4 | \n",
" ab | \n",
" 2007 | \n",
" http://rarplayer.appspot.com/wiki/2007 | \n",
" 2007 \u2013 Wikipedia, wolna encyklopedia | \n",
" 2014-07-31 09:37:00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
" lan wiki url \\\n",
"0 ab ???????? http://mail-archives.apache.org/mod_mbox/openo... \n",
"1 ab 1939 http://rarplayer.appspot.com/wiki/1939 \n",
"2 ab 1994 http://rarplayer.appspot.com/wiki/1994 \n",
"3 ab 2004 http://instapedia.com/m/2004 \n",
"4 ab 2007 http://rarplayer.appspot.com/wiki/2007 \n",
"\n",
" WebTitle time \n",
"0 NaN 2014-07-13 08:10:00 \n",
"1 1939 \u2013 Wikipedia, wolna encyklopedia 2014-08-02 06:30:00 \n",
"2 1994 \u2013 Wikipedia, wolna encyklopedia 2014-08-01 00:02:00 \n",
"3 2004 - iPhone/Mobile Wikipedia 2014-07-13 16:18:00 \n",
"4 2007 \u2013 Wikipedia, wolna encyklopedia 2014-07-31 09:37:00 "
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Number of rows in the combined frame (first entry of its (rows, columns) shape).\n",
"wikiR.shape[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"36301008"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import with_statement\n",
"from urlparse import urlparse\n",
"import urllib2\n",
"\n",
"# load tlds, ignore comments and empty lines:\n",
"tld_list_url=\"https://publicsuffix.org/list/effective_tld_names.dat\"\n",
"filehandle = urllib2.urlopen(tld_list_url)\n",
"try:\n",
"    # Strip before filtering so that whitespace-only lines are dropped as well.\n",
"    stripped_lines = [line.strip() for line in filehandle]\n",
"finally:\n",
"    # Always release the HTTP connection, even if reading fails part-way.\n",
"    filehandle.close()\n",
"# A frozenset gives constant-time membership tests; get_TLdomain probes this\n",
"# collection several times per URL, and a list would be scanned linearly each time.\n",
"tlds = frozenset(rule for rule in stripped_lines if rule and not rule.startswith(\"/\"))\n",
"\n",
"#function to get the top level domain from the url given the list of tlds\n",
"def get_TLdomain(url, tlds):\n",
"    \"\"\"Return the public suffix of the host in url, or the string \"null\" when no rule matches.\n",
"\n",
"    url  -- URL string; only its host part is examined.\n",
"    tlds -- collection of Public Suffix List rules (plain entries, \"*.\" wildcards and\n",
"            \"!\" exceptions); only membership tests are performed on it.\n",
"\n",
"    Candidates are tried from the full host name down to its last label, so the\n",
"    longest matching rule wins. When nothing matches (IP addresses, values that\n",
"    are not URLs) a message is printed and \"null\" is returned.\n",
"    \"\"\"\n",
"    try:\n",
"        # .hostname drops any user:password@ prefix and :port suffix and lower-cases\n",
"        # the host; the raw netloc kept the port, so \"example.com:8080\" never matched.\n",
"        host = urlparse(url).hostname or \"\"\n",
"    except ValueError:\n",
"        # urlparse rejects URLs with a malformed bracketed host.\n",
"        host = \"\"\n",
"    # A fully-qualified host such as \"example.com.\" ends with the root dot, which\n",
"    # would otherwise leave an empty last label that matches nothing.\n",
"    host = host.rstrip('.')\n",
"    # NOTE(review): punycode hosts (xn--...) are compared as-is, so they only match\n",
"    # if tlds holds the corresponding rules in the same encoding -- confirm.\n",
"    url_elements = host.split('.') if host else []\n",
"    # url_elements = [\"abcde\",\"co\",\"uk\"]\n",
"\n",
"    for i in range(-len(url_elements), 0):\n",
"        last_i_elements = url_elements[i:]\n",
"        #    i=-3: [\"abcde\",\"co\",\"uk\"]\n",
"        #    i=-2: [\"co\",\"uk\"]\n",
"        #    i=-1: [\"uk\"] etc\n",
"\n",
"        candidate = \".\".join(last_i_elements) # abcde.co.uk, co.uk, uk\n",
"        wildcard_candidate = \".\".join([\"*\"] + last_i_elements[1:]) # *.co.uk, *.uk, *\n",
"        exception_candidate = \"!\" + candidate\n",
"\n",
"        # match tlds:\n",
"        if exception_candidate in tlds:\n",
"            # An exception rule (e.g. \"!www.ck\") names a registrable domain, so the\n",
"            # public suffix is the rule without its leftmost label.\n",
"            return \".\".join(last_i_elements[1:])\n",
"        if candidate in tlds or wildcard_candidate in tlds:\n",
"            # The matched labels themselves are the suffix, e.g. \"co.uk\".\n",
"            return candidate\n",
"    print(\"Domain not in global list of TLDs \"+url)\n",
"    return \"null\"\n",
"\n",
"# Smoke test: a hand-written URL first, then one taken from the loaded data.\n",
"print(get_TLdomain(\"http://io.abcde.com\", tlds))\n",
"sample_url = wikiR.url[3]\n",
"print(sample_url)\n",
"print(get_TLdomain(sample_url, tlds))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"com\n",
"http://instapedia.com/m/2004\n",
"com\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# extract the top level domain from the url for all entries of wikiR. This takes time.\n",
"# Series.map calls get_TLdomain once per value of the url column; a row-wise\n",
"# DataFrame.apply(axis=1) would also build a Series for every row just to read one field.\n",
"wikiR['tld'] = wikiR['url'].map(lambda url: get_TLdomain(url, tlds))\n",
"wikiR.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Domain not in global list of TLDs http://62.143.88.190/devalco/\n",
"Domain not in global list of TLDs 2014-07-28 10:19:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs 2014-07-31 17:36:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs 2014-08-01 01:43:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://62.143.88.190/devalco/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://62.143.88.190/devalco/\n",
"Domain not in global list of TLDs 2014-07-28 22:32:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs 2014-07-31 17:32:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.visualnews.com./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs 2014-07-10 10:18:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs 2014-07-31 09:47:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.geni.com./people/Anna-Paquin/6000000005350378009"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.ativanonline.net./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://195.84.101.101/~goranl/shack/ant_deltaloop40/index.html"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx?pageid=FD30F09484F44DE4\n",
"Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx?postid=F6B7D3412CD04432\n",
"Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/CamelCase"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://rubygarage.org./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://gps.nju.edu.cn:88/mediawiki/index.php/Compiler_Crafting"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://173.8.135.113/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=Dark_Frame"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://blog.bluehost.com./blog/bluehost/clean-up-online-business-listings-with-yext-1664/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=The_Floating_Point_Engine"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=Dark_Frame"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://216.119.148.216/update/deleting-online-predators-act/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://198.102.103.39/community/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://198.102.103.39/community/index.php\n",
"Domain not in global list of TLDs http://62.143.88.190/devalco/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://viper.infotech.monash.edu:4277/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.visualnews.com./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0518.htm"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://173.8.135.113/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.visualnews.com./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.visualnews.com./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=The_Floating_Point_Engine"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.visualnews.com./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://rubygarage.org./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs 2014-07-30 21:17:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://173.8.135.113/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://rubygarage.org./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.klonopinonline.org./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://204.14.213.185/Laptops-Notebooks/SubCategory/ID-32?Category=223"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.visualnews.com./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0514.htm"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs 2014-07-30 21:16:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://gps.nju.edu.cn:88/mediawiki/index.php/Compiler_Crafting"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs 2014-07-29 16:49:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.scotxblog.com./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://62.143.88.190/devalco/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://62.143.88.190/devalco/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://62.143.88.190/devalco/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://144.76.109.211/redirect.php?dst="
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.thegatewaypundit.com./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.visualnews.com./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://look.gvsu.edu:8000/opc"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://198.100.46.202/2009/09/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0514.htm"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://198.100.46.202/2009/09/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://173.8.135.113/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.russianseason.net./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0531.htm"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://62.143.88.190/devalco/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.buy-online-viagra.us./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.visualnews.com./"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://viper.infotech.monash.edu:4277/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://173.8.135.113/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://195.84.101.101/~goranl/shack/ant_deltaloop40/index.html"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs 2014-07-29 06:44:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs 2014-08-01 15:54:00"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://test2.mafiachat.net:8800/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.xn--e1adkpj5f.xn--p1ai/%D0%B8%D0%B2%D0%B0%D0%BD-%D0%BF%D0%BE%D0%BB%D0%BE%D0%BD%D1%81%D0%BA%D0%B8%D0%B9-%D1%80%D0%B0%D0%B7%D0%B2%D0%BE%D0%B4-%D0%B4%D0%BB%D1%8F-%D0%BD%D0%BE%D0%B2%D0%B8%D1%87%D0%BA%D0%BE%D0%B2/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/games-for-xbox/1838-skachat-besplatno-igru-betmen-arkhem-origins-na-iksboks-360-batman-arkham-origins-xbox-360-2013-god-russkaya-licenzionnaya-versiya.html"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1713-skachat-besplatno-igru-pro-zombi-na-kompyuter-dead-island-2-dead-island-riptide-russkaya-versiya-repak.html"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://xn--p1af1b.xn--p1ai/%D0%91%D0%B0%D1%88%D0%B0%D1%80_%D0%90%D1%81%D0%B0%D0%B4"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.xn----dtbikagememahdgab5aia4a3b3k.xn--p1ai/"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1713-skachat-besplatno-igru-pro-zombi-na-kompyuter-dead-island-2-dead-island-riptide-russkaya-versiya-repak.html"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1804-skachat-besplatno-igru-ekspediciya-konkistadorov-na-kompyuter-expeditions-conquistador-2013-god-russkaya-versiya-repak.html"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://xn----7sbiew6aadnema7p.xn--p1ai/sity_id.php?id=3"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://xn----7sbb5ahj4aiadq2m.xn--p1ai/guide/army/ta/t10.shtml"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Domain not in global list of TLDs http://xn----7sbiew6aadnema7p.xn--p1ai/sity_id.php?id=25"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
},
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" lan | \n",
" wiki | \n",
" url | \n",
" WebTitle | \n",
" time | \n",
" tld | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ab | \n",
" ???????? | \n",
" http://mail-archives.apache.org/mod_mbox/openo... | \n",
" NaN | \n",
" 2014-07-13 08:10:00 | \n",
" org | \n",
"
\n",
" \n",
" 1 | \n",
" ab | \n",
" 1939 | \n",
" http://rarplayer.appspot.com/wiki/1939 | \n",
" 1939 \u2013 Wikipedia, wolna encyklopedia | \n",
" 2014-08-02 06:30:00 | \n",
" appspot.com | \n",
"
\n",
" \n",
" 2 | \n",
" ab | \n",
" 1994 | \n",
" http://rarplayer.appspot.com/wiki/1994 | \n",
" 1994 \u2013 Wikipedia, wolna encyklopedia | \n",
" 2014-08-01 00:02:00 | \n",
" appspot.com | \n",
"
\n",
" \n",
" 3 | \n",
" ab | \n",
" 2004 | \n",
" http://instapedia.com/m/2004 | \n",
" 2004 - iPhone/Mobile Wikipedia | \n",
" 2014-07-13 16:18:00 | \n",
" com | \n",
"
\n",
" \n",
" 4 | \n",
" ab | \n",
" 2007 | \n",
" http://rarplayer.appspot.com/wiki/2007 | \n",
" 2007 \u2013 Wikipedia, wolna encyklopedia | \n",
" 2014-07-31 09:37:00 | \n",
" appspot.com | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
" lan wiki url \\\n",
"0 ab ???????? http://mail-archives.apache.org/mod_mbox/openo... \n",
"1 ab 1939 http://rarplayer.appspot.com/wiki/1939 \n",
"2 ab 1994 http://rarplayer.appspot.com/wiki/1994 \n",
"3 ab 2004 http://instapedia.com/m/2004 \n",
"4 ab 2007 http://rarplayer.appspot.com/wiki/2007 \n",
"\n",
" WebTitle time tld \n",
"0 NaN 2014-07-13 08:10:00 org \n",
"1 1939 \u2013 Wikipedia, wolna encyklopedia 2014-08-02 06:30:00 appspot.com \n",
"2 1994 \u2013 Wikipedia, wolna encyklopedia 2014-08-01 00:02:00 appspot.com \n",
"3 2004 - iPhone/Mobile Wikipedia 2014-07-13 16:18:00 com \n",
"4 2007 \u2013 Wikipedia, wolna encyklopedia 2014-07-31 09:37:00 appspot.com "
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import os\n",
"\n",
"# serialize dataframe now\n",
"# Written to the current user's home directory instead of one hard-coded account's.\n",
"wikiR.to_pickle(os.path.join(os.path.expanduser('~'), 'wikiRall.pickle'))\n",
"\n",
"#group by and count inlinks for each tld and wikipedia edition\n",
"# count() gives, per (lan, tld) group, the number of non-null values in each other column.\n",
"tldCount=wikiR.groupby(['lan','tld']).count()\n",
"\n",
"# save as tab separated\n",
"# to_csv writes the (lan, tld) index levels as the leading columns by default, so no\n",
"# reset_index() is needed; the earlier call discarded its result and had no effect.\n",
"tldCount.to_csv('tldCount.tsv', sep='\\t')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}