{ "metadata": { "name": "", "signature": "sha256:42409c1eb40f5272df81beadf0dd65f04dff7a16340a24321d90444b487b7138" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "# For parsing the csv\n", "import csv\n", "import urllib2\n", "import StringIO\n", "import math\n", "\n", "# For actual computations\n", "import random as rand \n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "markdown", "metadata": {}, "source": [ "In 2010 Google released a page listing the top 1000 websites on the internet: https://web.archive.org/web/20130102235318/http://www.google.com/adplanner/static/top1000\n", "\n", "The page has since been taken down, but you can still find the data mirrored. \n", "\n", "Let's take a look at it!" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Location of the mirrored CSV copy of the top-1000 list.\n", "websites_url = 'https://raw.githubusercontent.com/ledeprogram/courses/master/platforms/anonymization/googletop1000april2010.csv'\n", "\n", "# Fetch the CSV over HTTP and parse it into a DataFrame.\n", "# The response is closed in `finally` so the connection is released even if parsing fails.\n", "websites_response = urllib2.urlopen(websites_url)\n", "try:\n", "    websites = pd.read_csv(websites_response)\n", "finally:\n", "    websites_response.close()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "websites" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RankUnique Visitors (users)Page ViewsReachSiteCategoryHas Advertising
0 1 540000000 570000000000 35.2% facebook.com Social Networks Yes
1 2 490000000 70000000000 31.8% yahoo.com Web Portals Yes
2 3 370000000 39000000000 24.1% live.com Search Engines Yes
3 4 310000000 7900000000 20% wikipedia.org Dictionaries & Encyclopedias No
4 5 280000000 11000000000 18.1% msn.com Web Portals Yes
5 6 230000000 3300000000 14.8% microsoft.com Software Yes
6 7 230000000 4400000000 14.7% blogspot.com Blogging Resources & Services Yes
7 8 230000000 27000000000 15% baidu.com Web Portals Yes
8 9 170000000 25000000000 11.1% qq.com Email & Messaging Yes
9 10 140000000 2100000000 9.2% mozilla.com Internet Clients & Browsers No
10 11 130000000 3600000000 8.4% sina.com.cn Web Portals Yes
11 12 120000000 1200000000 7.7% wordpress.com Blogging Resources & Services Yes
12 13 110000000 2700000000 7.% bing.com Search Engines Yes
13 14 110000000 1000000000 6.9% adobe.com Programming Yes
14 15 98000000 2700000000 6.3% 163.com Web Portals Yes
15 16 98000000 10000000000 6.3% taobao.com Shopping No
16 17 97000000 1400000000 6.3% soso.com Entertainment No
17 18 96000000 5400000000 6.2% twitter.com Email & Messaging No
18 19 89000000 1700000000 5.8% youku.com Video Clips & Movie Downloads Yes
19 20 88000000 1700000000 5.7% ask.com Search Engines Yes
20 21 82000000 1900000000 5.3% sohu.com Web Portals Yes
21 22 74000000 3300000000 4.8% amazon.com Shopping Yes
22 23 74000000 490000000 4.8% windows.com Windows No
23 24 74000000 9400000000 4.8% ebay.com Auctions Yes
24 25 72000000 27000000000 4.7% yahoo.co.jp Web Portals Yes
25 26 72000000 27000000000 4.7% myspace.com Social Networks Yes
26 27 72000000 960000000 4.7% apple.com Mac Yes
27 28 66000000 1100000000 4.3% tudou.com Photo & Video Sharing No
28 29 60000000 2000000000 3.9% conduit.com Advertising & Marketing No
29 30 60000000 1100000000 3.9% hotmail.com Email & Messaging Yes
30 31 55000000 1800000000 3.6% flickr.com Photo & Video Sharing Yes
31 32 55000000 1100000000 3.6% photobucket.com Photo & Video Sharing Yes
32 33 55000000 590000000 3.6% tianya.cn Online Communities Yes
33 34 55000000 710000000 3.6% about.com How-To & Expert Content Yes
34 35 55000000 490000000 3.6% cnet.com Technology News Yes
35 36 50000000 1400000000 3.3% hao123.com Online Directories No
36 37 50000000 270000000 3.2% iefxz.com NaN No
37 38 50000000 870000000 3.2% xunlei.com TV Programs No
38 39 49000000 1900000000 3.2% paypal.com Merchant Services & Payment Systems Yes
39 40 46000000 800000000 3% rapidshare.com File Sharing & Hosting No
40 41 46000000 3000000000 3% go.com Web Portals Yes
41 42 45000000 2400000000 2.9% fc2.com Blogging Resources & Services Yes
42 43 45000000 2500000000 2.9% bbc.co.uk News & Current Events Yes
43 44 45000000 1400000000 2.9% imdb.com Movies Yes
44 45 45000000 5300000000 2.9% orkut.com Social Networks Yes
45 46 45000000 540000000 2.9% sogou.com Web Portals No
46 47 42000000 450000000 2.7% 56.com Multimedia Content No
47 48 42000000 4400000000 2.7% aol.com Web Portals Yes
48 49 42000000 14000000000 2.7% craigslist.org Classifieds No
49 50 41000000 4000000000 2.6% rakuten.co.jp Shopping Portals & Search Engines Yes
50 51 41000000 310000000 2.7% imageshack.us File Sharing & Hosting Yes
51 52 41000000 410000000 2.7% ku6.com Multimedia Content Yes
52 53 41000000 1700000000 2.7% blogger.com Blogging Resources & Services Yes
53 54 41000000 810000000 2.6% goo.ne.jp Web Services Yes
54 55 41000000 860000000 2.7% ifeng.com News & Current Events Yes
55 56 38000000 1700000000 2.5% linkedin.com Social Networks Yes
56 57 38000000 7000000000 2.4% yandex.ru Search Engines Yes
57 58 37000000 10000000000 2.4% mail.ru Email & Messaging Yes
58 59 35000000 280000000 2.2% partypoker.com Cards & Casino Games No
59 60 34000000 880000000 2.2% megaupload.com File Sharing & Hosting No
.....................
\n", "

1001 rows \u00d7 7 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ " Rank Unique Visitors (users) Page Views Reach Site \\\n", "0 1 540000000 570000000000 35.2% facebook.com \n", "1 2 490000000 70000000000 31.8% yahoo.com \n", "2 3 370000000 39000000000 24.1% live.com \n", "3 4 310000000 7900000000 20% wikipedia.org \n", "4 5 280000000 11000000000 18.1% msn.com \n", "5 6 230000000 3300000000 14.8% microsoft.com \n", "6 7 230000000 4400000000 14.7% blogspot.com \n", "7 8 230000000 27000000000 15% baidu.com \n", "8 9 170000000 25000000000 11.1% qq.com \n", "9 10 140000000 2100000000 9.2% mozilla.com \n", "10 11 130000000 3600000000 8.4% sina.com.cn \n", "11 12 120000000 1200000000 7.7% wordpress.com \n", "12 13 110000000 2700000000 7.% bing.com \n", "13 14 110000000 1000000000 6.9% adobe.com \n", "14 15 98000000 2700000000 6.3% 163.com \n", "15 16 98000000 10000000000 6.3% taobao.com \n", "16 17 97000000 1400000000 6.3% soso.com \n", "17 18 96000000 5400000000 6.2% twitter.com \n", "18 19 89000000 1700000000 5.8% youku.com \n", "19 20 88000000 1700000000 5.7% ask.com \n", "20 21 82000000 1900000000 5.3% sohu.com \n", "21 22 74000000 3300000000 4.8% amazon.com \n", "22 23 74000000 490000000 4.8% windows.com \n", "23 24 74000000 9400000000 4.8% ebay.com \n", "24 25 72000000 27000000000 4.7% yahoo.co.jp \n", "25 26 72000000 27000000000 4.7% myspace.com \n", "26 27 72000000 960000000 4.7% apple.com \n", "27 28 66000000 1100000000 4.3% tudou.com \n", "28 29 60000000 2000000000 3.9% conduit.com \n", "29 30 60000000 1100000000 3.9% hotmail.com \n", "30 31 55000000 1800000000 3.6% flickr.com \n", "31 32 55000000 1100000000 3.6% photobucket.com \n", "32 33 55000000 590000000 3.6% tianya.cn \n", "33 34 55000000 710000000 3.6% about.com \n", "34 35 55000000 490000000 3.6% cnet.com \n", "35 36 50000000 1400000000 3.3% hao123.com \n", "36 37 50000000 270000000 3.2% iefxz.com \n", "37 38 50000000 870000000 3.2% xunlei.com \n", "38 39 49000000 1900000000 3.2% paypal.com 
\n", "39 40 46000000 800000000 3% rapidshare.com \n", "40 41 46000000 3000000000 3% go.com \n", "41 42 45000000 2400000000 2.9% fc2.com \n", "42 43 45000000 2500000000 2.9% bbc.co.uk \n", "43 44 45000000 1400000000 2.9% imdb.com \n", "44 45 45000000 5300000000 2.9% orkut.com \n", "45 46 45000000 540000000 2.9% sogou.com \n", "46 47 42000000 450000000 2.7% 56.com \n", "47 48 42000000 4400000000 2.7% aol.com \n", "48 49 42000000 14000000000 2.7% craigslist.org \n", "49 50 41000000 4000000000 2.6% rakuten.co.jp \n", "50 51 41000000 310000000 2.7% imageshack.us \n", "51 52 41000000 410000000 2.7% ku6.com \n", "52 53 41000000 1700000000 2.7% blogger.com \n", "53 54 41000000 810000000 2.6% goo.ne.jp \n", "54 55 41000000 860000000 2.7% ifeng.com \n", "55 56 38000000 1700000000 2.5% linkedin.com \n", "56 57 38000000 7000000000 2.4% yandex.ru \n", "57 58 37000000 10000000000 2.4% mail.ru \n", "58 59 35000000 280000000 2.2% partypoker.com \n", "59 60 34000000 880000000 2.2% megaupload.com \n", " ... ... ... ... ... 
\n", "\n", " Category Has Advertising \n", "0 Social Networks Yes \n", "1 Web Portals Yes \n", "2 Search Engines Yes \n", "3 Dictionaries & Encyclopedias No \n", "4 Web Portals Yes \n", "5 Software Yes \n", "6 Blogging Resources & Services Yes \n", "7 Web Portals Yes \n", "8 Email & Messaging Yes \n", "9 Internet Clients & Browsers No \n", "10 Web Portals Yes \n", "11 Blogging Resources & Services Yes \n", "12 Search Engines Yes \n", "13 Programming Yes \n", "14 Web Portals Yes \n", "15 Shopping No \n", "16 Entertainment No \n", "17 Email & Messaging No \n", "18 Video Clips & Movie Downloads Yes \n", "19 Search Engines Yes \n", "20 Web Portals Yes \n", "21 Shopping Yes \n", "22 Windows No \n", "23 Auctions Yes \n", "24 Web Portals Yes \n", "25 Social Networks Yes \n", "26 Mac Yes \n", "27 Photo & Video Sharing No \n", "28 Advertising & Marketing No \n", "29 Email & Messaging Yes \n", "30 Photo & Video Sharing Yes \n", "31 Photo & Video Sharing Yes \n", "32 Online Communities Yes \n", "33 How-To & Expert Content Yes \n", "34 Technology News Yes \n", "35 Online Directories No \n", "36 NaN No \n", "37 TV Programs No \n", "38 Merchant Services & Payment Systems Yes \n", "39 File Sharing & Hosting No \n", "40 Web Portals Yes \n", "41 Blogging Resources & Services Yes \n", "42 News & Current Events Yes \n", "43 Movies Yes \n", "44 Social Networks Yes \n", "45 Web Portals No \n", "46 Multimedia Content No \n", "47 Web Portals Yes \n", "48 Classifieds No \n", "49 Shopping Portals & Search Engines Yes \n", "50 File Sharing & Hosting Yes \n", "51 Multimedia Content Yes \n", "52 Blogging Resources & Services Yes \n", "53 Web Services Yes \n", "54 News & Current Events Yes \n", "55 Social Networks Yes \n", "56 Search Engines Yes \n", "57 Email & Messaging Yes \n", "58 Cards & Casino Games No \n", "59 File Sharing & Hosting No \n", " ... ... 
\n", "\n", "[1001 rows x 7 columns]" ] } ], "prompt_number": 8 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Related reading: [How to snipe just one person's news feed with Facebook ads](http://www.insidefacebook.com/2013/08/09/how-to-snipe-just-one-persons-news-feed-with-facebook-ads/) (Inside Facebook, 9 August 2013)" ] } ], "metadata": {} } ] }