{ "metadata": { "name": "", "signature": "sha256:42409c1eb40f5272df81beadf0dd65f04dff7a16340a24321d90444b487b7138" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "# For parsing the csv\n", "import csv\n", "import urllib2\n", "import StringIO\n", "import math\n", "\n", "# For actual computations\n", "import random as rand \n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "markdown", "metadata": {}, "source": [ "In 2010, Google released a page listing the top 1000 websites on the internet: https://web.archive.org/web/20130102235318/http://www.google.com/adplanner/static/top1000\n", "\n", "The page has since been taken down, but a copy of the data is still mirrored on GitHub as a CSV file.\n", "\n", "Let's take a look at it!" ] }, { "cell_type": "code", "collapsed": false, "input": [ "websites_url = 'https://raw.githubusercontent.com/ledeprogram/courses/master/platforms/anonymization/googletop1000april2010.csv'\n", "websites_response = urllib2.urlopen(websites_url)\n", "websites = pd.read_csv(websites_response)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "websites" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
| \n", " | Rank | \n", "Unique Visitors (users) | \n", "Page Views | \n", "Reach | \n", "Site | \n", "Category | \n", "Has Advertising | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "1 | \n", "540000000 | \n", "570000000000 | \n", "35.2% | \n", "facebook.com | \n", "Social Networks | \n", "Yes | \n", "
| 1 | \n", "2 | \n", "490000000 | \n", "70000000000 | \n", "31.8% | \n", "yahoo.com | \n", "Web Portals | \n", "Yes | \n", "
| 2 | \n", "3 | \n", "370000000 | \n", "39000000000 | \n", "24.1% | \n", "live.com | \n", "Search Engines | \n", "Yes | \n", "
| 3 | \n", "4 | \n", "310000000 | \n", "7900000000 | \n", "20% | \n", "wikipedia.org | \n", "Dictionaries & Encyclopedias | \n", "No | \n", "
| 4 | \n", "5 | \n", "280000000 | \n", "11000000000 | \n", "18.1% | \n", "msn.com | \n", "Web Portals | \n", "Yes | \n", "
| 5 | \n", "6 | \n", "230000000 | \n", "3300000000 | \n", "14.8% | \n", "microsoft.com | \n", "Software | \n", "Yes | \n", "
| 6 | \n", "7 | \n", "230000000 | \n", "4400000000 | \n", "14.7% | \n", "blogspot.com | \n", "Blogging Resources & Services | \n", "Yes | \n", "
| 7 | \n", "8 | \n", "230000000 | \n", "27000000000 | \n", "15% | \n", "baidu.com | \n", "Web Portals | \n", "Yes | \n", "
| 8 | \n", "9 | \n", "170000000 | \n", "25000000000 | \n", "11.1% | \n", "qq.com | \n", "Email & Messaging | \n", "Yes | \n", "
| 9 | \n", "10 | \n", "140000000 | \n", "2100000000 | \n", "9.2% | \n", "mozilla.com | \n", "Internet Clients & Browsers | \n", "No | \n", "
| 10 | \n", "11 | \n", "130000000 | \n", "3600000000 | \n", "8.4% | \n", "sina.com.cn | \n", "Web Portals | \n", "Yes | \n", "
| 11 | \n", "12 | \n", "120000000 | \n", "1200000000 | \n", "7.7% | \n", "wordpress.com | \n", "Blogging Resources & Services | \n", "Yes | \n", "
| 12 | \n", "13 | \n", "110000000 | \n", "2700000000 | \n", "7.% | \n", "bing.com | \n", "Search Engines | \n", "Yes | \n", "
| 13 | \n", "14 | \n", "110000000 | \n", "1000000000 | \n", "6.9% | \n", "adobe.com | \n", "Programming | \n", "Yes | \n", "
| 14 | \n", "15 | \n", "98000000 | \n", "2700000000 | \n", "6.3% | \n", "163.com | \n", "Web Portals | \n", "Yes | \n", "
| 15 | \n", "16 | \n", "98000000 | \n", "10000000000 | \n", "6.3% | \n", "taobao.com | \n", "Shopping | \n", "No | \n", "
| 16 | \n", "17 | \n", "97000000 | \n", "1400000000 | \n", "6.3% | \n", "soso.com | \n", "Entertainment | \n", "No | \n", "
| 17 | \n", "18 | \n", "96000000 | \n", "5400000000 | \n", "6.2% | \n", "twitter.com | \n", "Email & Messaging | \n", "No | \n", "
| 18 | \n", "19 | \n", "89000000 | \n", "1700000000 | \n", "5.8% | \n", "youku.com | \n", "Video Clips & Movie Downloads | \n", "Yes | \n", "
| 19 | \n", "20 | \n", "88000000 | \n", "1700000000 | \n", "5.7% | \n", "ask.com | \n", "Search Engines | \n", "Yes | \n", "
| 20 | \n", "21 | \n", "82000000 | \n", "1900000000 | \n", "5.3% | \n", "sohu.com | \n", "Web Portals | \n", "Yes | \n", "
| 21 | \n", "22 | \n", "74000000 | \n", "3300000000 | \n", "4.8% | \n", "amazon.com | \n", "Shopping | \n", "Yes | \n", "
| 22 | \n", "23 | \n", "74000000 | \n", "490000000 | \n", "4.8% | \n", "windows.com | \n", "Windows | \n", "No | \n", "
| 23 | \n", "24 | \n", "74000000 | \n", "9400000000 | \n", "4.8% | \n", "ebay.com | \n", "Auctions | \n", "Yes | \n", "
| 24 | \n", "25 | \n", "72000000 | \n", "27000000000 | \n", "4.7% | \n", "yahoo.co.jp | \n", "Web Portals | \n", "Yes | \n", "
| 25 | \n", "26 | \n", "72000000 | \n", "27000000000 | \n", "4.7% | \n", "myspace.com | \n", "Social Networks | \n", "Yes | \n", "
| 26 | \n", "27 | \n", "72000000 | \n", "960000000 | \n", "4.7% | \n", "apple.com | \n", "Mac | \n", "Yes | \n", "
| 27 | \n", "28 | \n", "66000000 | \n", "1100000000 | \n", "4.3% | \n", "tudou.com | \n", "Photo & Video Sharing | \n", "No | \n", "
| 28 | \n", "29 | \n", "60000000 | \n", "2000000000 | \n", "3.9% | \n", "conduit.com | \n", "Advertising & Marketing | \n", "No | \n", "
| 29 | \n", "30 | \n", "60000000 | \n", "1100000000 | \n", "3.9% | \n", "hotmail.com | \n", "Email & Messaging | \n", "Yes | \n", "
| 30 | \n", "31 | \n", "55000000 | \n", "1800000000 | \n", "3.6% | \n", "flickr.com | \n", "Photo & Video Sharing | \n", "Yes | \n", "
| 31 | \n", "32 | \n", "55000000 | \n", "1100000000 | \n", "3.6% | \n", "photobucket.com | \n", "Photo & Video Sharing | \n", "Yes | \n", "
| 32 | \n", "33 | \n", "55000000 | \n", "590000000 | \n", "3.6% | \n", "tianya.cn | \n", "Online Communities | \n", "Yes | \n", "
| 33 | \n", "34 | \n", "55000000 | \n", "710000000 | \n", "3.6% | \n", "about.com | \n", "How-To & Expert Content | \n", "Yes | \n", "
| 34 | \n", "35 | \n", "55000000 | \n", "490000000 | \n", "3.6% | \n", "cnet.com | \n", "Technology News | \n", "Yes | \n", "
| 35 | \n", "36 | \n", "50000000 | \n", "1400000000 | \n", "3.3% | \n", "hao123.com | \n", "Online Directories | \n", "No | \n", "
| 36 | \n", "37 | \n", "50000000 | \n", "270000000 | \n", "3.2% | \n", "iefxz.com | \n", "NaN | \n", "No | \n", "
| 37 | \n", "38 | \n", "50000000 | \n", "870000000 | \n", "3.2% | \n", "xunlei.com | \n", "TV Programs | \n", "No | \n", "
| 38 | \n", "39 | \n", "49000000 | \n", "1900000000 | \n", "3.2% | \n", "paypal.com | \n", "Merchant Services & Payment Systems | \n", "Yes | \n", "
| 39 | \n", "40 | \n", "46000000 | \n", "800000000 | \n", "3% | \n", "rapidshare.com | \n", "File Sharing & Hosting | \n", "No | \n", "
| 40 | \n", "41 | \n", "46000000 | \n", "3000000000 | \n", "3% | \n", "go.com | \n", "Web Portals | \n", "Yes | \n", "
| 41 | \n", "42 | \n", "45000000 | \n", "2400000000 | \n", "2.9% | \n", "fc2.com | \n", "Blogging Resources & Services | \n", "Yes | \n", "
| 42 | \n", "43 | \n", "45000000 | \n", "2500000000 | \n", "2.9% | \n", "bbc.co.uk | \n", "News & Current Events | \n", "Yes | \n", "
| 43 | \n", "44 | \n", "45000000 | \n", "1400000000 | \n", "2.9% | \n", "imdb.com | \n", "Movies | \n", "Yes | \n", "
| 44 | \n", "45 | \n", "45000000 | \n", "5300000000 | \n", "2.9% | \n", "orkut.com | \n", "Social Networks | \n", "Yes | \n", "
| 45 | \n", "46 | \n", "45000000 | \n", "540000000 | \n", "2.9% | \n", "sogou.com | \n", "Web Portals | \n", "No | \n", "
| 46 | \n", "47 | \n", "42000000 | \n", "450000000 | \n", "2.7% | \n", "56.com | \n", "Multimedia Content | \n", "No | \n", "
| 47 | \n", "48 | \n", "42000000 | \n", "4400000000 | \n", "2.7% | \n", "aol.com | \n", "Web Portals | \n", "Yes | \n", "
| 48 | \n", "49 | \n", "42000000 | \n", "14000000000 | \n", "2.7% | \n", "craigslist.org | \n", "Classifieds | \n", "No | \n", "
| 49 | \n", "50 | \n", "41000000 | \n", "4000000000 | \n", "2.6% | \n", "rakuten.co.jp | \n", "Shopping Portals & Search Engines | \n", "Yes | \n", "
| 50 | \n", "51 | \n", "41000000 | \n", "310000000 | \n", "2.7% | \n", "imageshack.us | \n", "File Sharing & Hosting | \n", "Yes | \n", "
| 51 | \n", "52 | \n", "41000000 | \n", "410000000 | \n", "2.7% | \n", "ku6.com | \n", "Multimedia Content | \n", "Yes | \n", "
| 52 | \n", "53 | \n", "41000000 | \n", "1700000000 | \n", "2.7% | \n", "blogger.com | \n", "Blogging Resources & Services | \n", "Yes | \n", "
| 53 | \n", "54 | \n", "41000000 | \n", "810000000 | \n", "2.6% | \n", "goo.ne.jp | \n", "Web Services | \n", "Yes | \n", "
| 54 | \n", "55 | \n", "41000000 | \n", "860000000 | \n", "2.7% | \n", "ifeng.com | \n", "News & Current Events | \n", "Yes | \n", "
| 55 | \n", "56 | \n", "38000000 | \n", "1700000000 | \n", "2.5% | \n", "linkedin.com | \n", "Social Networks | \n", "Yes | \n", "
| 56 | \n", "57 | \n", "38000000 | \n", "7000000000 | \n", "2.4% | \n", "yandex.ru | \n", "Search Engines | \n", "Yes | \n", "
| 57 | \n", "58 | \n", "37000000 | \n", "10000000000 | \n", "2.4% | \n", "mail.ru | \n", "Email & Messaging | \n", "Yes | \n", "
| 58 | \n", "59 | \n", "35000000 | \n", "280000000 | \n", "2.2% | \n", "partypoker.com | \n", "Cards & Casino Games | \n", "No | \n", "
| 59 | \n", "60 | \n", "34000000 | \n", "880000000 | \n", "2.2% | \n", "megaupload.com | \n", "File Sharing & Hosting | \n", "No | \n", "
| \n", " | ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1001 rows \u00d7 7 columns
\n", "