{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Scraping Population from Wikipedia" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# http://lxml.de/lxmlhtml.html\n", "import requests\n", "from lxml.html import fromstring, parse\n", "from itertools import islice\n", "\n", "# http://stackoverflow.com/a/1779324/7782\n", "import locale\n", "locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) \n", "\n", "url = \"https://en.wikipedia.org/w/index.php?title=List_of_countries_by_population_(United_Nations)&oldid=590438477\"\n", "page = requests.get(url).content.decode(\"UTF-8\")\n", "\n", "doc = fromstring(page)\n", "\n", "def parse_rank(col):\n", " try:\n", " rank = int(col.text)\n", " return rank\n", " except:\n", " return None\n", "\n", "def parse_name(col):\n", " try:\n", " # find all the anchors and if href points is the form \"/wiki\"\n", " name = \"; \".join([a.text for a in col.findall(\".//a\") if a.attrib[\"href\"].startswith(\"/wiki/\")])\n", " return name\n", " except:\n", " return None\n", "\n", "def parse_pop(col):\n", " return locale.atoi(col.text)\n", "\n", "\n", "def country_by_pop():\n", " \n", " for row in islice(doc.xpath(\"\"\"//*[@id=\"mw-content-text\"]/table[1]/tr\"\"\"),2, None):\n", " cols = row.findall(\".//td\")\n", " yield (parse_rank(cols[0]), parse_name(cols[1]), parse_pop(cols[2]))\n", " \n", "for (i, row) in enumerate(islice(country_by_pop(), None)):\n", " print i, \n", " for col in row:\n", " if type(col) == 'unicode':\n", " print col.encode(\"UTF-8\"), \n", " else:\n", " print col, \n", " print" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0 1 China 1385566537\n", "1 2 India 1252139596\n", "2 3 United States 320050716\n", "3 4 Indonesia 249865631\n", "4 5 Brazil 200361925\n", "5 6 Pakistan 182142594\n", "6 7 Nigeria 173615345\n", "7 8 Bangladesh 156594962\n", "8 9 Russia 142833689\n", "9 10 Japan 127143577\n", "10 11 Mexico 122332399\n", "11 12 Philippines 98393574\n", "12 13 Ethiopia 94100756\n", "13 14 Vietnam 91679733\n", "14 15 Germany 82726626\n", "15 16 Egypt 82056378\n", "16 17 Iran 77447168\n", "17 18 Turkey 74932641\n", "18 19 Congo, Democratic Republic of the 67513677\n", "19 20 Thailand 67010502\n", "20 21 France 64291280\n", "21 22 United Kingdom 63136265\n", "22 23 Italy 60990277\n", "23 24 Myanmar 53259018\n", "24 25 South Africa 52776130\n", "25 26 Korea, South 49262698\n", "26 27 Tanzania 49253126\n", "27 28 Colombia 48321405\n", "28 29 Spain 46926963\n", "29 30 Ukraine 45238805\n", "30 31 Kenya 44353691\n", "31 32 Argentina 41446246\n", "32 33 Algeria 39208194\n", "33 34 Poland 38216635\n", "34 35 Sudan 37964306\n", "35 36 Uganda 37578876\n", "36 37 Canada 35181704\n", "37 38 Iraq 33765232\n", "38 39 Morocco 33008150\n", "39 40 Afghanistan 30551674\n", "40 41 Venezuela 30405207\n", "41 42 Peru 30375603\n", "42 43 Malaysia 29716965\n", "43 44 Uzbekistan 28934102\n", "44 45 Saudi Arabia 28828870\n", "45 46 Nepal 27797457\n", "46 47 Ghana 25904598\n", "47 48 Mozambique 25833752\n", "48 49 Korea, North 24895480\n", "49 50 Yemen 24407381\n", "50 51 Australia 23342553\n", "51 52 Taiwan 23329772\n", "52 53 Madagascar 22924851\n", "53 54 Cameroon 22253959\n", "54 55 Syria 21898061\n", "55 56 Romania 21698585\n", "56 57 Angola 21471618\n", "57 58 Sri Lanka 21273228\n", "58 59 C\u00f4te d'Ivoire 20316086\n", "59 60 Niger 17831270\n", "60 61 Chile 17619708\n", "61 62 Burkina Faso 16934839\n", "62 63 Netherlands 16759229\n", "63 64 Kazakhstan 16440586\n", "64 65 Malawi 16362567\n", "65 66 Ecuador 15737878\n", "66 67 Guatemala 15468203\n", "67 68 Mali 15301650\n", "68 69 Cambodia 15135169\n", "69 70 Zambia 14538640\n", "70 71 Zimbabwe 14149648\n", "71 72 Senegal 14133280\n", "72 73 Chad 12825314\n", "73 74 Rwanda 11776522\n", "74 75 Guinea 11745189\n", "75 76 South Sudan 11296173\n", "76 77 Cuba 11265629\n", "77 78 Greece 11127990\n", "78 79 Belgium 11104476\n", "79 80 Tunisia 10996515\n", "80 81 Czech Republic 10702197\n", "81 82 Bolivia 10671200\n", "82 83 Portugal 10608156\n", "83 84 Somalia 10495583\n", "84 85 Dominican Republic 10403761\n", "85 86 Benin 10323474\n", "86 87 Haiti 10317461\n", "87 88 Burundi 10162532\n", "88 89 Hungary 9954941\n", "89 90 Sweden 9571105\n", "90 91 Serbia; Kosovo 9510506\n", "91 92 Azerbaijan 9413420\n", "92 93 Belarus 9356678\n", "93 94 United Arab Emirates 9346129\n", "94 95 Austria 8495145\n", "95 96 Tajikistan 8207834\n", "96 97 Honduras 8097688\n", "97 98 Switzerland 8077833\n", "98 99 Israel 7733144\n", "99 100 Papua New Guinea 7321262\n", "100 101 Jordan 7273799\n", "101 102 Bulgaria 7222943\n", "102 None Hong Kong 7203836\n", "103 103 Togo 6816982\n", "104 104 Paraguay 6802295\n", "105 105 Laos 6769727\n", "106 106 El Salvador 6340454\n", "107 107 Eritrea 6333135\n", "108 108 Libya 6201521\n", "109 109 Sierra Leone 6092075\n", "110 110 Nicaragua 6080478\n", "111 111 Denmark 5619096\n", "112 112 Kyrgyzstan 5547548\n", "113 113 Slovakia 5450223\n", "114 114 Finland 5426323\n", "115 115 Singapore 5411737\n", "116 116 Turkmenistan 5240072\n", "117 117 Norway 5042671\n", "118 118 Costa Rica 4872166\n", "119 119 Lebanon 4821971\n", "120 120 Ireland 4627173\n", "121 121 Central African Republic 4616417\n", "122 122 New Zealand 4505761\n", "123 123 Congo, Republic of the 4447632\n", "124 124 Georgia 4340895\n", "125 125 Palestine 4326295\n", "126 126 Liberia 4294077\n", "127 127 Croatia 4289714\n", "128 128 Mauritania 3889880\n", "129 129 Panama 3864170\n", "130 130 Bosnia and Herzegovina 3829307\n", "131 None Puerto Rico 3688318\n", "132 131 Oman 3632444\n", "133 132 Moldova 3487204\n", "134 133 Uruguay 3407062\n", "135 134 Kuwait 3368572\n", "136 135 Albania 3173271\n", "137 136 Lithuania 3016933\n", "138 137 Armenia 2976566\n", "139 138 Mongolia 2839073\n", "140 139 Jamaica 2783888\n", "141 140 Namibia 2303315\n", "142 141 Qatar 2168673\n", "143 142 Macedonia 2107158\n", "144 143 Lesotho 2074465\n", "145 144 Slovenia 2071997\n", "146 145 Latvia 2050317\n", "147 146 Botswana 2021144\n", "148 147 Gambia 1849285\n", "149 148 Guinea-Bissau 1704255\n", "150 149 Gabon 1671711\n", "151 150 Trinidad and Tobago 1341151\n", "152 151 Bahrain 1332171\n", "153 152 Estonia 1287251\n", "154 153 Swaziland 1249514\n", "155 154 Mauritius 1244403\n", "156 155 Cyprus 1141166\n", "157 156 Timor-Leste 1132879\n", "158 157 Fiji 881065\n", "159 None R\u00e9union 875375\n", "160 158 Djibouti 872932\n", "161 159 Guyana 799613\n", "162 160 Equatorial Guinea 757014\n", "163 161 Bhutan 753947\n", "164 162 Comoros 734917\n", "165 163 Montenegro 621383\n", "166 None Western Sahara 567315\n", "167 None Macau 566375\n", "168 164 Solomon Islands 561231\n", "169 165 Suriname 539276\n", "170 166 Luxembourg 530380\n", "171 167 Cape Verde 498897\n", "172 None Guadeloupe 465800\n", "173 168 Malta 429004\n", "174 169 Brunei 417784\n", "175 None Martinique 403682\n", "176 170 Bahamas 377374\n", "177 171 Maldives 345023\n", "178 172 Belize 331900\n", "179 173 Iceland 329535\n", "180 174 Barbados 284644\n", "181 None French Polynesia 276831\n", "182 None New Caledonia 256496\n", "183 175 Vanuatu 252763\n", "184 None French Guiana 249227\n", "185 None Mayotte 222152\n", "186 176 S\u00e3o Tom\u00e9 and Pr\u00edncipe 192993\n", "187 177 Samoa 190372\n", "188 178 Saint Lucia 182273\n", "189 None Guam 165124\n", "190 None Guernsey; Jersey 162018\n", "191 None Cura\u00e7ao 158760\n", "192 179 Saint Vincent and the Grenadines 109373\n", "193 None Virgin Islands, United States 106627\n", "194 180 Grenada 105897\n", "195 181 Tonga 105323\n", "196 182 Micronesia, Federated States of 103549\n", "197 None Aruba 102911\n", "198 183 Kiribati 102351\n", "199 184 Seychelles 92838\n", "200 185 Antigua and Barbuda 89985\n", "201 None Isle of Man 85888\n", "202 186 Andorra 79218\n", "203 187 Dominica 72003\n", "204 None Bermuda 65341\n", "205 None Cayman Islands 58435\n", "206 None Greenland 56987\n", "207 None American Samoa 55165\n", "208 188 Saint Kitts and Nevis 54191\n", "209 None Northern Mariana Islands 53855\n", "210 189 Marshall Islands 52634\n", "211 None Faroe Islands 49469\n", "212 None Sint Maarten 45233\n", "213 190 Monaco 37831\n", "214 191 Liechtenstein 36925\n", "215 None Turks and Caicos Islands 33098\n", "216 192 San Marino 31448\n", "217 None Gibraltar 29310\n", "218 None Virgin Islands, British 28341\n", "219 193 Palau 20918\n", "220 None Cook Islands 20629\n", "221 None Caribbean Netherlands 19130\n", "222 None Anguilla 14300\n", "223 None Wallis and Futuna 13272\n", "224 194 Nauru 10051\n", "225 195 Tuvalu 9876\n", "226 None Saint Pierre and Miquelon 6043\n", "227 None Montserrat 5091\n", "228 None Saint Helena, Ascension and Tristan da Cunha 4129\n", "229 None Falkland Islands 3044\n", "230 None Niue 1344\n", "231 None Tokelau 1195\n", "232 196 Vatican City 799\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "import json\n", "s = json.dumps([row for row in country_by_pop()], ensure_ascii=True)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "type(s)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print s" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json\n", "\n", "# read population in\n", "import json\n", "import requests\n", "\n", "pop_json_url = \"https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json\"\n", "pop_list= requests.get(pop_json_url).json()\n", "pop_list" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ "[[1, u'China', 1385566537],\n", " [2, u'India', 1252139596],\n", " [3, u'United States', 320050716],\n", " [4, u'Indonesia', 249865631],\n", " [5, u'Brazil', 200361925],\n", " [6, u'Pakistan', 182142594],\n", " [7, u'Nigeria', 173615345],\n", " [8, u'Bangladesh', 156594962],\n", " [9, u'Russia', 142833689],\n", " [10, u'Japan', 127143577],\n", " [11, u'Mexico', 122332399],\n", " [12, u'Philippines', 98393574],\n", " [13, u'Ethiopia', 94100756],\n", " [14, u'Vietnam', 91679733],\n", " [15, u'Germany', 82726626],\n", " [16, u'Egypt', 82056378],\n", " [17, u'Iran', 77447168],\n", " [18, u'Turkey', 74932641],\n", " [19, u'Congo, Democratic Republic of the', 67513677],\n", " [20, u'Thailand', 67010502],\n", " [21, u'France', 64291280],\n", " [22, u'United Kingdom', 63136265],\n", " [23, u'Italy', 60990277],\n", " [24, u'Myanmar', 53259018],\n", " [25, u'South Africa', 52776130],\n", " [26, u'Korea, South', 49262698],\n", " [27, u'Tanzania', 49253126],\n", " [28, u'Colombia', 48321405],\n", " [29, u'Spain', 46926963],\n", " [30, u'Ukraine', 45238805],\n", " [31, u'Kenya', 44353691],\n", " [32, u'Argentina', 41446246],\n", " [33, u'Algeria', 39208194],\n", " [34, u'Poland', 38216635],\n", " [35, u'Sudan', 37964306],\n", " [36, u'Uganda', 37578876],\n", " [37, u'Canada', 35181704],\n", " [38, u'Iraq', 33765232],\n", " [39, u'Morocco', 33008150],\n", " [40, u'Afghanistan', 30551674],\n", " [41, u'Venezuela', 30405207],\n", " [42, u'Peru', 30375603],\n", " [43, u'Malaysia', 29716965],\n", " [44, u'Uzbekistan', 28934102],\n", " [45, u'Saudi Arabia', 28828870],\n", " [46, u'Nepal', 27797457],\n", " [47, u'Ghana', 25904598],\n", " [48, u'Mozambique', 25833752],\n", " [49, u'Korea, North', 24895480],\n", " [50, u'Yemen', 24407381],\n", " [51, u'Australia', 23342553],\n", " [52, u'Taiwan', 23329772],\n", " [53, u'Madagascar', 22924851],\n", " [54, u'Cameroon', 22253959],\n", " [55, u'Syria', 21898061],\n", " [56, u'Romania', 21698585],\n", " [57, u'Angola', 21471618],\n", " [58, u'Sri Lanka', 21273228],\n", " [59, u\"C\\xf4te d'Ivoire\", 20316086],\n", " [60, u'Niger', 17831270],\n", " [61, u'Chile', 17619708],\n", " [62, u'Burkina Faso', 16934839],\n", " [63, u'Netherlands', 16759229],\n", " [64, u'Kazakhstan', 16440586],\n", " [65, u'Malawi', 16362567],\n", " [66, u'Ecuador', 15737878],\n", " [67, u'Guatemala', 15468203],\n", " [68, u'Mali', 15301650],\n", " [69, u'Cambodia', 15135169],\n", " [70, u'Zambia', 14538640],\n", " [71, u'Zimbabwe', 14149648],\n", " [72, u'Senegal', 14133280],\n", " [73, u'Chad', 12825314],\n", " [74, u'Rwanda', 11776522],\n", " [75, u'Guinea', 11745189],\n", " [76, u'South Sudan', 11296173],\n", " [77, u'Cuba', 11265629],\n", " [78, u'Greece', 11127990],\n", " [79, u'Belgium', 11104476],\n", " [80, u'Tunisia', 10996515],\n", " [81, u'Czech Republic', 10702197],\n", " [82, u'Bolivia', 10671200],\n", " [83, u'Portugal', 10608156],\n", " [84, u'Somalia', 10495583],\n", " [85, u'Dominican Republic', 10403761],\n", " [86, u'Benin', 10323474],\n", " [87, u'Haiti', 10317461],\n", " [88, u'Burundi', 10162532],\n", " [89, u'Hungary', 9954941],\n", " [90, u'Sweden', 9571105],\n", " [91, u'Serbia; Kosovo', 9510506],\n", " [92, u'Azerbaijan', 9413420],\n", " [93, u'Belarus', 9356678],\n", " [94, u'United Arab Emirates', 9346129],\n", " [95, u'Austria', 8495145],\n", " [96, u'Tajikistan', 8207834],\n", " [97, u'Honduras', 8097688],\n", " [98, u'Switzerland', 8077833],\n", " [99, u'Israel', 7733144],\n", " [100, u'Papua New Guinea', 7321262],\n", " [101, u'Jordan', 7273799],\n", " [102, u'Bulgaria', 7222943],\n", " [None, u'Hong Kong', 7203836],\n", " [103, u'Togo', 6816982],\n", " [104, u'Paraguay', 6802295],\n", " [105, u'Laos', 6769727],\n", " [106, u'El Salvador', 6340454],\n", " [107, u'Eritrea', 6333135],\n", " [108, u'Libya', 6201521],\n", " [109, u'Sierra Leone', 6092075],\n", " [110, u'Nicaragua', 6080478],\n", " [111, u'Denmark', 5619096],\n", " [112, u'Kyrgyzstan', 5547548],\n", " [113, u'Slovakia', 5450223],\n", " [114, u'Finland', 5426323],\n", " [115, u'Singapore', 5411737],\n", " [116, u'Turkmenistan', 5240072],\n", " [117, u'Norway', 5042671],\n", " [118, u'Costa Rica', 4872166],\n", " [119, u'Lebanon', 4821971],\n", " [120, u'Ireland', 4627173],\n", " [121, u'Central African Republic', 4616417],\n", " [122, u'New Zealand', 4505761],\n", " [123, u'Congo, Republic of the', 4447632],\n", " [124, u'Georgia', 4340895],\n", " [125, u'Palestine', 4326295],\n", " [126, u'Liberia', 4294077],\n", " [127, u'Croatia', 4289714],\n", " [128, u'Mauritania', 3889880],\n", " [129, u'Panama', 3864170],\n", " [130, u'Bosnia and Herzegovina', 3829307],\n", " [None, u'Puerto Rico', 3688318],\n", " [131, u'Oman', 3632444],\n", " [132, u'Moldova', 3487204],\n", " [133, u'Uruguay', 3407062],\n", " [134, u'Kuwait', 3368572],\n", " [135, u'Albania', 3173271],\n", " [136, u'Lithuania', 3016933],\n", " [137, u'Armenia', 2976566],\n", " [138, u'Mongolia', 2839073],\n", " [139, u'Jamaica', 2783888],\n", " [140, u'Namibia', 2303315],\n", " [141, u'Qatar', 2168673],\n", " [142, u'Macedonia', 2107158],\n", " [143, u'Lesotho', 2074465],\n", " [144, u'Slovenia', 2071997],\n", " [145, u'Latvia', 2050317],\n", " [146, u'Botswana', 2021144],\n", " [147, u'Gambia', 1849285],\n", " [148, u'Guinea-Bissau', 1704255],\n", " [149, u'Gabon', 1671711],\n", " [150, u'Trinidad and Tobago', 1341151],\n", " [151, u'Bahrain', 1332171],\n", " [152, u'Estonia', 1287251],\n", " [153, u'Swaziland', 1249514],\n", " [154, u'Mauritius', 1244403],\n", " [155, u'Cyprus', 1141166],\n", " [156, u'Timor-Leste', 1132879],\n", " [157, u'Fiji', 881065],\n", " [None, u'R\\xe9union', 875375],\n", " [158, u'Djibouti', 872932],\n", " [159, u'Guyana', 799613],\n", " [160, u'Equatorial Guinea', 757014],\n", " [161, u'Bhutan', 753947],\n", " [162, u'Comoros', 734917],\n", " [163, u'Montenegro', 621383],\n", " [None, u'Western Sahara', 567315],\n", " [None, u'Macau', 566375],\n", " [164, u'Solomon Islands', 561231],\n", " [165, u'Suriname', 539276],\n", " [166, u'Luxembourg', 530380],\n", " [167, u'Cape Verde', 498897],\n", " [None, u'Guadeloupe', 465800],\n", " [168, u'Malta', 429004],\n", " [169, u'Brunei', 417784],\n", " [None, u'Martinique', 403682],\n", " [170, u'Bahamas', 377374],\n", " [171, u'Maldives', 345023],\n", " [172, u'Belize', 331900],\n", " [173, u'Iceland', 329535],\n", " [174, u'Barbados', 284644],\n", " [None, u'French Polynesia', 276831],\n", " [None, u'New Caledonia', 256496],\n", " [175, u'Vanuatu', 252763],\n", " [None, u'French Guiana', 249227],\n", " [None, u'Mayotte', 222152],\n", " [176, u'S\\xe3o Tom\\xe9 and Pr\\xedncipe', 192993],\n", " [177, u'Samoa', 190372],\n", " [178, u'Saint Lucia', 182273],\n", " [None, u'Guam', 165124],\n", " [None, u'Guernsey; Jersey', 162018],\n", " [None, u'Cura\\xe7ao', 158760],\n", " [179, u'Saint Vincent and the Grenadines', 109373],\n", " [None, u'Virgin Islands, United States', 106627],\n", " [180, u'Grenada', 105897],\n", " [181, u'Tonga', 105323],\n", " [182, u'Micronesia, Federated States of', 103549],\n", " [None, u'Aruba', 102911],\n", " [183, u'Kiribati', 102351],\n", " [184, u'Seychelles', 92838],\n", " [185, u'Antigua and Barbuda', 89985],\n", " [None, u'Isle of Man', 85888],\n", " [186, u'Andorra', 79218],\n", " [187, u'Dominica', 72003],\n", " [None, u'Bermuda', 65341],\n", " [None, u'Cayman Islands', 58435],\n", " [None, u'Greenland', 56987],\n", " [None, u'American Samoa', 55165],\n", " [188, u'Saint Kitts and Nevis', 54191],\n", " [None, u'Northern Mariana Islands', 53855],\n", " [189, u'Marshall Islands', 52634],\n", " [None, u'Faroe Islands', 49469],\n", " [None, u'Sint Maarten', 45233],\n", " [190, u'Monaco', 37831],\n", " [191, u'Liechtenstein', 36925],\n", " [None, u'Turks and Caicos Islands', 33098],\n", " [192, u'San Marino', 31448],\n", " [None, u'Gibraltar', 29310],\n", " [None, u'Virgin Islands, British', 28341],\n", " [193, u'Palau', 20918],\n", " [None, u'Cook Islands', 20629],\n", " [None, u'Caribbean Netherlands', 19130],\n", " [None, u'Anguilla', 14300],\n", " [None, u'Wallis and Futuna', 13272],\n", " [194, u'Nauru', 10051],\n", " [195, u'Tuvalu', 9876],\n", " [None, u'Saint Pierre and Miquelon', 6043],\n", " [None, u'Montserrat', 5091],\n", " [None, u'Saint Helena, Ascension and Tristan da Cunha', 4129],\n", " [None, u'Falkland Islands', 3044],\n", " [None, u'Niue', 1344],\n", " [None, u'Tokelau', 1195],\n", " [196, u'Vatican City', 799]]" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "world_pop = sum([r[2] for r in pop_list])\n", "world_pop" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ "7162119434L" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "# http://stackoverflow.com/a/15889203/7782\n", "def cumsum(lis):\n", " total = 0\n", " for x in lis:\n", " total += x\n", " yield total\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "cum_pop = list(cumsum((r[2] for r in pop_list)))\n", "cum_pop" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ "[1385566537,\n", " 2637706133L,\n", " 2957756849L,\n", " 3207622480L,\n", " 3407984405L,\n", " 3590126999L,\n", " 3763742344L,\n", " 3920337306L,\n", " 4063170995L,\n", " 4190314572L,\n", " 4312646971L,\n", " 4411040545L,\n", " 4505141301L,\n", " 4596821034L,\n", " 4679547660L,\n", " 4761604038L,\n", " 4839051206L,\n", " 4913983847L,\n", " 4981497524L,\n", " 5048508026L,\n", " 5112799306L,\n", " 5175935571L,\n", " 5236925848L,\n", " 5290184866L,\n", " 5342960996L,\n", " 5392223694L,\n", " 5441476820L,\n", " 5489798225L,\n", " 5536725188L,\n", " 5581963993L,\n", " 5626317684L,\n", " 5667763930L,\n", " 5706972124L,\n", " 5745188759L,\n", " 5783153065L,\n", " 5820731941L,\n", " 5855913645L,\n", " 5889678877L,\n", " 5922687027L,\n", " 5953238701L,\n", " 5983643908L,\n", " 6014019511L,\n", " 6043736476L,\n", " 6072670578L,\n", " 6101499448L,\n", " 6129296905L,\n", " 6155201503L,\n", " 6181035255L,\n", " 6205930735L,\n", " 6230338116L,\n", " 6253680669L,\n", " 6277010441L,\n", " 6299935292L,\n", " 6322189251L,\n", " 6344087312L,\n", " 6365785897L,\n", " 6387257515L,\n", " 6408530743L,\n", " 6428846829L,\n", " 6446678099L,\n", " 6464297807L,\n", " 6481232646L,\n", " 6497991875L,\n", " 6514432461L,\n", " 6530795028L,\n", " 6546532906L,\n", " 6562001109L,\n", " 6577302759L,\n", " 6592437928L,\n", " 6606976568L,\n", " 6621126216L,\n", " 6635259496L,\n", " 6648084810L,\n", " 6659861332L,\n", " 6671606521L,\n", " 6682902694L,\n", " 6694168323L,\n", " 6705296313L,\n", " 6716400789L,\n", " 6727397304L,\n", " 6738099501L,\n", " 6748770701L,\n", " 6759378857L,\n", " 6769874440L,\n", " 6780278201L,\n", " 6790601675L,\n", " 6800919136L,\n", " 6811081668L,\n", " 6821036609L,\n", " 6830607714L,\n", " 6840118220L,\n", " 6849531640L,\n", " 6858888318L,\n", " 6868234447L,\n", " 6876729592L,\n", " 6884937426L,\n", " 6893035114L,\n", " 6901112947L,\n", " 6908846091L,\n", " 6916167353L,\n", " 6923441152L,\n", " 6930664095L,\n", " 6937867931L,\n", " 6944684913L,\n", " 6951487208L,\n", " 6958256935L,\n", " 6964597389L,\n", " 6970930524L,\n", " 6977132045L,\n", " 6983224120L,\n", " 6989304598L,\n", " 6994923694L,\n", " 7000471242L,\n", " 7005921465L,\n", " 7011347788L,\n", " 7016759525L,\n", " 7021999597L,\n", " 7027042268L,\n", " 7031914434L,\n", " 7036736405L,\n", " 7041363578L,\n", " 7045979995L,\n", " 7050485756L,\n", " 7054933388L,\n", " 7059274283L,\n", " 7063600578L,\n", " 7067894655L,\n", " 7072184369L,\n", " 7076074249L,\n", " 7079938419L,\n", " 7083767726L,\n", " 7087456044L,\n", " 7091088488L,\n", " 7094575692L,\n", " 7097982754L,\n", " 7101351326L,\n", " 7104524597L,\n", " 7107541530L,\n", " 7110518096L,\n", " 7113357169L,\n", " 7116141057L,\n", " 7118444372L,\n", " 7120613045L,\n", " 7122720203L,\n", " 7124794668L,\n", " 7126866665L,\n", " 7128916982L,\n", " 7130938126L,\n", " 7132787411L,\n", " 7134491666L,\n", " 7136163377L,\n", " 7137504528L,\n", " 7138836699L,\n", " 7140123950L,\n", " 7141373464L,\n", " 7142617867L,\n", " 7143759033L,\n", " 7144891912L,\n", " 7145772977L,\n", " 7146648352L,\n", " 7147521284L,\n", " 7148320897L,\n", " 7149077911L,\n", " 7149831858L,\n", " 7150566775L,\n", " 7151188158L,\n", " 7151755473L,\n", " 7152321848L,\n", " 7152883079L,\n", " 7153422355L,\n", " 7153952735L,\n", " 7154451632L,\n", " 7154917432L,\n", " 7155346436L,\n", " 7155764220L,\n", " 7156167902L,\n", " 7156545276L,\n", " 7156890299L,\n", " 7157222199L,\n", " 7157551734L,\n", " 7157836378L,\n", " 7158113209L,\n", " 7158369705L,\n", " 7158622468L,\n", " 7158871695L,\n", " 7159093847L,\n", " 7159286840L,\n", " 7159477212L,\n", " 7159659485L,\n", " 7159824609L,\n", " 7159986627L,\n", " 7160145387L,\n", " 7160254760L,\n", " 7160361387L,\n", " 7160467284L,\n", " 7160572607L,\n", " 7160676156L,\n", " 7160779067L,\n", " 7160881418L,\n", " 7160974256L,\n", " 7161064241L,\n", " 7161150129L,\n", " 7161229347L,\n", " 7161301350L,\n", " 7161366691L,\n", " 7161425126L,\n", " 7161482113L,\n", " 7161537278L,\n", " 7161591469L,\n", " 7161645324L,\n", " 7161697958L,\n", " 7161747427L,\n", " 7161792660L,\n", " 7161830491L,\n", " 7161867416L,\n", " 7161900514L,\n", " 7161931962L,\n", " 7161961272L,\n", " 7161989613L,\n", " 7162010531L,\n", " 7162031160L,\n", " 7162050290L,\n", " 7162064590L,\n", " 7162077862L,\n", " 7162087913L,\n", " 7162097789L,\n", " 7162103832L,\n", " 7162108923L,\n", " 7162113052L,\n", " 7162116096L,\n", " 7162117440L,\n", " 7162118635L,\n", " 7162119434L]" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "import bisect\n", "import random" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "# http://docs.python.org/2/library/bisect.html\n", "bisect.bisect_left(cum_pop,world_pop/2)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ "5" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "float(cum_pop[5])/world_pop" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ "0.5012660054169099" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "len(cum_pop)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ "233" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "pop_list[0][1]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ "u'China'" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "from itertools import repeat\n", "from collections import Counter\n", "\n", "def random_country_weighted_by_pop():\n", " while True:\n", " yield pop_list[bisect.bisect_left(cum_pop,random.randint(1,world_pop))][1]\n", " \n", "Counter(islice(random_country_weighted_by_pop(),5))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 12, "text": [ "Counter({u'United Kingdom': 1, u'Thailand': 1, u'Tanzania': 1, u'China': 1, u'Bangladesh': 1})" ] } ], "prompt_number": 12 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "CIA Handbook" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import requests\n", "import locale\n", "import json\n", "\n", "locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) \n", "\n", "cia_url = \"https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt\"\n", "content = requests.get(cia_url).content" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 40 }, { "cell_type": "code", "collapsed": false, "input": [ "cia_pop_list = [(int(x[0]), x[1], locale.atoi(x[2])) for x in [r.split(\"\\t\") for r in content.strip().split(\"\\r\")]]\n", "cia_pop_list" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 14, "text": [ "[(1, 'China', 1349585838),\n", " (2, 'India', 1220800359),\n", " (3, 'European Union', 509365627),\n", " (4, 'United States', 316438601),\n", " (5, 'Indonesia', 251160124),\n", " (6, 'Brazil', 201009622),\n", " (7, 'Pakistan', 193238868),\n", " (8, 'Nigeria', 174507539),\n", " (9, 'Bangladesh', 163654860),\n", " (10, 'Russia', 142500482),\n", " (11, 'Japan', 127253075),\n", " (12, 'Mexico', 118818228),\n", " (13, 'Philippines', 105720644),\n", " (14, 'Ethiopia', 93877025),\n", " (15, 'Vietnam', 92477857),\n", " (16, 'Egypt', 85294388),\n", " (17, 'Germany', 81147265),\n", " (18, 'Turkey', 80694485),\n", " (19, 'Iran', 79853900),\n", " (20, 'Congo, Democratic Republic of the', 75507308),\n", " (21, 'Thailand', 67497151),\n", " (22, 'France', 65951611),\n", " (23, 'United Kingdom', 63395574),\n", " (24, 'Italy', 61482297),\n", " (25, 'Burma', 55167330),\n", " (26, 'Korea, South', 48955203),\n", " (27, 'South Africa', 48601098),\n", " (28, 'Tanzania', 48261942),\n", " (29, 'Spain', 47370542),\n", " (30, 'Colombia', 45745783),\n", " (31, 'Ukraine', 44573205),\n", " (32, 'Kenya', 44037656),\n", " (33, 'Argentina', 42610981),\n", " (34, 'Poland', 38383809),\n", " (35, 'Algeria', 38087812),\n", " (36, 'Sudan', 34847910),\n", " (37, 'Uganda', 34758809),\n", " (38, 'Canada', 34568211),\n", " (39, 'Morocco', 32649130),\n", " (40, 'Iraq', 31858481),\n", " (41, 'Afghanistan', 31108077),\n", " (42, 'Nepal', 30430267),\n", " (43, 'Peru', 29849303),\n", " (44, 'Malaysia', 29628392),\n", " (45, 'Uzbekistan', 28661637),\n", " (46, 'Venezuela', 28459085),\n", " (47, 'Saudi Arabia', 26939583),\n", " (48, 'Yemen', 25338458),\n", " (49, 'Ghana', 25199609),\n", " (50, 'Korea, North', 24720407),\n", " (51, 'Mozambique', 24096669),\n", " (52, 'Taiwan', 23299716),\n", " (53, 'Madagascar', 22599098),\n", " (54, 'Cameroon', 22534532),\n", " (55, 'Syria', 22457336),\n", " (56, \"Cote d'Ivoire\", 22400835),\n", " (57, 'Australia', 22262501),\n", " (58, 'Romania', 21790479),\n", " (59, 'Sri Lanka', 21675648),\n", " (60, 'Angola', 18565269),\n", " (61, 'Burkina Faso', 17812961),\n", " (62, 'Kazakhstan', 17736896),\n", " (63, 'Chile', 17216945),\n", " (64, 'Niger', 16899327),\n", " (65, 'Netherlands', 16805037),\n", " (66, 'Malawi', 16777547),\n", " (67, 'Mali', 15968882),\n", " (68, 'Ecuador', 15439429),\n", " (69, 'Cambodia', 15205539),\n", " (70, 'Guatemala', 14373472),\n", " (71, 'Zambia', 14222233),\n", " (72, 'Senegal', 13300410),\n", " (73, 'Zimbabwe', 13182908),\n", " (74, 'Rwanda', 12012589),\n", " (75, 'Chad', 11193452),\n", " (76, 'Guinea', 11176026),\n", " (77, 'South Sudan', 11090104),\n", " (78, 'Cuba', 11061886),\n", " (79, 'Tunisia', 10835873),\n", " (80, 'Portugal', 10799270),\n", " (81, 'Greece', 10772967),\n", " (82, 'Czech Republic', 10609762),\n", " (83, 'Bolivia', 10461053),\n", " (84, 'Belgium', 10444268),\n", " (85, 'Somalia', 10251568),\n", " (86, 'Dominican Republic', 10219630),\n", " (87, 'Hungary', 9939470),\n", " (88, 'Haiti', 9893934),\n", " (89, 'Benin', 9877292),\n", " (90, 'Sweden', 9647386),\n", " (91, 'Belarus', 9625888),\n", " (92, 'Azerbaijan', 9590159),\n", " (93, 'Honduras', 8448465),\n", " (94, 'Austria', 8221646),\n", " (95, 'Switzerland', 7996026),\n", " (96, 'Tajikistan', 7910041),\n", " (97, 'Israel', 7707042),\n", " (98, 'Serbia', 7243007),\n", " (99, 'Togo', 7154237),\n", " (100, 'Hong Kong', 7082316),\n", " (101, 'Bulgaria', 6981642),\n", " (102, 'Laos', 6695166),\n", " (103, 'Paraguay', 6623252),\n", " (104, 'Jordan', 6482081),\n", " (105, 'Papua New Guinea', 6431902),\n", " (106, 'Eritrea', 6233682),\n", " (107, 'El Salvador', 6108590),\n", " (108, 'Libya', 6002347),\n", " (109, 'Nicaragua', 5788531),\n", " (110, 'Sierra Leone', 5612685),\n", " (111, 'Denmark', 5556452),\n", " (112, 'Kyrgyzstan', 5548042),\n", " (113, 'Slovakia', 5488339),\n", " (114, 'United Arab Emirates', 5473972),\n", " (115, 'Singapore', 5460302),\n", " (116, 'Finland', 5266114),\n", " (117, 'Central African Republic', 5166510),\n", " (118, 'Turkmenistan', 5113040),\n", " (119, 'Norway', 5085582),\n", " (120, 'Georgia', 4942157),\n", " (121, 'Ireland', 4775982),\n", " (122, 'Costa Rica', 4695942),\n", " (123, 'Congo, Republic of the', 4574099),\n", " (124, 'Croatia', 4475611),\n", " (125, 'New Zealand', 4365113),\n", " (126, 'Lebanon', 4131583),\n", " (127, 'Liberia', 3989703),\n", " (128, 'Bosnia and Herzegovina', 3875723),\n", " (129, 'Puerto Rico', 3645648),\n", " (130, 'Moldova', 3619925),\n", " (131, 'Panama', 3559408),\n", " (132, 'Lithuania', 3515858),\n", " (133, 'Mauritania', 3437610),\n", " (134, 'Uruguay', 3324460),\n", " (135, 'Oman', 3154134),\n", " (136, 'Armenia', 3064267),\n", " (137, 'Albania', 3011405),\n", " (138, 'Mongolia', 2912192),\n", " (139, 'Jamaica', 2909714),\n", " (140, 'Kuwait', 2695316),\n", " (141, 'West Bank', 2676740),\n", " (142, 'Namibia', 2182852),\n", " (143, 'Latvia', 2178443),\n", " (144, 'Botswana', 2127825),\n", " (145, 'Macedonia', 2087171),\n", " (146, 'Qatar', 2042444),\n", " (147, 'Slovenia', 1992690),\n", " (148, 'Lesotho', 1936181),\n", " (149, 'Gambia, The', 1883051),\n", " (150, 'Kosovo', 1847708),\n", " (151, 'Gaza Strip', 1763387),\n", " (152, 'Guinea-Bissau', 1660870),\n", " (153, 'Gabon', 1640286),\n", " (154, 'Swaziland', 1403362),\n", " (155, 'Mauritius', 1322238),\n", " (156, 'Bahrain', 1281332),\n", " (157, 'Estonia', 1266375),\n", " (158, 'Trinidad and Tobago', 1225225),\n", " (159, 'Timor-Leste', 1172390),\n", " (160, 'Cyprus', 1155403),\n", " (161, 'Burundi', 1060714),\n", " (162, 'Fiji', 896758),\n", " (163, 'Djibouti', 792198),\n", " (164, 'Comoros', 752288),\n", " (165, 'Guyana', 739903),\n", " (166, 'Bhutan', 725296),\n", " (167, 'Equatorial Guinea', 704001),\n", " (168, 'Montenegro', 653474),\n", " (169, 'Solomon Islands', 597248),\n", " (170, 'Macau', 583003),\n", " (171, 'Suriname', 566846),\n", " (172, 'Western Sahara', 538811),\n", " (173, 'Cabo Verde', 531046),\n", " (174, 'Luxembourg', 514862),\n", " (175, 'Brunei', 415717),\n", " (176, 'Malta', 411277),\n", " (177, 'Maldives', 393988),\n", " (178, 'Belize', 334297),\n", " (179, 'Bahamas, The', 319031),\n", " (180, 'Iceland', 315281),\n", " (181, 'Barbados', 288725),\n", " (182, 'French Polynesia', 277293),\n", " (183, 'New Caledonia', 264022),\n", " (184, 'Vanuatu', 261565),\n", " (185, 'Samoa', 195476),\n", " (186, 'Sao Tome and Principe', 186817),\n", " (187, 'Saint Lucia', 162781),\n", " (188, 'Guam', 160378),\n", " (189, 'Curacao', 146836),\n", " (190, 'Grenada', 109590),\n", " (191, 'Aruba', 109153),\n", " (192, 'Tonga', 106322),\n", " (193, 'Micronesia, Federated States of', 106104),\n", " (194, 'Virgin Islands', 104737),\n", " (195, 'Kiribati', 103248),\n", " (196, 'Saint Vincent and the Grenadines', 103220),\n", " (197, 'Jersey', 95732),\n", " (198, 'Seychelles', 90846),\n", " (199, 'Antigua and Barbuda', 90156),\n", " (200, 'Isle of Man', 86159),\n", " (201, 'Andorra', 85293),\n", " (202, 'Dominica', 73286),\n", " (203, 'Marshall Islands', 69747),\n", " (204, 'Bermuda', 69467),\n", " (205, 'Guernsey', 65605),\n", " (206, 'Greenland', 57714),\n", " (207, 'American Samoa', 54719),\n", " (208, 'Cayman Islands', 53737),\n", " (209, 'Northern Mariana Islands', 51170),\n", " (210, 'Saint Kitts and Nevis', 51134),\n", " (211, 'Faroe Islands', 49709),\n", " (212, 'Turks and Caicos Islands', 47754),\n", " (213, 'Sint Maarten', 39689),\n", " (214, 'Liechtenstein', 37009),\n", " (215, 'San Marino', 32448),\n", " (216, 'British Virgin Islands', 31912),\n", " (217, 'Saint Martin', 31264),\n", " (218, 'Monaco', 30500),\n", " (219, 'Gibraltar', 29111),\n", " (220, 'Palau', 21108),\n", " (221, 'Anguilla', 15754),\n", " (222, 'Dhekelia', 15700),\n", " (223, 'Akrotiri', 15700),\n", " (224, 'Wallis and Futuna', 15507),\n", " (225, 'Tuvalu', 10698),\n", " (226, 'Cook Islands', 10447),\n", " (227, 'Nauru', 9434),\n", " (228, 'Saint Helena, Ascension, and Tristan da Cunha', 7754),\n", " (229, 'Saint Barthelemy', 7298),\n", " (230, 'Saint Pierre and Miquelon', 5774),\n", " (231, 'Montserrat', 5189),\n", " (232, 'Falkland Islands (Islas Malvinas)', 3140),\n", " (233, 'Norfolk Island', 2196),\n", " (234, 'Svalbard', 1921),\n", " (235, 'Christmas Island', 1513),\n", " (236, 'Tokelau', 1353),\n", " (237, 'Niue', 1229),\n", " (238, 'Holy See (Vatican City)', 839),\n", " (239, 'Cocos (Keeling) Islands', 596),\n", " (240, 'Pitcairn Islands', 65)]" ] } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "print json.dumps(cia_pop_list)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[[1, \"China\", 1349585838], [2, \"India\", 1220800359], [3, \"European Union\", 509365627], [4, \"United States\", 316438601], [5, \"Indonesia\", 251160124], [6, \"Brazil\", 201009622], [7, \"Pakistan\", 193238868], [8, \"Nigeria\", 174507539], [9, \"Bangladesh\", 163654860], [10, \"Russia\", 142500482], [11, \"Japan\", 127253075], [12, \"Mexico\", 118818228], [13, \"Philippines\", 105720644], [14, \"Ethiopia\", 93877025], [15, \"Vietnam\", 92477857], [16, \"Egypt\", 85294388], [17, \"Germany\", 81147265], [18, \"Turkey\", 80694485], [19, \"Iran\", 79853900], [20, \"Congo, Democratic Republic of the\", 75507308], [21, \"Thailand\", 67497151], [22, \"France\", 65951611], [23, \"United Kingdom\", 63395574], [24, \"Italy\", 61482297], [25, \"Burma\", 55167330], [26, \"Korea, South\", 48955203], [27, \"South Africa\", 48601098], [28, \"Tanzania\", 48261942], [29, \"Spain\", 47370542], [30, \"Colombia\", 45745783], [31, \"Ukraine\", 44573205], [32, \"Kenya\", 44037656], [33, \"Argentina\", 42610981], [34, \"Poland\", 38383809], [35, \"Algeria\", 38087812], [36, \"Sudan\", 34847910], [37, \"Uganda\", 34758809], [38, \"Canada\", 34568211], [39, \"Morocco\", 32649130], [40, \"Iraq\", 31858481], [41, \"Afghanistan\", 31108077], [42, \"Nepal\", 30430267], [43, \"Peru\", 29849303], [44, \"Malaysia\", 29628392], [45, \"Uzbekistan\", 28661637], [46, \"Venezuela\", 28459085], [47, \"Saudi Arabia\", 26939583], [48, \"Yemen\", 25338458], [49, \"Ghana\", 25199609], [50, \"Korea, North\", 24720407], [51, \"Mozambique\", 24096669], [52, \"Taiwan\", 23299716], [53, \"Madagascar\", 22599098], [54, \"Cameroon\", 22534532], [55, \"Syria\", 22457336], [56, \"Cote d'Ivoire\", 22400835], [57, \"Australia\", 22262501], [58, \"Romania\", 21790479], [59, \"Sri Lanka\", 21675648], [60, \"Angola\", 18565269], [61, \"Burkina Faso\", 17812961], [62, \"Kazakhstan\", 17736896], [63, \"Chile\", 17216945], [64, \"Niger\", 16899327], [65, \"Netherlands\", 16805037], [66, \"Malawi\", 16777547], [67, \"Mali\", 15968882], [68, \"Ecuador\", 15439429], [69, \"Cambodia\", 15205539], [70, \"Guatemala\", 14373472], [71, \"Zambia\", 14222233], [72, \"Senegal\", 13300410], [73, \"Zimbabwe\", 13182908], [74, \"Rwanda\", 12012589], [75, \"Chad\", 11193452], [76, \"Guinea\", 11176026], [77, \"South Sudan\", 11090104], [78, \"Cuba\", 11061886], [79, \"Tunisia\", 10835873], [80, \"Portugal\", 10799270], [81, \"Greece\", 10772967], [82, \"Czech Republic\", 10609762], [83, \"Bolivia\", 10461053], [84, \"Belgium\", 10444268], [85, \"Somalia\", 10251568], [86, \"Dominican Republic\", 10219630], [87, \"Hungary\", 9939470], [88, \"Haiti\", 9893934], [89, \"Benin\", 9877292], [90, \"Sweden\", 9647386], [91, \"Belarus\", 9625888], [92, \"Azerbaijan\", 9590159], [93, \"Honduras\", 8448465], [94, \"Austria\", 8221646], [95, \"Switzerland\", 7996026], [96, \"Tajikistan\", 7910041], [97, \"Israel\", 7707042], [98, \"Serbia\", 7243007], [99, \"Togo\", 7154237], [100, \"Hong Kong\", 7082316], [101, \"Bulgaria\", 6981642], [102, \"Laos\", 6695166], [103, \"Paraguay\", 6623252], [104, \"Jordan\", 6482081], [105, \"Papua New Guinea\", 6431902], [106, \"Eritrea\", 6233682], [107, \"El Salvador\", 6108590], [108, \"Libya\", 6002347], [109, \"Nicaragua\", 5788531], [110, \"Sierra Leone\", 5612685], [111, \"Denmark\", 5556452], [112, \"Kyrgyzstan\", 5548042], [113, \"Slovakia\", 5488339], [114, \"United Arab Emirates\", 5473972], [115, \"Singapore\", 5460302], [116, \"Finland\", 5266114], [117, \"Central African Republic\", 5166510], [118, \"Turkmenistan\", 5113040], [119, \"Norway\", 5085582], [120, \"Georgia\", 4942157], [121, \"Ireland\", 4775982], [122, \"Costa Rica\", 4695942], [123, \"Congo, Republic of the\", 4574099], [124, \"Croatia\", 4475611], [125, \"New Zealand\", 4365113], [126, \"Lebanon\", 4131583], [127, \"Liberia\", 3989703], [128, \"Bosnia and Herzegovina\", 3875723], [129, \"Puerto Rico\", 3645648], [130, \"Moldova\", 3619925], [131, \"Panama\", 3559408], [132, \"Lithuania\", 3515858], [133, \"Mauritania\", 3437610], [134, \"Uruguay\", 3324460], [135, \"Oman\", 3154134], [136, \"Armenia\", 3064267], [137, \"Albania\", 3011405], [138, \"Mongolia\", 2912192], [139, \"Jamaica\", 2909714], [140, \"Kuwait\", 2695316], [141, \"West Bank\", 2676740], [142, \"Namibia\", 2182852], [143, \"Latvia\", 2178443], [144, \"Botswana\", 2127825], [145, \"Macedonia\", 2087171], [146, \"Qatar\", 2042444], [147, \"Slovenia\", 1992690], [148, \"Lesotho\", 1936181], [149, \"Gambia, The\", 1883051], [150, \"Kosovo\", 1847708], [151, \"Gaza Strip\", 1763387], [152, \"Guinea-Bissau\", 1660870], [153, \"Gabon\", 1640286], [154, \"Swaziland\", 1403362], [155, \"Mauritius\", 1322238], [156, \"Bahrain\", 1281332], [157, \"Estonia\", 1266375], [158, \"Trinidad and Tobago\", 1225225], [159, \"Timor-Leste\", 1172390], [160, \"Cyprus\", 1155403], [161, \"Burundi\", 1060714], [162, \"Fiji\", 896758], [163, \"Djibouti\", 792198], [164, \"Comoros\", 752288], [165, \"Guyana\", 739903], [166, \"Bhutan\", 725296], [167, \"Equatorial Guinea\", 704001], [168, \"Montenegro\", 653474], [169, \"Solomon Islands\", 597248], [170, \"Macau\", 583003], [171, \"Suriname\", 566846], [172, \"Western Sahara\", 538811], [173, \"Cabo Verde\", 531046], [174, \"Luxembourg\", 514862], [175, \"Brunei\", 415717], [176, \"Malta\", 411277], [177, \"Maldives\", 393988], [178, \"Belize\", 334297], [179, \"Bahamas, The\", 319031], [180, \"Iceland\", 315281], [181, \"Barbados\", 288725], [182, \"French Polynesia\", 277293], [183, \"New Caledonia\", 264022], [184, \"Vanuatu\", 261565], [185, \"Samoa\", 195476], [186, \"Sao Tome and Principe\", 186817], [187, \"Saint Lucia\", 162781], [188, \"Guam\", 160378], [189, \"Curacao\", 146836], [190, \"Grenada\", 109590], [191, \"Aruba\", 109153], [192, \"Tonga\", 106322], [193, \"Micronesia, Federated States of\", 106104], [194, \"Virgin Islands\", 104737], [195, \"Kiribati\", 103248], [196, \"Saint Vincent and the Grenadines\", 103220], [197, \"Jersey\", 95732], [198, \"Seychelles\", 90846], [199, \"Antigua and Barbuda\", 90156], [200, \"Isle of Man\", 86159], [201, \"Andorra\", 85293], [202, \"Dominica\", 73286], [203, \"Marshall Islands\", 69747], [204, \"Bermuda\", 69467], [205, \"Guernsey\", 65605], [206, \"Greenland\", 57714], [207, \"American Samoa\", 54719], [208, \"Cayman Islands\", 53737], [209, \"Northern Mariana Islands\", 51170], [210, \"Saint Kitts and Nevis\", 51134], [211, \"Faroe Islands\", 49709], [212, \"Turks and Caicos Islands\", 47754], [213, \"Sint Maarten\", 39689], [214, \"Liechtenstein\", 37009], [215, \"San Marino\", 32448], [216, \"British Virgin Islands\", 31912], [217, \"Saint Martin\", 31264], [218, \"Monaco\", 30500], [219, \"Gibraltar\", 29111], [220, \"Palau\", 21108], [221, \"Anguilla\", 15754], [222, \"Dhekelia\", 15700], [223, \"Akrotiri\", 15700], [224, \"Wallis and Futuna\", 15507], [225, \"Tuvalu\", 10698], [226, \"Cook Islands\", 10447], [227, \"Nauru\", 9434], [228, \"Saint Helena, Ascension, and Tristan da Cunha\", 7754], [229, \"Saint Barthelemy\", 7298], [230, \"Saint Pierre and Miquelon\", 5774], [231, \"Montserrat\", 5189], [232, \"Falkland Islands (Islas Malvinas)\", 3140], [233, \"Norfolk Island\", 2196], [234, \"Svalbard\", 1921], [235, \"Christmas Island\", 1513], [236, \"Tokelau\", 1353], [237, \"Niue\", 1229], [238, \"Holy See (Vatican City)\", 839], [239, \"Cocos (Keeling) Islands\", 596], [240, \"Pitcairn Islands\", 65]]\n" ] } ], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "# https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json\n", "# https://gist.github.com/rdhyee/8530164\n", "# https://www.cia.gov/library/publications/the-world-factbook/rankorder/2119rank.html\n", "# https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt\n", "\n", "\n", "import json\n", "import requests\n", "\n", "cia_json_url = \"https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json\"\n", "cia_list= requests.get(cia_json_url).json()\n", "cia_list" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 41, "text": [ "[[1, u'China', 1349585838],\n", " [2, u'India', 1220800359],\n", " [3, u'European Union', 509365627],\n", " [4, u'United States', 316438601],\n", " [5, u'Indonesia', 251160124],\n", " [6, u'Brazil', 201009622],\n", " [7, u'Pakistan', 193238868],\n", " [8, u'Nigeria', 174507539],\n", " [9, u'Bangladesh', 163654860],\n", " [10, u'Russia', 142500482],\n", " [11, u'Japan', 127253075],\n", " [12, u'Mexico', 118818228],\n", " [13, u'Philippines', 105720644],\n", " [14, u'Ethiopia', 93877025],\n", " [15, u'Vietnam', 92477857],\n", " [16, u'Egypt', 85294388],\n", " [17, u'Germany', 81147265],\n", " [18, u'Turkey', 80694485],\n", " [19, u'Iran', 79853900],\n", " [20, u'Congo, Democratic Republic of the', 75507308],\n", " [21, u'Thailand', 67497151],\n", " [22, u'France', 65951611],\n", " [23, u'United Kingdom', 63395574],\n", " [24, u'Italy', 61482297],\n", " [25, u'Burma', 55167330],\n", " [26, u'Korea, South', 48955203],\n", " [27, u'South Africa', 48601098],\n", " [28, u'Tanzania', 48261942],\n", " [29, u'Spain', 47370542],\n", " [30, u'Colombia', 45745783],\n", " [31, u'Ukraine', 44573205],\n", " [32, u'Kenya', 44037656],\n", " [33, u'Argentina', 42610981],\n", " [34, u'Poland', 38383809],\n", " [35, u'Algeria', 38087812],\n", " [36, u'Sudan', 34847910],\n", " [37, u'Uganda', 34758809],\n", " [38, u'Canada', 34568211],\n", " [39, u'Morocco', 32649130],\n", " [40, u'Iraq', 31858481],\n", " [41, u'Afghanistan', 31108077],\n", " [42, u'Nepal', 30430267],\n", " [43, u'Peru', 29849303],\n", " [44, u'Malaysia', 29628392],\n", " [45, u'Uzbekistan', 28661637],\n", " [46, u'Venezuela', 28459085],\n", " [47, u'Saudi Arabia', 26939583],\n", " [48, u'Yemen', 25338458],\n", " [49, u'Ghana', 25199609],\n", " [50, u'Korea, North', 24720407],\n", " [51, u'Mozambique', 24096669],\n", " [52, u'Taiwan', 23299716],\n", " [53, u'Madagascar', 22599098],\n", " [54, u'Cameroon', 22534532],\n", " [55, u'Syria', 22457336],\n", " [56, u\"Cote d'Ivoire\", 22400835],\n", " [57, u'Australia', 22262501],\n", " [58, u'Romania', 21790479],\n", " [59, u'Sri Lanka', 21675648],\n", " [60, u'Angola', 18565269],\n", " [61, u'Burkina Faso', 17812961],\n", " [62, u'Kazakhstan', 17736896],\n", " [63, u'Chile', 17216945],\n", " [64, u'Niger', 16899327],\n", " [65, u'Netherlands', 16805037],\n", " [66, u'Malawi', 16777547],\n", " [67, u'Mali', 15968882],\n", " [68, u'Ecuador', 15439429],\n", " [69, u'Cambodia', 15205539],\n", " [70, u'Guatemala', 14373472],\n", " [71, u'Zambia', 14222233],\n", " [72, u'Senegal', 13300410],\n", " [73, u'Zimbabwe', 13182908],\n", " [74, u'Rwanda', 12012589],\n", " [75, u'Chad', 11193452],\n", " [76, u'Guinea', 11176026],\n", " [77, u'South Sudan', 11090104],\n", " [78, u'Cuba', 11061886],\n", " [79, u'Tunisia', 10835873],\n", " [80, u'Portugal', 10799270],\n", " [81, u'Greece', 10772967],\n", " [82, u'Czech Republic', 10609762],\n", " [83, u'Bolivia', 10461053],\n", " [84, u'Belgium', 10444268],\n", " [85, u'Somalia', 10251568],\n", " [86, u'Dominican Republic', 10219630],\n", " [87, u'Hungary', 9939470],\n", " [88, u'Haiti', 9893934],\n", " [89, u'Benin', 9877292],\n", " [90, u'Sweden', 9647386],\n", " [91, u'Belarus', 9625888],\n", " [92, u'Azerbaijan', 9590159],\n", " [93, u'Honduras', 8448465],\n", " [94, u'Austria', 8221646],\n", " [95, u'Switzerland', 7996026],\n", " [96, u'Tajikistan', 7910041],\n", " [97, u'Israel', 7707042],\n", " [98, u'Serbia', 7243007],\n", " [99, u'Togo', 7154237],\n", " [100, u'Hong Kong', 7082316],\n", " [101, u'Bulgaria', 6981642],\n", " [102, u'Laos', 6695166],\n", " [103, u'Paraguay', 6623252],\n", " [104, u'Jordan', 6482081],\n", " [105, u'Papua New Guinea', 6431902],\n", " [106, u'Eritrea', 6233682],\n", " [107, u'El Salvador', 6108590],\n", " [108, u'Libya', 6002347],\n", " [109, u'Nicaragua', 5788531],\n", " [110, u'Sierra Leone', 5612685],\n", " [111, u'Denmark', 5556452],\n", " [112, u'Kyrgyzstan', 5548042],\n", " [113, u'Slovakia', 5488339],\n", " [114, u'United Arab Emirates', 5473972],\n", " [115, u'Singapore', 5460302],\n", " [116, u'Finland', 5266114],\n", " [117, u'Central African Republic', 5166510],\n", " [118, u'Turkmenistan', 5113040],\n", " [119, u'Norway', 5085582],\n", " [120, u'Georgia', 4942157],\n", " [121, u'Ireland', 4775982],\n", " [122, u'Costa Rica', 4695942],\n", " [123, u'Congo, Republic of the', 4574099],\n", " [124, u'Croatia', 4475611],\n", " [125, u'New Zealand', 4365113],\n", " [126, u'Lebanon', 4131583],\n", " [127, u'Liberia', 3989703],\n", " [128, u'Bosnia and Herzegovina', 3875723],\n", " [129, u'Puerto Rico', 3645648],\n", " [130, u'Moldova', 3619925],\n", " [131, u'Panama', 3559408],\n", " [132, u'Lithuania', 3515858],\n", " [133, u'Mauritania', 3437610],\n", " [134, u'Uruguay', 3324460],\n", " [135, u'Oman', 3154134],\n", " [136, u'Armenia', 3064267],\n", " [137, u'Albania', 3011405],\n", " [138, u'Mongolia', 2912192],\n", " [139, u'Jamaica', 2909714],\n", " [140, u'Kuwait', 2695316],\n", " [141, u'West Bank', 2676740],\n", " [142, u'Namibia', 2182852],\n", " [143, u'Latvia', 2178443],\n", " [144, u'Botswana', 2127825],\n", " [145, u'Macedonia', 2087171],\n", " [146, u'Qatar', 2042444],\n", " [147, u'Slovenia', 1992690],\n", " [148, u'Lesotho', 1936181],\n", " [149, u'Gambia, The', 1883051],\n", " [150, u'Kosovo', 1847708],\n", " [151, u'Gaza Strip', 1763387],\n", " [152, u'Guinea-Bissau', 1660870],\n", " [153, u'Gabon', 1640286],\n", " [154, u'Swaziland', 1403362],\n", " [155, u'Mauritius', 1322238],\n", " [156, u'Bahrain', 1281332],\n", " [157, u'Estonia', 1266375],\n", " [158, u'Trinidad and Tobago', 1225225],\n", " [159, u'Timor-Leste', 1172390],\n", " [160, u'Cyprus', 1155403],\n", " [161, u'Burundi', 1060714],\n", " [162, u'Fiji', 896758],\n", " [163, u'Djibouti', 792198],\n", " [164, u'Comoros', 752288],\n", " [165, u'Guyana', 739903],\n", " [166, u'Bhutan', 725296],\n", " [167, u'Equatorial Guinea', 704001],\n", " [168, u'Montenegro', 653474],\n", " [169, u'Solomon Islands', 597248],\n", " [170, u'Macau', 583003],\n", " [171, u'Suriname', 566846],\n", " [172, u'Western Sahara', 538811],\n", " [173, u'Cabo Verde', 531046],\n", " [174, u'Luxembourg', 514862],\n", " [175, u'Brunei', 415717],\n", " [176, u'Malta', 411277],\n", " [177, u'Maldives', 393988],\n", " [178, u'Belize', 334297],\n", " [179, u'Bahamas, The', 319031],\n", " [180, u'Iceland', 315281],\n", " [181, u'Barbados', 288725],\n", " [182, u'French Polynesia', 277293],\n", " [183, u'New Caledonia', 264022],\n", " [184, u'Vanuatu', 261565],\n", " [185, u'Samoa', 195476],\n", " [186, u'Sao Tome and Principe', 186817],\n", " [187, u'Saint Lucia', 162781],\n", " [188, u'Guam', 160378],\n", " [189, u'Curacao', 146836],\n", " [190, u'Grenada', 109590],\n", " [191, u'Aruba', 109153],\n", " [192, u'Tonga', 106322],\n", " [193, u'Micronesia, Federated States of', 106104],\n", " [194, u'Virgin Islands', 104737],\n", " [195, u'Kiribati', 103248],\n", " [196, u'Saint Vincent and the Grenadines', 103220],\n", " [197, u'Jersey', 95732],\n", " [198, u'Seychelles', 90846],\n", " [199, u'Antigua and Barbuda', 90156],\n", " [200, u'Isle of Man', 86159],\n", " [201, u'Andorra', 85293],\n", " [202, u'Dominica', 73286],\n", " [203, u'Marshall Islands', 69747],\n", " [204, u'Bermuda', 69467],\n", " [205, u'Guernsey', 65605],\n", " [206, u'Greenland', 57714],\n", " [207, u'American Samoa', 54719],\n", " [208, u'Cayman Islands', 53737],\n", " [209, u'Northern Mariana Islands', 51170],\n", " [210, u'Saint Kitts and Nevis', 51134],\n", " [211, u'Faroe Islands', 49709],\n", " [212, u'Turks and Caicos Islands', 47754],\n", " [213, u'Sint Maarten', 39689],\n", " [214, u'Liechtenstein', 37009],\n", " [215, u'San Marino', 32448],\n", " [216, u'British Virgin Islands', 31912],\n", " [217, u'Saint Martin', 31264],\n", " [218, u'Monaco', 30500],\n", " [219, u'Gibraltar', 29111],\n", " [220, u'Palau', 21108],\n", " [221, u'Anguilla', 15754],\n", " [222, u'Dhekelia', 15700],\n", " [223, u'Akrotiri', 15700],\n", " [224, u'Wallis and Futuna', 15507],\n", " [225, u'Tuvalu', 10698],\n", " [226, u'Cook Islands', 10447],\n", " [227, u'Nauru', 9434],\n", " [228, u'Saint Helena, Ascension, and Tristan da Cunha', 7754],\n", " [229, u'Saint Barthelemy', 7298],\n", " [230, u'Saint Pierre and Miquelon', 5774],\n", " [231, u'Montserrat', 5189],\n", " [232, u'Falkland Islands (Islas Malvinas)', 3140],\n", " [233, u'Norfolk Island', 2196],\n", " [234, u'Svalbard', 1921],\n", " [235, u'Christmas Island', 1513],\n", " [236, u'Tokelau', 1353],\n", " [237, u'Niue', 1229],\n", " [238, u'Holy See (Vatican City)', 839],\n", " [239, u'Cocos (Keeling) Islands', 596],\n", " [240, u'Pitcairn Islands', 65]]" ] } ], "prompt_number": 41 }, { "cell_type": "code", "collapsed": false, "input": [ "cia_world_pop = sum([r[2] for r in cia_list if r[1] != 'European Union'])\n", "cia_world_pop" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 43, "text": [ "7091218583L" ] } ], "prompt_number": 43 }, { "cell_type": "code", "collapsed": true, "input": [ "cia_world_pop, world_pop, cia_world_pop/float(world_pop)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 44, "text": [ "(7091218583L, 7162119434L, 0.9901005768399478)" ] } ], "prompt_number": 44 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Comparing two lists" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# set of entities for Wikipedia\n", "wk_entities = set([r[1] for r in pop_list])\n", "wk_entities" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 46, "text": [ "{u'Afghanistan',\n", " u'Albania',\n", " u'Algeria',\n", " u'American Samoa',\n", " u'Andorra',\n", " u'Angola',\n", " u'Anguilla',\n", " u'Antigua and Barbuda',\n", " u'Argentina',\n", " u'Armenia',\n", " u'Aruba',\n", " u'Australia',\n", " u'Austria',\n", " u'Azerbaijan',\n", " u'Bahamas',\n", " u'Bahrain',\n", " u'Bangladesh',\n", " u'Barbados',\n", " u'Belarus',\n", " u'Belgium',\n", " u'Belize',\n", " u'Benin',\n", " u'Bermuda',\n", " u'Bhutan',\n", " u'Bolivia',\n", " u'Bosnia and Herzegovina',\n", " u'Botswana',\n", " u'Brazil',\n", " u'Brunei',\n", " u'Bulgaria',\n", " u'Burkina Faso',\n", " u'Burundi',\n", " u'Cambodia',\n", " u'Cameroon',\n", " u'Canada',\n", " u'Cape Verde',\n", " u'Caribbean Netherlands',\n", " u'Cayman Islands',\n", " u'Central African Republic',\n", " u'Chad',\n", " u'Chile',\n", " u'China',\n", " u'Colombia',\n", " u'Comoros',\n", " u'Congo, Democratic Republic of the',\n", " u'Congo, Republic of the',\n", " u'Cook Islands',\n", " u'Costa Rica',\n", " u'Croatia',\n", " u'Cuba',\n", " u'Cura\\xe7ao',\n", " u'Cyprus',\n", " u'Czech Republic',\n", " u\"C\\xf4te d'Ivoire\",\n", " u'Denmark',\n", " u'Djibouti',\n", " u'Dominica',\n", " u'Dominican Republic',\n", " u'Ecuador',\n", " u'Egypt',\n", " u'El Salvador',\n", " u'Equatorial Guinea',\n", " u'Eritrea',\n", " u'Estonia',\n", " u'Ethiopia',\n", " u'Falkland Islands',\n", " u'Faroe Islands',\n", " u'Fiji',\n", " u'Finland',\n", " u'France',\n", " u'French Guiana',\n", " u'French Polynesia',\n", " u'Gabon',\n", " u'Gambia',\n", " u'Georgia',\n", " u'Germany',\n", " u'Ghana',\n", " u'Gibraltar',\n", " u'Greece',\n", " u'Greenland',\n", " u'Grenada',\n", " u'Guadeloupe',\n", " u'Guam',\n", " u'Guatemala',\n", " u'Guernsey; Jersey',\n", " u'Guinea',\n", " u'Guinea-Bissau',\n", " u'Guyana',\n", " u'Haiti',\n", " u'Honduras',\n", " u'Hong Kong',\n", " u'Hungary',\n", " u'Iceland',\n", " u'India',\n", " u'Indonesia',\n", " u'Iran',\n", " u'Iraq',\n", " u'Ireland',\n", " u'Isle of Man',\n", " u'Israel',\n", " u'Italy',\n", " u'Jamaica',\n", " u'Japan',\n", " u'Jordan',\n", " u'Kazakhstan',\n", " u'Kenya',\n", " u'Kiribati',\n", " u'Korea, North',\n", " u'Korea, South',\n", " u'Kuwait',\n", " u'Kyrgyzstan',\n", " u'Laos',\n", " u'Latvia',\n", " u'Lebanon',\n", " u'Lesotho',\n", " u'Liberia',\n", " u'Libya',\n", " u'Liechtenstein',\n", " u'Lithuania',\n", " u'Luxembourg',\n", " u'Macau',\n", " u'Macedonia',\n", " u'Madagascar',\n", " u'Malawi',\n", " u'Malaysia',\n", " u'Maldives',\n", " u'Mali',\n", " u'Malta',\n", " u'Marshall Islands',\n", " u'Martinique',\n", " u'Mauritania',\n", " u'Mauritius',\n", " u'Mayotte',\n", " u'Mexico',\n", " u'Micronesia, Federated States of',\n", " u'Moldova',\n", " u'Monaco',\n", " u'Mongolia',\n", " u'Montenegro',\n", " u'Montserrat',\n", " u'Morocco',\n", " u'Mozambique',\n", " u'Myanmar',\n", " u'Namibia',\n", " u'Nauru',\n", " u'Nepal',\n", " u'Netherlands',\n", " u'New Caledonia',\n", " u'New Zealand',\n", " u'Nicaragua',\n", " u'Niger',\n", " u'Nigeria',\n", " u'Niue',\n", " u'Northern Mariana Islands',\n", " u'Norway',\n", " u'Oman',\n", " u'Pakistan',\n", " u'Palau',\n", " u'Palestine',\n", " u'Panama',\n", " u'Papua New Guinea',\n", " u'Paraguay',\n", " u'Peru',\n", " u'Philippines',\n", " u'Poland',\n", " u'Portugal',\n", " u'Puerto Rico',\n", " u'Qatar',\n", " u'Romania',\n", " u'Russia',\n", " u'Rwanda',\n", " u'R\\xe9union',\n", " u'Saint Helena, Ascension and Tristan da Cunha',\n", " u'Saint Kitts and Nevis',\n", " u'Saint Lucia',\n", " u'Saint Pierre and Miquelon',\n", " u'Saint Vincent and the Grenadines',\n", " u'Samoa',\n", " u'San Marino',\n", " u'Saudi Arabia',\n", " u'Senegal',\n", " u'Serbia; Kosovo',\n", " u'Seychelles',\n", " u'Sierra Leone',\n", " u'Singapore',\n", " u'Sint Maarten',\n", " u'Slovakia',\n", " u'Slovenia',\n", " u'Solomon Islands',\n", " u'Somalia',\n", " u'South Africa',\n", " u'South Sudan',\n", " u'Spain',\n", " u'Sri Lanka',\n", " u'Sudan',\n", " u'Suriname',\n", " u'Swaziland',\n", " u'Sweden',\n", " u'Switzerland',\n", " u'Syria',\n", " u'S\\xe3o Tom\\xe9 and Pr\\xedncipe',\n", " u'Taiwan',\n", " u'Tajikistan',\n", " u'Tanzania',\n", " u'Thailand',\n", " u'Timor-Leste',\n", " u'Togo',\n", " u'Tokelau',\n", " u'Tonga',\n", " u'Trinidad and Tobago',\n", " u'Tunisia',\n", " u'Turkey',\n", " u'Turkmenistan',\n", " u'Turks and Caicos Islands',\n", " u'Tuvalu',\n", " u'Uganda',\n", " u'Ukraine',\n", " u'United Arab Emirates',\n", " u'United Kingdom',\n", " u'United States',\n", " u'Uruguay',\n", " u'Uzbekistan',\n", " u'Vanuatu',\n", " u'Vatican City',\n", " u'Venezuela',\n", " u'Vietnam',\n", " u'Virgin Islands, British',\n", " u'Virgin Islands, United States',\n", " u'Wallis and Futuna',\n", " u'Western Sahara',\n", " u'Yemen',\n", " u'Zambia',\n", " u'Zimbabwe'}" ] } ], "prompt_number": 46 }, { "cell_type": "code", "collapsed": false, "input": [ "cia_entities = set([r[1] for r in cia_list])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 47 }, { "cell_type": "code", "collapsed": false, "input": [ "len(wk_entities), len(cia_entities)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 48, "text": [ "(233, 240)" ] } ], "prompt_number": 48 }, { "cell_type": "code", "collapsed": false, "input": [ "# http://docs.python.org/2/library/stdtypes.html#set\n", "# intersection\n", "len(wk_entities & cia_entities)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 49, "text": [ "212" ] } ], "prompt_number": 49 }, { "cell_type": "code", "collapsed": false, "input": [ "# symmetric diff\n", "wk_entities ^ cia_entities" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 37, "text": [ "{u'Akrotiri',\n", " u'Bahamas',\n", " u'Bahamas, The',\n", " u'British Virgin Islands',\n", " u'Burma',\n", " u'Cabo Verde',\n", " u'Cape Verde',\n", " u'Caribbean Netherlands',\n", " u'Christmas Island',\n", " u'Cocos (Keeling) Islands',\n", " u\"Cote d'Ivoire\",\n", " u'Curacao',\n", " u'Cura\\xe7ao',\n", " u\"C\\xf4te d'Ivoire\",\n", " u'Dhekelia',\n", " u'European Union',\n", " u'Falkland Islands',\n", " u'Falkland Islands (Islas Malvinas)',\n", " u'French Guiana',\n", " u'Gambia',\n", " u'Gambia, The',\n", " u'Gaza Strip',\n", " u'Guadeloupe',\n", " u'Guernsey',\n", " u'Guernsey; Jersey',\n", " u'Holy See (Vatican City)',\n", " u'Jersey',\n", " u'Kosovo',\n", " u'Martinique',\n", " u'Mayotte',\n", " u'Myanmar',\n", " u'Norfolk Island',\n", " u'Palestine',\n", " u'Pitcairn Islands',\n", " u'R\\xe9union',\n", " u'Saint Barthelemy',\n", " u'Saint Helena, Ascension and Tristan da Cunha',\n", " u'Saint Helena, Ascension, and Tristan da Cunha',\n", " u'Saint Martin',\n", " u'Sao Tome and Principe',\n", " u'Serbia',\n", " u'Serbia; Kosovo',\n", " u'Svalbard',\n", " u'S\\xe3o Tom\\xe9 and Pr\\xedncipe',\n", " u'Vatican City',\n", " u'Virgin Islands',\n", " u'Virgin Islands, British',\n", " u'Virgin Islands, United States',\n", " u'West Bank'}" ] } ], "prompt_number": 37 }, { "cell_type": "code", "collapsed": false, "input": [ "wk_entities - cia_entities" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 38, "text": [ "{u'Bahamas',\n", " u'Cape Verde',\n", " u'Caribbean Netherlands',\n", " u'Cura\\xe7ao',\n", " u\"C\\xf4te d'Ivoire\",\n", " u'Falkland Islands',\n", " u'French Guiana',\n", " u'Gambia',\n", " u'Guadeloupe',\n", " u'Guernsey; Jersey',\n", " u'Martinique',\n", " u'Mayotte',\n", " u'Myanmar',\n", " u'Palestine',\n", " u'R\\xe9union',\n", " u'Saint Helena, Ascension and Tristan da Cunha',\n", " u'Serbia; Kosovo',\n", " u'S\\xe3o Tom\\xe9 and Pr\\xedncipe',\n", " u'Vatican City',\n", " u'Virgin Islands, British',\n", " u'Virgin Islands, United States'}" ] } ], "prompt_number": 38 }, { "cell_type": "code", "collapsed": false, "input": [ "cia_entities - wk_entities" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 39, "text": [ "{u'Akrotiri',\n", " u'Bahamas, The',\n", " u'British Virgin Islands',\n", " u'Burma',\n", " u'Cabo Verde',\n", " u'Christmas Island',\n", " u'Cocos (Keeling) Islands',\n", " u\"Cote d'Ivoire\",\n", " u'Curacao',\n", " u'Dhekelia',\n", " u'European Union',\n", " u'Falkland Islands (Islas Malvinas)',\n", " u'Gambia, The',\n", " u'Gaza Strip',\n", " u'Guernsey',\n", " u'Holy See (Vatican City)',\n", " u'Jersey',\n", " u'Kosovo',\n", " u'Norfolk Island',\n", " u'Pitcairn Islands',\n", " u'Saint Barthelemy',\n", " u'Saint Helena, Ascension, and Tristan da Cunha',\n", " u'Saint Martin',\n", " u'Sao Tome and Principe',\n", " u'Serbia',\n", " u'Svalbard',\n", " u'Virgin Islands',\n", " u'West Bank'}" ] } ], "prompt_number": 39 }, { "cell_type": "code", "collapsed": false, "input": [ "len(wk_entities or cia_entities)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 34, "text": [ "233" ] } ], "prompt_number": 34 } ], "metadata": {} } ] }