{ "metadata": { "name": "", "signature": "sha256:9c21337d8433c1079c4616e48ad732e962a7fe5736c183f351f93ee7713239d2" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy\n", "import json\n", "from collections import defaultdict\n", "import scipy.stats\n", "import math\n", "import pywikibot\n", "from matplotlib.pylab import style\n", "style.use('fivethirtyeight')\n", "\n", "%pylab inline\n", "java_min_int = -2147483648" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Starting 1 threads...\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] } ], "prompt_number": 1 },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Read the records file; the with-block closes the file handle once it has been parsed.\n",
 "with open('helpers/world_cultures_shortcut.json','r') as shortcut_file:\n",
 "    allrecs = pd.DataFrame.from_dict(json.load(shortcut_file))"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Transforming QIDs into English labels.\n",
 "enwp = pywikibot.Site('en','wikipedia')\n",
 "wikidata = enwp.data_repository()\n",
 "\n",
 "# Cache of qid -> label filled in by english_label, so each item is requested at most once.\n",
 "retrieved = dict()\n",
 "\n",
 "def english_label(qid):\n",
 "    '''Return the English label of the Wikidata item `qid`.\n",
 "\n",
 "    qid: item identifier handed to pywikibot.ItemPage, or a missing value.\n",
 "\n",
 "    Returns None when qid is None or a float NaN. Otherwise returns the item's\n",
 "    'en' label, or qid itself when the item does not exist or has no 'en'\n",
 "    label. Every answer, the fallback included, is stored in `retrieved` and\n",
 "    reused on later calls.\n",
 "    '''\n",
 "    # isinstance (rather than an exact type test) also matches float subclasses\n",
 "    # such as numpy.float64, so their NaN is treated as missing here as well.\n",
 "    if qid is None or (isinstance(qid, float) and math.isnan(qid)):\n",
 "        return None\n",
 "    # First see if we've done it.\n",
 "    if qid in retrieved:\n",
 "        return retrieved[qid]\n",
 "    try:\n",
 "        lab = pywikibot.ItemPage(wikidata, qid).get()['labels']['en']\n",
 "    except (KeyError, pywikibot.exceptions.NoPage):\n",
 "        # No such item, or no English label on it: fall back to the raw qid.\n",
 "        lab = qid\n",
 "    retrieved[qid] = lab\n",
 "    return lab"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 },
{ "cell_type": "code", "collapsed": false,
"input": [ "allrecs['citname'] = allrecs['citizenship'].apply(english_label)\n", "allrecs['countryname'] = allrecs['country'].apply(english_label)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stderr", "text": [ "VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.\n" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "wikidatanames = set(allrecs['citname']).union(set(allrecs['countryname']))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "def normname(name):\n", " name = name.replace('*','')\n", " try:\n", " return {'Iran, Islamic Rep.': 'Iran',\n", " 'Korea, Rep.':'South Korea',\n", " 'Brunei Darussalam': 'Brunei',\n", " 'United States':'United States of America',\n", " 'Slovak Republic':'Slovakia',\n", " 'China':\"People's Republic of China\",\n", " 'People\u2019s Republic of China':\"People's Republic of China\",\n", " 'Kyrgyz Republic': 'Kyrgyzstan',\n", " 'Russian Federation': 'Russia',\n", " 'Macedonia, FYR': 'Republic of Macedonia',\n", " 'Lao PDR':'Laos',\n", " 'Bahamas':'The Bahamas',\n", " u'C\\xf4te d\\u2019Ivoire':u\"C\\xf4te d'Ivoire\",\n", " 'C\u00f4te d\u2019Ivoire':u\"C\\xf4te d'Ivoire\",\n", " 'Plu. St.. 
of Bolivia':'Bolivia',\n", " 'Viet Nam':'Vietnam',\n", " 'Myanmar':'Burma',\n", " 'Former Yugoslav Republic of Macedonia':'Macedonia',\n", " 'Lao People\u2019s Democratic Republic':'Laos',\n", " 'Bolivarian Republic of Venezuela':'Venezuela',\n", " 'Republic of Moldova':'Moldova',\n", " 'Central African Rep.':'Central African Republic',\n", " 'Syrian Arab Republic':'Syria',\n", " 'Republic of Tanzania':'Tanzania',\n", " 'Palestine, State of':'Palestine',\n", " 'Moldova (Republic of)':'Moldova',\n", " 'Sao Tome and Principe': u'Sao Tom\\xe9 and Pr\\xedncipe',\n", " \"Lao People's Democratic Republic\":'Laos',\n", " 'Venezuela (Bolivarian Republic of)':'Venezuela',\n", " 'The former Yugoslav Republic of Macedonia':'Macedonia',\n", " 'Iran (Islamic Republic of)':'Iran',\n", " 'Congo (Democratic Republic of the)': u'Democratic Republic of the Congo',\n", " 'Congo':u'Republic of the Congo',\n", " 'Tanzania (United Republic of)':'Tanzania',\n", " 'Hong Kong, China (SAR)':\"People's Republic of China\",\n", " 'Russian Federation':'Russia',\n", " 'Korea (Republic of)':'South Korea',\n", " 'Bolivia (Plurinational State of)':'Bolivia'}[name]\n", " except KeyError:\n", " return name" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "wef = pd.io.html.read_html('http://reports.weforum.org/global-gender-gap-report-2014/rankings/')[0]\n", "wef['Economy'] = wef['Economy'].apply(normname)\n", "wefnames = set(wef['Economy'])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "geidirty = pd.io.html.read_html('http://www.socialwatch.org/node/14367')[2]\n", "gei = geidirty.iloc[3:,6:8]\n", "gei.columns = ['Economy', 'Score']\n", "gei = gei.dropna()\n", "gei[\"Rank\"] = gei['Score'].rank(ascending=False).apply(lambda x: int(x))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 }, { "cell_type": 
"code", "collapsed": false, "input": [ "def country_sigi_extract(text_line):\n", " '''put the first strings together as name and the first float as the sigi value'''\n", " economy = ''\n", " sigi_val = float()\n", " for w in text_line.split(' '):\n", " try:\n", " sigi_val = float(w)\n", " break\n", " except ValueError:\n", " if economy:\n", " economy += ' ' + w\n", " else:\n", " economy = w\n", " return economy, sigi_val\n", "\n", "ec_sigi = dict([country_sigi_extract(text_line) for text_line in sigipdftext.split('\\n')] )\n", "\n", "sigi = pd.DataFrame.from_dict(ec_sigi, orient='index')\n", "sigi['Economy'] = sigi.index\n", "sigi['Economy'] = sigi['Economy'].apply(normname)\n", "sigi['Score'] = 1-sigi[0]\n", "sigi[\"Rank\"] = sigi['Score'].rank(ascending=False).apply(lambda x: int(x))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "gdidirty = pd.DataFrame.from_csv('helpers/foreign_indexes/Table_5__Gender-related_development_index.csv')\n", "nar = gdidirty.iloc[1:,:3]\n", "nar.columns = ['Economy', 'Score', 'Rank']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "gdi = nar[(nar['Score'] != '..') & (nar['Rank'] != '\u2014') ]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "gdi['Score'] = gdi['Score'].apply(lambda x: float(x))\n", "gdi['Rank'] = gdi['Rank'].apply(lambda x: int(x))\n", "gdi.sort('Score')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: -c:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the the caveats in the documentation: 
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", "\n", "WARNING:py.warnings:-c:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", "\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: -c:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", "\n", "WARNING:py.warnings:-c:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", "\n" ] }, { "html": [ "
\n", " | Economy | \n", "Score | \n", "Rank | \n", "
---|---|---|---|
HDI ranks | \n", "\n", " | \n", " | \n", " |
169 | \n", "Afghanistan | \n", "0.602 | \n", "148 | \n", "
187 | \n", "Niger | \n", "0.714 | \n", "147 | \n", "
154 | \n", "Yemen | \n", "0.738 | \n", "146 | \n", "
146 | \n", "Pakistan | \n", "0.750 | \n", "145 | \n", "
184 | \n", "Chad | \n", "0.762 | \n", "144 | \n", "
176 | \n", "Mali | \n", "0.771 | \n", "143 | \n", "
185 | \n", "Central African Republic | \n", "0.776 | \n", "142 | \n", "
179 | \n", "Guinea | \n", "0.785 | \n", "141 | \n", "
175 | \n", "Liberia | \n", "0.786 | \n", "140 | \n", "
183 | \n", "Sierra Leone | \n", "0.799 | \n", "139 | \n", "
161 | \n", "Mauritania | \n", "0.801 | \n", "138 | \n", "
120 | \n", "Iraq | \n", "0.802 | \n", "137 | \n", "
166 | \n", "Togo | \n", "0.803 | \n", "136 | \n", "
165 | \n", "Benin | \n", "0.822 | \n", "134 | \n", "
186 | \n", "Congo (Democratic Republic of the) | \n", "0.822 | \n", "134 | \n", "
129 | \n", "Morocco | \n", "0.828 | \n", "132 | \n", "
135 | \n", "India | \n", "0.828 | \n", "132 | \n", "
152 | \n", "Nigeria | \n", "0.839 | \n", "131 | \n", "
77 | \n", "Jordan | \n", "0.842 | \n", "130 | \n", "
93 | \n", "Algeria | \n", "0.843 | \n", "129 | \n", "
75 | \n", "Iran (Islamic Republic of) | \n", "0.847 | \n", "128 | \n", "
118 | \n", "Syrian Arab Republic | \n", "0.851 | \n", "127 | \n", "
173 | \n", "Ethiopia | \n", "0.853 | \n", "126 | \n", "
110 | \n", "Egypt | \n", "0.855 | \n", "125 | \n", "
163 | \n", "Senegal | \n", "0.864 | \n", "124 | \n", "
152 | \n", "Cameroon | \n", "0.872 | \n", "123 | \n", "
128 | \n", "Timor-Leste | \n", "0.875 | \n", "122 | \n", "
148 | \n", "Swaziland | \n", "0.877 | \n", "121 | \n", "
178 | \n", "Mozambique | \n", "0.879 | \n", "120 | \n", "
138 | \n", "Ghana | \n", "0.884 | \n", "118 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
117 | \n", "Philippines | \n", "0.989 | \n", "17 | \n", "
10 | \n", "Denmark | \n", "0.989 | \n", "17 | \n", "
20 | \n", "France | \n", "0.989 | \n", "17 | \n", "
96 | \n", "Jamaica | \n", "0.989 | \n", "17 | \n", "
89 | \n", "Thailand | \n", "0.990 | \n", "14 | \n", "
114 | \n", "Moldova (Republic of) | \n", "0.990 | \n", "14 | \n", "
14 | \n", "United Kingdom | \n", "0.993 | \n", "13 | \n", "
87 | \n", "Armenia | \n", "0.994 | \n", "8 | \n", "
64 | \n", "Trinidad and Tobago | \n", "0.994 | \n", "8 | \n", "
58 | \n", "Bulgaria | \n", "0.994 | \n", "8 | \n", "
5 | \n", "United States | \n", "0.995 | \n", "7 | \n", "
1 | \n", "Norway | \n", "0.997 | \n", "5 | \n", "
43 | \n", "Hungary | \n", "0.998 | \n", "4 | \n", "
67 | \n", "Venezuela (Bolivarian Republic of) | \n", "0.999 | \n", "2 | \n", "
37 | \n", "Slovakia | \n", "1.000 | \n", "1 | \n", "
49 | \n", "Argentina | \n", "1.001 | \n", "2 | \n", "
12 | \n", "Sweden | \n", "1.004 | \n", "6 | \n", "
24 | \n", "Finland | \n", "1.006 | \n", "8 | \n", "
25 | \n", "Slovenia | \n", "1.006 | \n", "8 | \n", "
35 | \n", "Poland | \n", "1.010 | \n", "14 | \n", "
83 | \n", "Ukraine | \n", "1.012 | \n", "21 | \n", "
50 | \n", "Uruguay | \n", "1.015 | \n", "25 | \n", "
70 | \n", "Kazakhstan | \n", "1.015 | \n", "25 | \n", "
53 | \n", "Belarus | \n", "1.021 | \n", "32 | \n", "
59 | \n", "Barbados | \n", "1.021 | \n", "32 | \n", "
103 | \n", "Mongolia | \n", "1.021 | \n", "32 | \n", "
48 | \n", "Latvia | \n", "1.033 | \n", "52 | \n", "
35 | \n", "Lithuania | \n", "1.036 | \n", "58 | \n", "
57 | \n", "Russian Federation | \n", "1.038 | \n", "61 | \n", "
33 | \n", "Estonia | \n", "1.042 | \n", "70 | \n", "
148 rows \u00d7 3 columns
\n", "\n", " | start_year | \n", "bios_count | \n", "spearman | \n", "spearman_p | \n", "mannwhitneyu | \n", "mannwhitneyu_p | \n", "ranksum | \n", "ranksum_p | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "1000 | \n", "887006 | \n", "0.263995 | \n", "0.001501 | \n", "6195.0 | \n", "9.762673e-09 | \n", "5.616870 | \n", "1.944474e-08 | \n", "
1 | \n", "1100 | \n", "886514 | \n", "0.265131 | \n", "0.001429 | \n", "6193.0 | \n", "9.600785e-09 | \n", "5.619760 | \n", "1.912226e-08 | \n", "
2 | \n", "1200 | \n", "885697 | \n", "0.265446 | \n", "0.001410 | \n", "6187.0 | \n", "9.130603e-09 | \n", "5.628431 | \n", "1.818567e-08 | \n", "
3 | \n", "1300 | \n", "884571 | \n", "0.265739 | \n", "0.001392 | \n", "6200.0 | \n", "1.017908e-08 | \n", "5.609645 | \n", "2.027420e-08 | \n", "
4 | \n", "1400 | \n", "883044 | \n", "0.263195 | \n", "0.001553 | \n", "6197.0 | \n", "9.927211e-09 | \n", "5.613980 | \n", "1.977249e-08 | \n", "
5 | \n", "1500 | \n", "879276 | \n", "0.262088 | \n", "0.001628 | \n", "6206.0 | \n", "1.070155e-08 | \n", "5.600975 | \n", "2.131498e-08 | \n", "
6 | \n", "1600 | \n", "870495 | \n", "0.265848 | \n", "0.001385 | \n", "6230.0 | \n", "1.306420e-08 | \n", "5.566294 | \n", "2.602143e-08 | \n", "
7 | \n", "1700 | \n", "857099 | \n", "0.265899 | \n", "0.001382 | \n", "6252.0 | \n", "1.566935e-08 | \n", "5.534503 | \n", "3.121122e-08 | \n", "
8 | \n", "1800 | \n", "815661 | \n", "0.270712 | \n", "0.001120 | \n", "6421.0 | \n", "6.130025e-08 | \n", "5.290291 | \n", "1.221218e-07 | \n", "
9 | \n", "1810 | \n", "805811 | \n", "0.276275 | \n", "0.000874 | \n", "6483.0 | \n", "9.965155e-08 | \n", "5.200699 | \n", "1.985407e-07 | \n", "
10 | \n", "1820 | \n", "794371 | \n", "0.277217 | \n", "0.000838 | \n", "6564.0 | \n", "1.858238e-07 | \n", "5.083650 | \n", "3.702490e-07 | \n", "
11 | \n", "1830 | \n", "781921 | \n", "0.276718 | \n", "0.000857 | \n", "6621.0 | \n", "2.858085e-07 | \n", "5.001283 | \n", "5.694998e-07 | \n", "
12 | \n", "1840 | \n", "768581 | \n", "0.282408 | \n", "0.000661 | \n", "6689.0 | \n", "4.735930e-07 | \n", "4.903020 | \n", "9.437414e-07 | \n", "
13 | \n", "1850 | \n", "753693 | \n", "0.290256 | \n", "0.000458 | \n", "6753.0 | \n", "7.553225e-07 | \n", "4.810538 | \n", "1.505246e-06 | \n", "
14 | \n", "1860 | \n", "736868 | \n", "0.295216 | \n", "0.000362 | \n", "6834.0 | \n", "1.347622e-06 | \n", "4.693490 | \n", "2.685835e-06 | \n", "
15 | \n", "1870 | \n", "716540 | \n", "0.292414 | \n", "0.000414 | \n", "6940.0 | \n", "2.818059e-06 | \n", "4.540315 | \n", "5.617012e-06 | \n", "
16 | \n", "1880 | \n", "691692 | \n", "0.298912 | \n", "0.000302 | \n", "7110.0 | \n", "8.776426e-06 | \n", "4.294659 | \n", "1.749623e-05 | \n", "
17 | \n", "1890 | \n", "660609 | \n", "0.302099 | \n", "0.000258 | \n", "7285.0 | \n", "2.660525e-05 | \n", "4.041777 | \n", "5.304774e-05 | \n", "
18 | \n", "1900 | \n", "623915 | \n", "0.305469 | \n", "0.000218 | \n", "7331.0 | \n", "3.525074e-05 | \n", "3.975305 | \n", "7.028916e-05 | \n", "
19 | \n", "1910 | \n", "579592 | \n", "0.312051 | \n", "0.000157 | \n", "7367.0 | \n", "4.380497e-05 | \n", "3.923283 | \n", "8.735029e-05 | \n", "
20 | \n", "1920 | \n", "534223 | \n", "0.292366 | \n", "0.000415 | \n", "7516.5 | \n", "1.050574e-04 | \n", "3.707250 | \n", "2.095221e-04 | \n", "
21 | \n", "1930 | \n", "472919 | \n", "0.292359 | \n", "0.000415 | \n", "7953.0 | \n", "1.049781e-03 | \n", "3.076490 | \n", "2.094533e-03 | \n", "
22 | \n", "1940 | \n", "410994 | \n", "0.273103 | \n", "0.001008 | \n", "7938.0 | \n", "9.759586e-04 | \n", "3.098166 | \n", "1.947226e-03 | \n", "
23 | \n", "1950 | \n", "331071 | \n", "0.257789 | \n", "0.001953 | \n", "7283.0 | \n", "2.627829e-05 | \n", "4.044667 | \n", "5.239760e-05 | \n", "
24 | \n", "1960 | \n", "248180 | \n", "0.183820 | \n", "0.028538 | \n", "7737.0 | \n", "3.521279e-04 | \n", "3.388619 | \n", "7.024563e-04 | \n", "
25 | \n", "1970 | \n", "165324 | \n", "0.138381 | \n", "0.100519 | \n", "6806.0 | \n", "1.104553e-06 | \n", "4.733951 | \n", "2.201911e-06 | \n", "