{ "metadata": { "name": "", "signature": "sha256:b0764b62e773abee4299763c339d0fffa49a622ecdff83f552a79c38e62002df" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import pandas\n", "import math\n", "import datetime\n", "\n", "%pylab inline\n", "\n", "java_min_int = -2147483648" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "wikidata_df = pandas.read_csv('data/gender-index-data-2014-09-17.csv', \n", " na_values=[java_min_int])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "def split_column(q_str):\n", "    '''Split a pipe-delimited cell value into a list of its items.\n", "\n", "    Values in the export end every item with '|' (e.g. 'Q298|Q39|'), so the\n", "    empty piece left after the final delimiter is dropped. Any non-string\n", "    value, such as the NaN pandas uses for a missing cell, is wrapped in a\n", "    one-element list so that every cell of the resulting column holds a list.\n", "    '''\n", "    if not isinstance(q_str, str):\n", "        return [q_str] # never fall through to None for missing or numeric cells\n", "    qs = q_str.split('|')\n", "    if qs[-1] == '':\n", "        qs.pop() # drop only the empty tail, so an unterminated value keeps its last item\n", "    return qs\n", "\n", "for column in ['gender', 'ethnic_group', 'citizenship', 'place_of_birth', 'site_links']:\n", "    # Appending 's' yields 'place_of_births' and 'site_linkss'; those names are kept as they are.\n", "    column_plural = column+'s'\n", "    wikidata_df[column_plural] = wikidata_df[column].apply(split_column)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "wikidata_df.head(5)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", " | qid | \n", "dob | \n", "dod | \n", "gender | \n", "ethnic_group | \n", "citizenship | \n", "place_of_birth | \n", "site_links | \n", "genders | \n", "ethnic_groups | \n", "citizenships | \n", "place_of_births | \n", "site_linkss | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "Q23 | \n", "1732 | \n", "1799 | \n", "Q6581097| | \n", "NaN | \n", "Q30| | \n", "Q494413| | \n", "zhwiki|kywiki|euwiki|plwiki|bswiki|angwiki|uzw... | \n", "[Q6581097] | \n", "[nan] | \n", "[Q30] | \n", "[Q494413] | \n", "[zhwiki, kywiki, euwiki, plwiki, bswiki, angwi... | \n", "
1 | \n", "Q42 | \n", "1952 | \n", "2001 | \n", "Q6581097| | \n", "NaN | \n", "Q145| | \n", "Q350| | \n", "zhwiki|jvwiki|euwiki|plwiki|bswiki|eswiki|tawi... | \n", "[Q6581097] | \n", "[nan] | \n", "[Q145] | \n", "[Q350] | \n", "[zhwiki, jvwiki, euwiki, plwiki, bswiki, eswik... | \n", "
2 | \n", "Q207 | \n", "1946 | \n", "NaN | \n", "Q6581097| | \n", "NaN | \n", "Q30| | \n", "Q49145| | \n", "uzwiki|eswiki|kowikiquote|huwiki|liwikiquote|p... | \n", "[Q6581097] | \n", "[nan] | \n", "[Q30] | \n", "[Q49145] | \n", "[uzwiki, eswiki, kowikiquote, huwiki, liwikiqu... | \n", "
3 | \n", "Q297 | \n", "NaN | \n", "1660 | \n", "Q6581097| | \n", "NaN | \n", "Q29| | \n", "Q8717| | \n", "zhwiki|kywiki|plwiki|euwiki|bswiki|uzwiki|eswi... | \n", "[Q6581097] | \n", "[nan] | \n", "[Q29] | \n", "[Q8717] | \n", "[zhwiki, kywiki, plwiki, euwiki, bswiki, uzwik... | \n", "
4 | \n", "Q326 | \n", "1942 | \n", "NaN | \n", "Q6581097| | \n", "NaN | \n", "Q298|Q39| | \n", "Q2887| | \n", "zhwiki|plwiki|euwiki|kowiki|frwiki|eswiki|yowi... | \n", "[Q6581097] | \n", "[nan] | \n", "[Q298, Q39] | \n", "[Q2887] | \n", "[zhwiki, plwiki, euwiki, kowiki, frwiki, eswik... | \n", "