{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## importing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import codecs\n",
    "import json\n",
    "import os, sys\n",
    "\n",
    "from collections import defaultdict\n",
    "from multiprocessing import Pool as ThreadPool\n",
    "\n",
    "from IPython.display import display, HTML\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "sys.path.append(os.path.abspath('../../WKP-python-toolkit'))\n",
    "import wekeypedia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# shared dict handed to every count_stems call\n",
    "# (presumably filled with the inflections met - confirm in wekeypedia)\n",
    "inflections = defaultdict(dict)\n",
    "\n",
    "# NOTE(review): not read by any function defined in this cell - confirm before removing\n",
    "ignore_list = \"{}()[]<>./,;\\\"':!?&#=*&%\"\n",
    "\n",
    "def from_file(name):\n",
    "    \"\"\"Return the JSON content of `name`, read as UTF-8 with an optional BOM.\"\"\"\n",
    "    with codecs.open(name, \"r\", encoding=\"utf-8-sig\") as f:\n",
    "        return json.load(f)\n",
    "\n",
    "def list_revisions(page):\n",
    "    \"\"\"Return the entries of the data/<page> directory.\"\"\"\n",
    "    return os.listdir(\"data/%s\" % (page))\n",
    "\n",
    "def revision_stems(revision_filename):\n",
    "    \"\"\"Count the stems added and deleted by one revision file.\n",
    "\n",
    "    Returns { \"added\": ..., \"deleted\": ... }; both are empty dicts when\n",
    "    the file holds no diff.\n",
    "    \"\"\"\n",
    "    p = wekeypedia.WikipediaPage()\n",
    "    rev = from_file(revision_filename)\n",
    "\n",
    "    # extract diff text\n",
    "    diff = rev[\"diff\"][\"*\"]\n",
    "\n",
    "    # bug with Ethics#462124891: the diff is False there, not text.\n",
    "    # `not diff` also skips any other empty diff.\n",
    "    if not diff:\n",
    "        return { \"added\": {}, \"deleted\": {} }\n",
    "\n",
    "    diff = p.extract_plusminus(diff)\n",
    "\n",
    "    # count stems by added/deleted\n",
    "    return {\n",
    "        \"added\": p.count_stems(diff[\"added\"], inflections),\n",
    "        \"deleted\": p.count_stems(diff[\"deleted\"], inflections) }\n",
    "\n",
    "def source_stems(s):\n",
    "    \"\"\"Total the added/deleted stem counts over every revision of page `s`.\n",
    "\n",
    "    Reads each file listed for data/<s>, prints a progress line per\n",
    "    revision and returns { \"added\": {stem: count}, \"deleted\": {stem: count} }.\n",
    "    \"\"\"\n",
    "    revisions = list_revisions(s)\n",
    "    total = len(revisions)\n",
    "\n",
    "    # int factory: a stem met for the first time starts counting from 0\n",
    "    result = {\n",
    "        \"added\": defaultdict(int),\n",
    "        \"deleted\": defaultdict(int) }\n",
    "\n",
    "    print \"%s: %s revisions\" % (s, total)\n",
    "\n",
    "    for i, r in enumerate(revisions, 1):\n",
    "        # trailing comma + leading \\r: the progress line is rewritten in place\n",
    "        print \"\\rrevisions: %s (%s/%s)\" % (r, i, total,),\n",
    "        stems = revision_stems(\"data/%s/%s\" % (s, r))\n",
    "\n",
    "        for x in [\"added\", \"deleted\"]:\n",
    "            for stem, count in stems[x].iteritems():\n",
    "                result[x][stem] += count\n",
    "\n",
    "    print \"\\r \",\n",
    "    return result\n",
    "\n",
    "def to_df(a):\n",
    "    \"\"\"Turn the dict returned by source_stems into a DataFrame.\n",
    "\n",
    "    The frame is indexed by stem and has the columns 'added' and 'deleted';\n",
    "    a stem missing from one side holds NaN in that column.\n",
    "    \"\"\"\n",
    "    # one Series per side: the frame aligns them on the union of their stems\n",
    "    # (same rows as an outer join) and an empty side no longer raises\n",
    "    return pd.DataFrame(\n",
    "        { \"added\": pd.Series(a[\"added\"]), \"deleted\": pd.Series(a[\"deleted\"]) },\n",
    "        columns=[ \"added\", \"deleted\" ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def clean_and_compute(df):\n",
    "    \"\"\"Drop markup/stop-word stems and rank the rest by net change.\n",
    "\n",
    "    Adds the columns \"added - deleted\" and \"abs(added - deleted)\" and\n",
    "    returns the frame sorted on the absolute difference, then on \"added\",\n",
    "    both descending.\n",
    "    \"\"\"\n",
    "    noise_terms = [ \"a\", \"of\", \"and\", \"to\", \"the\", \"is\", \"for\", \"or\" , \"in\", \"that\", \"it\", \"|\", \"ref\",\n",
    "        \"http\", \"''\", \"``\", \"s\", \"an\", \"-\", \"=\", \"*\", \"==\", \"===\", \"====\", \"name=\", \"nbsp\", \"style=\", \"5px\",\n",
    "        \"font-siz\", \"|-\", \"--\", \"wikiquot\", \"/ref\", \"'s\" ]\n",
    "\n",
    "    df = df.drop([ w for w in noise_terms if w in df.index ])\n",
    "\n",
    "    # the outer alignment in to_df leaves NaN where a stem shows up on one\n",
    "    # side only; count those as 0 so the difference stays a number\n",
    "    df = df.fillna(0)\n",
    "\n",
    "    df[\"added - deleted\"] = df[\"added\"] - df[\"deleted\"]\n",
    "    df[\"abs(added - deleted)\"] = df[\"added - deleted\"].abs()\n",
    "\n",
    "    # DataFrame.sort was replaced by sort_values in later pandas releases;\n",
    "    # use whichever this installation provides\n",
    "    sort = getattr(df, \"sort_values\", None) or df.sort\n",
    "\n",
    "    return sort([\"abs(added - deleted)\", \"added\"], ascending=[False, False])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Love: 6324 revisions\n",
      " \n"
     ]
    }
   ],
   "source": [
    "love = source_stems(\"Love\")\n",
    "love = to_df(love)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
addeddeletedadded - deletedabs(added - deleted)
love4131540990325325
be588658315555
god407440245050
with423741934444
by414741044343
one316931353434
cite5565223434
from203920073232
thi350934783131
which335733263131
not337633463030
on308330552828
are355835312727
other295929342525
use267926542525
also162015952525
human202520012424
word267926562323
romant236423412323
term173117082323
\n", "
" ], "text/plain": [ " added deleted added - deleted abs(added - deleted)\n", "love 41315 40990 325 325\n", "be 5886 5831 55 55\n", "god 4074 4024 50 50\n", "with 4237 4193 44 44\n", "by 4147 4104 43 43\n", "one 3169 3135 34 34\n", "cite 556 522 34 34\n", "from 2039 2007 32 32\n", "thi 3509 3478 31 31\n", "which 3357 3326 31 31\n", "not 3376 3346 30 30\n", "on 3083 3055 28 28\n", "are 3558 3531 27 27\n", "other 2959 2934 25 25\n", "use 2679 2654 25 25\n", "also 1620 1595 25 25\n", "human 2025 2001 24 24\n", "word 2679 2656 23 23\n", "romant 2364 2341 23 23\n", "term 1731 1708 23 23" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "love = clean_and_compute(love)\n", "love.head(20)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wisdom: 1634 revisions\n", " \n" ] } ], "source": [ "wisdom = source_stems(\"Wisdom\")\n", "wisdom = to_df(wisdom)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
addeddeletedadded - deletedabs(added - deleted)
wisdom31403025115115
with109310642929
be6996752424
he5064842222
cite1381162222
one6456242121
wise6195992020
from5084882020
by4944751919
which2522341818
not4163991717
are4214051616
person3082921616
virtu3943791515
knowledg8087941414
who3042901414
god2702561414
proverb97831414
thi5024891313
other3753621313
\n", "
" ], "text/plain": [ " added deleted added - deleted abs(added - deleted)\n", "wisdom 3140 3025 115 115\n", "with 1093 1064 29 29\n", "be 699 675 24 24\n", "he 506 484 22 22\n", "cite 138 116 22 22\n", "one 645 624 21 21\n", "wise 619 599 20 20\n", "from 508 488 20 20\n", "by 494 475 19 19\n", "which 252 234 18 18\n", "not 416 399 17 17\n", "are 421 405 16 16\n", "person 308 292 16 16\n", "virtu 394 379 15 15\n", "knowledg 808 794 14 14\n", "who 304 290 14 14\n", "god 270 256 14 14\n", "proverb 97 83 14 14\n", "thi 502 489 13 13\n", "other 375 362 13 13" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wisdom = clean_and_compute(wisdom)\n", "wisdom.head(20)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Morality: 2776 revisions\n", " \n" ] } ], "source": [ "morality = source_stems(\"Morality\")\n", "morality = to_df(morality)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
addeddeletedadded - deletedabs(added - deleted)
moral75957421174174
cite7286656363
journal6205655555
on157715304747
be193418983636
with134713133434
religion9569233333
are166116293232
by109710663131
ethic135613263030
from110610782828
thi10059782727
cultur9238972626
other7807562424
year2402162424
have8248012323
behavior7767532323
studi6025792323
page1971742323
theori5715492222
\n", "
" ], "text/plain": [ " added deleted added - deleted abs(added - deleted)\n", "moral 7595 7421 174 174\n", "cite 728 665 63 63\n", "journal 620 565 55 55\n", "on 1577 1530 47 47\n", "be 1934 1898 36 36\n", "with 1347 1313 34 34\n", "religion 956 923 33 33\n", "are 1661 1629 32 32\n", "by 1097 1066 31 31\n", "ethic 1356 1326 30 30\n", "from 1106 1078 28 28\n", "thi 1005 978 27 27\n", "cultur 923 897 26 26\n", "other 780 756 24 24\n", "year 240 216 24 24\n", "have 824 801 23 23\n", "behavior 776 753 23 23\n", "studi 602 579 23 23\n", "page 197 174 23 23\n", "theori 571 549 22 22" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "morality = clean_and_compute(morality)\n", "morality.head(20)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ethics: 3739 revisions\n", " \n" ] } ], "source": [ "ethics = source_stems(\"Ethics\")\n", "ethics = to_df(ethics)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
addeddeletedadded - deletedabs(added - deleted)
ethic1023610012224224
moral287428007474
be313030666464
are279127444747
on192418774747
right202419824242
philosophi135613183838
not158515483737
with168116453636
by179517613434
cite3202893131
wa146114313030
one143014012929
good140613772929
virtu7827532929
theori175217252727
thi151414872727
action8318042727
person133313072626
from111110872424
\n", "
" ], "text/plain": [ " added deleted added - deleted abs(added - deleted)\n", "ethic 10236 10012 224 224\n", "moral 2874 2800 74 74\n", "be 3130 3066 64 64\n", "are 2791 2744 47 47\n", "on 1924 1877 47 47\n", "right 2024 1982 42 42\n", "philosophi 1356 1318 38 38\n", "not 1585 1548 37 37\n", "with 1681 1645 36 36\n", "by 1795 1761 34 34\n", "cite 320 289 31 31\n", "wa 1461 1431 30 30\n", "one 1430 1401 29 29\n", "good 1406 1377 29 29\n", "virtu 782 753 29 29\n", "theori 1752 1725 27 27\n", "thi 1514 1487 27 27\n", "action 831 804 27 27\n", "person 1333 1307 26 26\n", "from 1111 1087 24 24" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ethics = clean_and_compute(ethics)\n", "ethics.head(20)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "love.to_csv(\"data/love.terms.csv\", encoding=\"utf-8\")\n", "wisdom.to_csv(\"data/wisdom.terms.csv\", encoding=\"utf-8\")\n", "ethics.to_csv(\"data/ethics.terms.csv\", encoding=\"utf-8\")\n", "morality.to_csv(\"data/morality.terms.csv\", encoding=\"utf-8\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
addeddeletedadded - deletedabs(added - deleted)
nature,17-66
band175180-55
love==160165-55
br225229-44
13157161-44
februari4448-44
|love3034-44
fact|dat48-44
food,26-44
organizations,26-44
g102105-33
iniqu912-33
/b710-33
86710-33
feelings,58-33
id404406-22
me227229-22
|ero223225-22
etc190192-22
patient173175-22
\n", "
" ], "text/plain": [ " added deleted added - deleted abs(added - deleted)\n", "nature, 1 7 -6 6\n", "band 175 180 -5 5\n", "love== 160 165 -5 5\n", "br 225 229 -4 4\n", "13 157 161 -4 4\n", "februari 44 48 -4 4\n", "|love 30 34 -4 4\n", "fact|dat 4 8 -4 4\n", "food, 2 6 -4 4\n", "organizations, 2 6 -4 4\n", "g 102 105 -3 3\n", "iniqu 9 12 -3 3\n", "/b 7 10 -3 3\n", "86 7 10 -3 3\n", "feelings, 5 8 -3 3\n", "id 404 406 -2 2\n", "me 227 229 -2 2\n", "|ero 223 225 -2 2\n", "etc 190 192 -2 2\n", "patient 173 175 -2 2" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "love[ love[\"added - deleted\"] < 0 ].head(20)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }