## Importing data

In [1]:
import codecs
import json
import os, sys

from collections import defaultdict
from multiprocessing import Pool as ThreadPool

from IPython.display import display, HTML

import pandas as pd

sys.path.append(os.path.abspath('../../WKP-python-toolkit'))
import wekeypedia

In [11]:
# Punctuation characters. No function in the visible cells reads this string;
# clean_and_compute() builds its own local list of tokens to drop.
ignore_list = "{}()[]<>./,;\"':!?&#=*&%"

# Shared mapping handed to every count_stems() call made by revision_stems();
# a key that is looked up before being set starts out as an empty dict.
inflections = defaultdict(dict)

def from_file(name):
  """Load the JSON document stored at `name` and return the decoded object.

  The file is decoded with the "utf-8-sig" codec, so a leading byte-order
  mark is skipped when present.

  Raises whatever codecs.open() raises for an unreadable path and whatever
  json.load() raises for malformed JSON.
  """
  with codecs.open(name, "r", encoding="utf-8-sig") as f:
    return json.load(f)

def list_revisions(page):
  """Return the names of the entries stored in the "data/<page>" directory.

  The path is relative to the current working directory and the names come
  back in whatever order os.listdir() yields them.
  """
  page_dir = "data/%s" % (page)
  return os.listdir(page_dir)

def revision_stems(revision_filename):
  """Count the stems on each side of the diff stored in one revision file.

  revision_filename -- path of a JSON file readable by from_file(); the diff
                       is read from its ["diff"]["*"] entry.

  Returns a dict with the keys "added" and "deleted". Each value is what
  WikipediaPage.count_stems() returns for that side of the diff, or an empty
  dict when the stored diff is False.
  """
  page = wekeypedia.WikipediaPage()
  revision = from_file(revision_filename)

  raw_diff = revision["diff"]["*"]

  # Some stored revisions hold False instead of a diff (seen with
  # Ethics#462124891); report them as having no stems at all.
  if raw_diff == False:
    return { "added": {}, "deleted": {} }

  plusminus = page.extract_plusminus(raw_diff)

  # "added" is counted before "deleted"; both calls receive the shared
  # module-level `inflections` mapping.
  return dict(
    (side, page.count_stems(plusminus[side], inflections))
    for side in ("added", "deleted"))

def source_stems(s):
  p = wekeypedia.WikipediaPage(s)
  
  revisions = list_revisions(s)

  result = {
      "added": defaultdict(dict),
      "deleted": defaultdict(dict) }
  
  print "%s: %s revisions" % (s, len(revisions))
  
  i = 0

  for r in revisions:
    i += 1
    print "\rrevisions: %s (%s/%s)" % (r, i, len(revisions),),
    stems = revision_stems("data/%s/%s" % (s, r))
    
    for x in ["added", "deleted"]:
      for stem in stems[x].iteritems():
        result[x].setdefault(stem[0], 0)
        result[x][stem[0]] += stem[1]
  print "\r ",
  return result

def to_df(a):
  """Turn the two stem-count mappings of `a` into one DataFrame.

  a -- mapping with the keys "added" and "deleted", each a dict of
       stem -> count (the shape returned by source_stems()).

  Returns a DataFrame indexed by stem with the columns "added" and
  "deleted". The two sides are outer-joined, so a stem present on only one
  side has NaN in the other column. An empty side yields an empty column
  instead of raising.
  """
  def column_frame(side):
    # Naming the column at construction keeps the frame valid even when
    # there are no rows; assigning `.columns` afterwards fails in that case.
    counts = a[side]
    return pd.DataFrame(
      list(counts.values()), index=list(counts.keys()), columns=[side])

  return column_frame("added").join(column_frame("deleted"), how="outer")




In [3]:
def clean_and_compute(df):
  """Drop noise tokens from a stem table and rank it by net change.

  df -- DataFrame indexed by stem with the columns "added" and "deleted"
        (the shape returned by to_df()). It is not modified.

  Returns a new DataFrame without the rows whose index is in the built-in
  noise list, with two extra columns, "added - deleted" and
  "abs(added - deleted)", sorted by the absolute difference and then by
  "added", both descending.
  """
  noise = [ "a", "of", "and", "to", "the", "is", "for", "or" , "in", "that", "it", "|", "ref",
            "http", "''", "``", "s", "an", "-", "=", "*", "==", "===", "====", "name=", "nbsp", "style=", "5px",
            "font-siz", "|-", "--", "wikiquot", "/ref", "'s" ]

  # Only drop labels that are actually present; drop() rejects unknown ones.
  df = df.drop([ w for w in noise if w in df.index ])

  # A stem seen on only one side has NaN on the other after the outer join
  # in to_df(); fill_value=0 counts that side as zero so the stem is ranked
  # instead of ending up with a NaN difference.
  balance = df["added"].sub(df["deleted"], fill_value=0)
  df["added - deleted"] = balance
  df["abs(added - deleted)"] = balance.abs()

  # DataFrame.sort was replaced by sort_values in later pandas releases;
  # use whichever one this installation provides.
  sort_rows = getattr(df, "sort_values", None) or df.sort
  return sort_rows(["abs(added - deleted)", "added"], ascending=[False, False])

In [4]:
# Sum the stem counts over every stored "Love" revision and tabulate them.
love = to_df(source_stems("Love"))

Love: 6324 revisions
 


In [5]:
# Drop the noise tokens, add the net-change columns and rank the table
# (all done by clean_and_compute), then display the top of the ranking.
love = clean_and_compute(love)
love.head(20)

Unnamed: 0,added,deleted,added - deleted,abs(added - deleted)
love,41315,40990,325,325
be,5886,5831,55,55
god,4074,4024,50,50
with,4237,4193,44,44
by,4147,4104,43,43
one,3169,3135,34,34
cite,556,522,34,34
from,2039,2007,32,32
thi,3509,3478,31,31
which,3357,3326,31,31


In [6]:
# Sum the stem counts over every stored "Wisdom" revision and tabulate them.
wisdom = to_df(source_stems("Wisdom"))

Wisdom: 1634 revisions
 


In [7]:
# Drop the noise tokens, add the net-change columns and rank the table
# (all done by clean_and_compute), then display the top of the ranking.
wisdom = clean_and_compute(wisdom)
wisdom.head(20)

Unnamed: 0,added,deleted,added - deleted,abs(added - deleted)
wisdom,3140,3025,115,115
with,1093,1064,29,29
be,699,675,24,24
he,506,484,22,22
cite,138,116,22,22
one,645,624,21,21
wise,619,599,20,20
from,508,488,20,20
by,494,475,19,19
which,252,234,18,18


In [8]:
# Sum the stem counts over every stored "Morality" revision and tabulate them.
morality = to_df(source_stems("Morality"))

Morality: 2776 revisions
 


In [9]:
# Drop the noise tokens, add the net-change columns and rank the table
# (all done by clean_and_compute), then display the top of the ranking.
morality = clean_and_compute(morality)
morality.head(20)

Unnamed: 0,added,deleted,added - deleted,abs(added - deleted)
moral,7595,7421,174,174
cite,728,665,63,63
journal,620,565,55,55
on,1577,1530,47,47
be,1934,1898,36,36
with,1347,1313,34,34
religion,956,923,33,33
are,1661,1629,32,32
by,1097,1066,31,31
ethic,1356,1326,30,30


In [12]:
# Sum the stem counts over every stored "Ethics" revision and tabulate them.
ethics = to_df(source_stems("Ethics"))

Ethics: 3739 revisions
 


In [13]:
# Drop the noise tokens, add the net-change columns and rank the table
# (all done by clean_and_compute), then display the top of the ranking.
ethics = clean_and_compute(ethics)
ethics.head(20)

Unnamed: 0,added,deleted,added - deleted,abs(added - deleted)
ethic,10236,10012,224,224
moral,2874,2800,74,74
be,3130,3066,64,64
are,2791,2744,47,47
on,1924,1877,47,47
right,2024,1982,42,42
philosophi,1356,1318,38,38
not,1585,1548,37,37
with,1681,1645,36,36
by,1795,1761,34,34


In [14]:
# Write each page's ranked term table to a UTF-8 encoded CSV file under data/.
love.to_csv("data/love.terms.csv", encoding="utf-8")
wisdom.to_csv("data/wisdom.terms.csv", encoding="utf-8")
ethics.to_csv("data/ethics.terms.csv", encoding="utf-8")
morality.to_csv("data/morality.terms.csv", encoding="utf-8")

In [15]:
# Terms of the "Love" table that were deleted more often than added.
love.loc[love["added - deleted"] < 0].head(20)

Unnamed: 0,added,deleted,added - deleted,abs(added - deleted)
"nature,",1,7,-6,6
band,175,180,-5,5
love==,160,165,-5,5
br,225,229,-4,4
13,157,161,-4,4
februari,44,48,-4,4
|love,30,34,-4,4
fact|dat,4,8,-4,4
"food,",2,6,-4,4
"organizations,",2,6,-4,4
