<a href="http://www.persistent-identifier.nl/?identifier=urn%3Anbn%3Anl%3Aui%3A13-048i-71" target="_blank"><img align="left"src="images/etcbc4easy-small.png"/></a>
<a href="http://laf-fabric.readthedocs.org/en/latest/" target="_blank"><img align="left" src="images/laf-fabric-xsmall.png"/></a>
<a href="http://www.godgeleerdheid.vu.nl/etcbc" target="_blank"><img align="right" src="images/VU-ETCBC-xsmall.png"/></a>

# Rare lexemes in parallels

Joint work of Kenneth Bergland, Martijn Naaijer and Dirk Roorda.

We try to find rare combinations of words and see how they occur in Thora books and in the Prophets.

In [1]:
import sys,os
import collections, difflib
from IPython.display import HTML, display_pretty, display_html
from difflib import SequenceMatcher

import laf
from laf.fabric import LafFabric
from etcbc.preprocess import prepare
fabric = LafFabric()

  0.00s This is LAF-Fabric 4.5.5
API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
Feature doc: https://shebanq.ancient-data.org/static/docs/featuredoc/texts/welcome.html



In [2]:
source = 'etcbc'
version = '4b'

In [3]:
API = fabric.load(source+version, 'lexicon', 'rare_lexemes', {
    "xmlids": {"node": False, "edge": False},
    "features": ('''
        otype
        language lex g_cons g_word_utf8 trailer_utf8 phono phono_sep
        sp
        book chapter verse label number
    ''',''),
    "prepare": prepare,
    "primary": False,
}, verbose='NORMAL')
exec(fabric.localnames.format(var='fabric'))

  0.00s LOADING API: please wait ... 
  0.00s INFO: USING DATA COMPILED AT: 2015-11-02T15-08-56
  0.00s INFO: USING DATA COMPILED AT: 2015-11-03T06-44-21
  4.17s LOGFILE=/Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/rare_lexemes/__log__rare_lexemes.txt
    13s ETCBC reference: http://laf-fabric.readthedocs.org/en/latest/texts/ETCBC-reference.html
  0.00s LOADING API with EXTRAs: please wait ... 
  0.00s INFO: USING DATA COMPILED AT: 2015-11-02T15-08-56
  0.00s INFO: USING DATA COMPILED AT: 2015-11-03T06-44-21
  0.00s INFO: DATA LOADED FROM SOURCE etcbc4b AND ANNOX lexicon FOR TASK rare_lexemes AT 2015-12-17T10-55-18
  0.00s INFO: DATA LOADED FROM SOURCE etcbc4b AND ANNOX lexicon FOR TASK rare_lexemes AT 2015-12-17T10-55-18


In [4]:
book_classes = dict(
    legal = set('''
        Exodus Leviticus Deuteronomium
    '''.strip().split()),
    prophets = set('''
        Jesaia Jeremia
        Ezechiel Hosea
        Joel Amos Obadia Jona Micha Nahum Habakuk Zephania Haggai Sacharia Maleachi
    '''.strip().split())
)
book_classes_index = {}
for (cl, books) in book_classes.items():
    for book in books:
        book_classes_index[book] = cl

# Leave out non-content words

It is a bit sorry to have all those bigrams involving an article, for example.
So, as an option, we leave out the non content words, being words with a certain part-of-speech.

    X art	article
      verb	verb
      subs	noun
      nmpr	proper noun
      advb	adverb
    X prep	preposition
    X conj	conjunction
    X prps	personal pronoun
    X prde	demonstrative pronoun
    X prin	interrogative pronoun
    X intj	interjection
    X nega	negative particle
    X inrg	interrogative particle
      adjv	adjective

In [5]:
ONLY_SP = set('''
    verb subs nmpr advb adjv
'''.strip().split())

# Gather bi- and tri-grams

We create a big set of bi-tri-grams and count their frequencies.
We only consider bi-tri-grams that of which all members are part of the same clause.

We want to know the frequencies in the thora, in the prophets, and in the whole bible.

## Get all the grams

In [6]:
grams = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.Counter()))
ng = 4 # the max number of members in the n-gram
msg('Collecting all <= {}-grams'.format(ng))
for c in F.otype.s('clause'):
    bk = F.book.v(L.u('book', c))
    words = list(L.d('word', c))
    lenw = len(words)
    for n in range(1, ng + 1):
        for i in range(lenw - n):
            this_gram = '-'.join(F.lex.v(w) if F.sp.v(w) in ONLY_SP else '*' for w in (words[j] for j in range(i, n)))
            grams[n][bk][this_gram] += 1
msg('Done')

    11s Collecting all <= 4-grams
    16s Done


# Count and rank the grams

Now compute the counts for the indicated groups of books.

In [7]:
freqs_by_g = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.Counter()))
freqs_by_c = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.Counter()))

msg('Counting grams')
for n in grams:
    ngrams = grams[n]
    for bk in ngrams:
        cl = book_classes_index.get(bk, 'bible')
        these_grams = ngrams[bk]
        for g in these_grams:
            freqs_by_g[n][g][cl] += these_grams[g]
            freqs_by_c[n][cl][g] += these_grams[g]
msg('Done')

    18s Counting grams
    19s Done


# Filter the grams

We set a threshold below which grams are considered *rare* and another threshold above which grams are considered abundant.

Then we list all grams that are abundant in one collection of books but rare in the other one.

In [8]:
RARE = 1

class_a = 'legal'
class_b = 'prophets'

results = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.Counter()))

msg('Filtering')
for RARE in range(1, 10):
    for n in sorted(freqs_by_g):
        nfreqs_by_g = freqs_by_g[n]
        for g in nfreqs_by_g:
            g_info = nfreqs_by_g[g]
            nc = dict((cl, g_info.get(cl, 0)) for cl in book_classes)
            if nc[class_a] <= RARE and nc[class_b] <= RARE: results[RARE][n]['rr'] += 1 
            elif nc[class_a] <= RARE and nc[class_b] > RARE: results[RARE][n]['ra'] += 1
            elif nc[class_a] > RARE and nc[class_b] <= RARE: results[RARE][n]['ar'] += 1
            elif nc[class_a] > RARE and nc[class_b] > RARE: results[RARE][n]['aa'] += 1
            else: results[RARE][n]['xx'] += 1
msg('Done')

for RARE in sorted(results):
    print('RARE means <= {}'.format(RARE))
    for n in range(1, ng + 1):
        print('\t{}-grams: rr ra ar aa = {}'.format(
            n, 
            ' '.join(
                    '{:>5}'.format(results[RARE].get(n, {}).get(y, 0)) for y in ('rr', 'ra', 'ar', 'aa')
                ),
            ))

    21s Filtering
    24s Done


RARE means <= 1
	1-grams: rr ra ar aa =  1866   423    88   141
	2-grams: rr ra ar aa = 10124  1276   430   700
	3-grams: rr ra ar aa = 29657  1933  1116   723
	4-grams: rr ra ar aa = 42812  1541  1393   458
RARE means <= 2
	1-grams: rr ra ar aa =  2120   262    43    93
	2-grams: rr ra ar aa = 10997   808   241   484
	3-grams: rr ra ar aa = 31502   927   535   465
	4-grams: rr ra ar aa = 44768   603   583   250
RARE means <= 3
	1-grams: rr ra ar aa =  2232   181    28    77
	2-grams: rr ra ar aa = 11403   566   178   383
	3-grams: rr ra ar aa = 32193   555   346   335
	4-grams: rr ra ar aa = 45384   309   351   160
RARE means <= 4
	1-grams: rr ra ar aa =  2307   124    24    63
	2-grams: rr ra ar aa = 11643   433   135   319
	3-grams: rr ra ar aa = 32525   383   262   259
	4-grams: rr ra ar aa = 45616   215   246   127
RARE means <= 5
	1-grams: rr ra ar aa =  2352    96    18    52
	2-grams: rr ra ar aa = 11803   334   115   278
	3-grams: rr ra ar aa = 32721   295   203   210
	4-grams

# Verse index

In [4]:
msg("Making a mapping between a passage specification and a verse node")
passage2vnode = {}
for vs in F.otype.s('verse'):
    lab = (F.book.v(vs), F.chapter.v(vs), F.verse.v(vs))
    passage2vnode[lab] = vs
msg("{} verses".format(len(passage2vnode)))

21m 38s Making a mapping between a passage specification and a verse node
21m 39s 23213 verses


# Ways forward

1. restrict to indicated chapters in each of the books (that correspond to the agreed genre of *legal inside thora*)
2. compute unique parallels between different collections in order to get a baseline of how many to expect.
3. alternatively, use parallel passages to find parallel phrase and then focus on things that are rare.

# Other indicators

Look for *update lexemes*, where in parallel phrases one lexeme is different, for example measure nouns.
See for example Ezek 45:10 - Lev 19:36 - Deut 25:15.

This is a case of parallel language use, not strictly a *parallel passage*.

# Difference lexemes

We are going to list the lexemes that make a difference between parallel passages.
For every set of parallel passages we list the lexemes that do not occur in their intersection.

We print a list of all groups of parallel verses, where the lexemes that do not occur in all members of the group are highlighted.

Every lexeme that occurs in some parallel passage, but not in the intersection of the lexemes of its parallel passages will be listed in a dictionary, keyed by that lexeme, and then by the group number, and then it has the following information: passages in which it does occur, and passages in which it does not occur.

In [5]:
CROSSREF_TOOL = 'parallel'
CROSSREF_DB_FILE = 'crossrefdb.csv'
SHEBANQ_PATH = os.path.abspath('{}/../../../shebanq'.format(os.getcwd))
CROSSREF_DB_DIR = '{}/static/docs/tools/{}/files'.format(SHEBANQ_PATH, CROSSREF_TOOL)
CROSSREF_DB_PATH = '{}/{}'.format(CROSSREF_DB_DIR, CROSSREF_DB_FILE)
msg(CROSSREF_DB_PATH)
PRETTY_PAIRS = 'pairs.html'
DIFF_LEX = 'difflex.csv'

21m 43s /Users/dirk/SURFdrive/current/demos/github/shebanq/static/docs/tools/parallel/files/crossrefdb.csv


# Results

In [6]:
loc_tpl = 'https://rawgit.com/etcbc/laf-fabric-nbs/master/lingvar/{}'
HTML('''
<a target="_blank" href="{}">parallel_pairs</a>
<a target="_blank" href="{}">difference lexemes</a>
'''.format(loc_tpl.format(PRETTY_PAIRS), loc_tpl.format(DIFF_LEX)))

In [7]:
msg('Reading crossrefs database')
n = 0
parallel_pairs_proto = collections.defaultdict(lambda: set())
group_index = {}
group_number = 0
with open(CROSSREF_DB_PATH) as f:
    for line in f:
        n += 1
        if n == 1: continue
        (bkx, chx, vsx, bky, chy, vsy, rd) = line.rstrip('\n').split('\t')
        vx = passage2vnode[(bkx, chx, vsx)]
        vy = passage2vnode[(bky, chy, vsy)]
        gx = group_index.get(vx, None)
        gy = group_index.get(vy, None)
        if gx == None and gy == None:
            group_number += 1
            group_index[vx] = group_number
            group_index[vy] = group_number
        elif gx == None:
            group_index[vx] = gy
        elif gy == None:
            group_index[vy] = gx
        elif gx != gy:
            update = [x for x in group_index if group_index[x] == gx]
            for x in update: group_index[x] = gy
msg('{} entries read'.format(n))

for (x, n) in group_index.items(): parallel_pairs_proto[n].add(x)
parallel_pairs = [sorted(parallel_pairs_proto[n]) for n in parallel_pairs_proto]
parallel_pairs_proto = None
msg('Gathered {} sets of parallel verses'.format(len(parallel_pairs)))

21m 52s Reading crossrefs database
21m 52s 14354 entries read
21m 52s Gathered 1235 sets of parallel verses


In [8]:
msg('Building a difference lexeme table')
intersections = []
unions = []
for p in parallel_pairs:
    clex = None
    ulex = collections.defaultdict(lambda: set())
    for x in p:
        xlex = {F.lex.v(w) for w in L.d('word', x)}
        clex = xlex if clex == None else clex & xlex
        for lex in xlex:
            ulex[lex].add(x)
    intersections.append(clex)
    unions.append(ulex)
msg('{} groups in table'.format(len(intersections)))

21m 59s Building a difference lexeme table
22m 00s 1235 groups in table


In [9]:
css = '''
<style type="text/css">
table.t {
    width: 100%;
    border-collapse: collapse;
    direction: rtl;
    border: 2px solid #aaaaaa;
}
td.t {
    border: 2px solid #aaaaaa;
    font-family: Ezra SIL, SBL Hebrew, Verdana, sans-serif;
    font-size: x-large;
    line-height: 1.7;
    text-align: right;
    direction: rtl;
}
td.vl {
    font-family: Verdana, Arial, sans-serif;
    font-size: small;
    text-align: right;
    color: #aaaaaa;
    width: 10%;
    direction: ltr;
    border-left: 2px solid #aaaaaa;
    border-right: 2px solid #aaaaaa;
}
span.m {
    background-color: #aaaaff;
}
span.f {
    background-color: #ffaaaa;
}
span.x {
    background-color: #ffffaa;
    color: #bb0000;
}
span.delete {
    background-color: #ffaaaa;
}
span.insert {
    background-color: #aaffaa;
}
span.replace {
    background-color: #ffff00;
}
</style>
'''

html_file_tpl = '''<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>{}</title>
{}
</head>
<body>
<h1>Table of groups</h1>
{}
<h1>Groups</h1>
{}
</body>
</html>'''

In [10]:
# we want to sort passages in such a way that the verses in 1 Kings 19-26 are put before anything else

def print_chunk(i, clex, members):
    result = ['''
<a name="c_{i}">Group {i}</a>
<table class="t">
'''.format(i=i)]
    for v in members:
        lab = '{} {}:{}'.format(F.book.v(v), F.chapter.v(v), F.verse.v(v))
        text = ''.join(
            (F.g_word_utf8.v(w)\
                if F.lex.v(w) in clex \
                else '<span class="replace">{}</span>'.format(F.g_word_utf8.v(w))
            )+F.trailer_utf8.v(w) for w in L.d('word', v)
        )
        result.append('''
<tr class="t"><td class="vl">{rb}</td><td class="t">{rl}</td></tr>
'''.format(
            rb=lab,
            rl=text,
        ))
    result.append('''</table>
''')
    return ''.join(result)

def index_chunk(i, clex, members):
    verse_labels = ', '.join('{} {}:{}'.format(F.book.v(v), F.chapter.v(v), F.verse.v(v)) for v in members)
    return '<p><b>{i}</b> <a href="#c_{i}">{vl}</a></p>\n'.format(
        vl = verse_labels, i=i,
    )

In [11]:
msg('Pretty printing the table')
allgeni_html = []
allgenh_html = []
for (i, p) in enumerate(parallel_pairs):
    clex = intersections[i]
    allgeni_html.append(index_chunk(i, clex, p))
    allgenh_html.append(print_chunk(i, clex, p))
outf = open('pairs.html', 'w')
outf.write(html_file_tpl.format(
    'Pairs',
    css,
    ''.join(allgeni_html),
    ''.join(allgenh_html),
))
outf.close()
msg('Done')

22m 21s Pretty printing the table
22m 21s Done


# The table with difference lexemes

In [13]:
diff_pairs = collections.defaultdict(lambda: collections.defaultdict(lambda: {}))
for (i, p) in enumerate(parallel_pairs):
    clex = intersections[i]
    ulex = unions[i]
    for lex in ulex:
        if lex in clex: continue
        diff_pairs[lex][i]['has'] = ulex[lex]
        diff_pairs[lex][i]['hasnot'] = set(p) - ulex[lex]

In [15]:
outf = open('difflex.csv', 'w')
outf.write('lexeme\tgroup\tsize\thas?\tbook\tchapter\tverse\n')
for lex in sorted(diff_pairs):
    for i in sorted(diff_pairs[lex]):
        lnp = len(diff_pairs[lex][i]['has']) + len(diff_pairs[lex][i]['hasnot'])
        for v in sorted(diff_pairs[lex][i]['has']):
            outf.write('{}\t{}\t{}\t+\t{}\t{}\t{}\n'.format(
                lex, i, lnp, F.book.v(v), F.chapter.v(v), F.verse.v(v), 
            ))
        for v in sorted(diff_pairs[lex][i]['hasnot']):
            outf.write('{}\t{}\t{}\t-\t{}\t{}\t{}\n'.format(
                lex, i, lnp, F.book.v(v), F.chapter.v(v), F.verse.v(v), 
            ))
outf.close()

The result is a tab-separated file with fields:

    lexeme
    group number
    has?
    book
    chapter
    verse
    
We list the lexemes that occur in at least one group of parallel passages where the lexeme in question does not occur in all members of the group.
For all such lexemes and for all such groups and for all members of such groups we make an entry.
The entry has + in field *has?* if the lexeme occurs in that passage, else it has -.

See below for the lines corresponding to the first 4 lexemes.