# Lexemes

Various ways to list the lexeme base of individual chapters in the Hebrew Bible.

The *lexeme base* of a passage is the set of lexemes that occurs in that passage.

We define a function, ``lexbase(passages, excluded=xpassages)``, 
that produces a file of the lexemes that occur in a given list of passages and do not occur in another given list of passages.

If you have LAF-Fabric working and have downloaded this notebook, you can call this function yourself in order to generate 
lexeme bases of arbitrary passages.

We also produce standard files with the lexeme bases of individual books, chapters and verses in the Bible.


# Output

The output files are organized as follows:

* all files are comma-separated text files that can be imported into a spreadsheet application such as OpenOffice or Excel;
* every line corresponds to a lexeme in the lexeme base and contains the following information:
 * lexeme (unique identifier in transcription, containing `` / [ = `` characters),
 * frequency (number of occurrences of this lexeme in the whole Hebrew Bible),
 * ``lex_utf8`` feature (the lexeme in Hebrew as it occurs in the ETCBC text database),
 * ``g_entry_heb`` feature (the vocalized lexeme as it is listed in the ETCBC lexicon),
 * ``sp`` feature (part of speech),
 * ``gloss`` feature.
 


In [1]:
import sys, collections, re

from laf.fabric import LafFabric
from etcbc.preprocess import prepare
# Create the LAF-Fabric processor object; the data is loaded through it
# (fabric.load) in the next cell.
fabric = LafFabric()

 0.00s This is LAF-Fabric 4.5.0
API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
Feature doc: http://shebanq-doc.readthedocs.org/en/latest/texts/welcome.html



In [2]:
# Version suffix of the data set; it is interpolated into the source name
# passed to fabric.load ('etcbc4b').
version = '4b'
# Load the data for this notebook. According to the log output of this cell,
# the three positional arguments are the data source ('etcbc4b'), the
# annotation package ('lexicon') and the task name ('lexemes').
# The first string under "features" lists the features this notebook reads
# through F.<name>; the second string is left empty
# (presumably it is the slot for edge features -- TODO confirm in the API docs).
fabric.load('etcbc{}'.format(version), 'lexicon', 'lexemes', {
 "xmlids": {"node": False, "edge": False},
 "features": ('''
 otype
 lex lex_utf8 g_entry_heb
 sp gloss
 book chapter verse
 ''',''),
 "prepare": prepare,
 "primary": False,
})
# NOTE(review): this executes a code string supplied by LAF-Fabric
# (fabric.localnames). Presumably it defines the short API names used in the
# following cells (F, L, my_file, outfile) -- confirm against the API docs.
exec(fabric.localnames.format(var='fabric'))

 0.00s LOADING API: please wait ... 
 0.65s INFO: USING DATA COMPILED AT: 2015-05-04T13-46-20
 0.65s INFO: USING DATA COMPILED AT: 2015-05-04T14-07-34
 3.67s LOGFILE=/Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/__log__lexemes.txt
 14s INFO: DATA LOADED FROM SOURCE etcbc4b AND ANNOX lexicon FOR TASK lexemes AT 2015-05-27T15-43-40


In [3]:
# Full paths of the two output sub-directories: 'csv' receives the standard
# per-book files, 'passage' receives the files written by lexbase().
csvdir = my_file('csv')
passagedir = my_file('passage')
# IPython magic lines (they only work inside IPython/Jupyter, not in plain
# Python): create both directories; -p means no error if they already exist.
%mkdir -p {csvdir}
%mkdir -p {passagedir}

In [9]:
# Pattern for a single passage specification:
#     <book> [<chapter ranges>] [:] [<verse ranges>]
# Group 1 is the book name (letters, digits, underscores), group 2 the chapter
# ranges and group 3 the verse ranges. Both range groups may be empty and
# consist of digits, commas and hyphens only, e.g. "3-4,10".
# The pattern is a raw string so that "\s" reaches the regex engine verbatim
# instead of being treated as an (invalid) string escape sequence.
passage_pat = re.compile(r'^\s*([A-Za-z0-9_]+)\s*([0-9,-]*)\s*:?\s*([0-9,-]*)\s*$')

# One pass over all verse nodes fills three lookup tables:
#   lex_info    : lexeme -> (lex_utf8, g_entry_heb, sp, gloss), taken from the
#                 first word encountered that carries the lexeme
#   lex_section : book -> chapter -> verse -> Counter of the lexemes in that verse
#   lex_count   : lexeme -> number of occurrences over all visited verses
lex_info = {}
lex_section = {}
lex_count = collections.Counter()
for verse_node in F.otype.s('verse'):
    book_name = F.book.v(L.u('book', verse_node))
    chapter_label = F.chapter.v(L.u('chapter', verse_node))
    verse_label = F.verse.v(verse_node)
    for word_node in L.d('word', verse_node):
        lexeme = F.lex.v(word_node)
        if lexeme not in lex_info:
            lex_info[lexeme] = (
                F.lex_utf8.v(word_node),
                F.g_entry_heb.v(word_node),
                F.sp.v(word_node),
                F.gloss.v(word_node),
            )
        # The nested entry is created on demand, the first time a word of
        # this verse is seen.
        verse_counter = (
            lex_section
            .setdefault(book_name, {})
            .setdefault(chapter_label, {})
            .setdefault(verse_label, collections.Counter())
        )
        verse_counter[lexeme] += 1
        lex_count[lexeme] += 1

def verse_index():
    """Build a nested lookup from book, chapter and verse labels to verse nodes.

    Returns:
        A dict of the form ``{book: {chapter: {verse: node}}}``. The keys are
        the values of the ``book``, ``chapter`` and ``verse`` features and
        ``node`` is the node yielded by ``F.verse.s()``.
    """
    index = {}
    for verse_node in F.verse.s():
        book_name = F.book.v(L.u('book', verse_node))
        chapter_label = F.chapter.v(L.u('chapter', verse_node))
        verse_label = F.verse.v(verse_node)
        chapters_of_book = index.setdefault(book_name, {})
        verses_of_chapter = chapters_of_book.setdefault(chapter_label, {})
        verses_of_chapter[verse_label] = verse_node
    return index

# Build the book -> chapter -> verse -> node index once; parse_passage()
# resolves passage specifications against it.
vindex = verse_index()

def parse_passages(passages):
    """Collect the lexemes of a ``|``-separated list of passage specifications.

    Args:
        passages: A string such as ``'Numeri 1-3:10-15|Ruth 4'``. Surrounding
            whitespace of the whole string and of each part is ignored.

    Returns:
        The union of the lexeme sets that ``parse_passage`` returns for the
        individual parts.
    """
    specs = (part.strip() for part in passages.strip().split('|'))
    return set().union(*(parse_passage(spec) for spec in specs))

def parse_ranges(rangespec, kind, passage, source, subsources=None):
    """Expand a comma-separated range specification into a set of number strings.

    Args:
        rangespec: Text such as ``'3-4,10'``. Every comma-separated item is
            either a single number or a span ``b-e`` (both ends inclusive).
            The empty string selects everything that ``source`` offers.
        kind: Word used in the messages, e.g. ``'chapter'`` or ``'verse'``.
        passage: The complete passage text; only used in the messages.
        source: When ``subsources`` is None, a mapping whose keys are the
            known numbers (as strings). Otherwise a mapping from subsource key
            to such a mapping.
        subsources: Optional iterable of keys of ``source``. When given,
            numbers are looked up in ``source[subsource]`` for each subsource.

    Returns:
        A set of decimal strings. For an empty ``rangespec`` these are the
        keys found in ``source`` (or in the selected subsources). Otherwise
        they are the requested numbers; a requested number that does not exist
        is still returned, after a warning has been printed. Items that are
        not valid ranges are reported with an error message and skipped.
    """
    if rangespec == '':
        if subsources is None:
            return set(source.keys())
        available = set()
        for subsource in subsources:
            if subsource in source:
                available |= set(source[subsource].keys())
        return available

    numbers = set()
    for r in rangespec.split(','):
        first, sep, last = r.partition('-')
        if not sep:
            # A single number is the span from that number to itself.
            last = first
        if not (first.isdigit() and last.isdigit()):
            print('Error: Not a valid {} range: [{}] in [{}]'.format(kind, r, passage))
            continue
        lo = int(first)
        hi = int(last)
        if lo > hi:
            # A descending span would silently select nothing; report it.
            print('Error: Not a valid {} range: [{}] in [{}]'.format(kind, r, passage))
            continue
        for c in range(lo, hi + 1):
            crep = str(c)
            if subsources is None:
                if crep not in source:
                    print('Warning: No such {}: {} ([{}] in [{}])'.format(kind, crep, r, passage))
                numbers.add(crep)
            else:
                for subsource in subsources:
                    if subsource not in source or crep not in source[subsource]:
                        print('Warning: No such {}: {}:{} ([{}] in [{}])'.format(
                            kind, subsource, crep, r, passage,
                        ))
                    numbers.add(crep)
    return numbers
 
def parse_passage(passage):
    """Return the set of lexemes that occur in a single passage.

    Args:
        passage: A specification matched by ``passage_pat``: a book name,
            optionally followed by chapter ranges and verse ranges, e.g.
            ``'Genesis 3-4,10'`` or ``'Numeri 1-3:10-15'``.

    Returns:
        The set of ``lex`` values of all words in the selected verses. If the
        specification cannot be parsed or the book is not in ``vindex``, an
        error message is printed and the empty set is returned.
    """
    match = passage_pat.match(passage)
    if match is None:
        print('Error: Not a valid passage: {}'.format(passage))
        return set()

    book, chapterspec, versespec = match.group(1, 2, 3)
    if book not in vindex:
        print('Error: Not a valid book: {} in {}'.format(book, passage))
        return set()

    book_index = vindex[book]
    chapters = parse_ranges(chapterspec, 'chapter', passage, book_index)
    verses = parse_ranges(versespec, 'verse', passage, book_index, chapters)

    # Keep only the verse nodes whose chapter and verse labels were selected.
    verse_nodes = {
        node
        for chapter_label, verse_map in book_index.items()
        if chapter_label in chapters
        for verse_label, node in verse_map.items()
        if verse_label in verses
    }
    return {
        F.lex.v(word_node)
        for node in verse_nodes
        for word_node in L.d('word', node)
    }
 
def lexbase(passages, excluded=None):
    """Write the lexeme base of *passages* to a CSV file and print a preview.

    The lexeme base consists of the lexemes that occur in ``passages`` and do
    not occur in ``excluded``. It is written to
    ``passage/<passages>[ minus <excluded>].csv`` (with ``:`` replaced by
    ``_``) through ``outfile``. Rows are sorted by descending overall
    frequency, then by lexeme. Each row has the form::

        "lexeme",frequency,"lex_utf8","g_entry_heb","sp","gloss"

    The first 20 rows are also echoed to standard output.

    Args:
        passages: ``|``-separated passage specifications, e.g.
            ``'Genesis 3-4,10'`` or ``'Numeri 1-3:10-15|Ruth 4'``.
        excluded: Optional passage specifications of the same form whose
            lexemes are removed from the result. None excludes nothing.
    """
    lexemes = parse_passages(passages)
    if excluded is not None:
        lexemes -= parse_passages(excluded)

    fileid = '{}{}'.format(
        passages,
        '' if excluded is None else ' minus {}'.format(excluded),
    )
    filename = 'passage/{}.csv'.format(fileid.replace(':', '_'))
    limit = 20  # number of rows echoed to the notebook
    nlex = len(lexemes)

    of = outfile(filename)
    try:
        print('==== {} ==== showing {} of {} lexemes here ===='.format(
            fileid, min(nlex, limit), nlex,
        ))
        ordered = sorted(lexemes, key=lambda x: (-lex_count[x], x))
        for i, lx in enumerate(ordered):
            (l_utf8, l_vc, l_sp, l_gl) = lex_info[lx]
            # Every text field is enclosed in double quotes on both sides; the
            # frequency is the only unquoted column (same layout as
            # csv/all_lexemes.csv).
            line = '"{}",{},"{}","{}","{}","{}"\n'.format(
                lx, lex_count[lx], l_utf8, l_vc, l_sp, l_gl,
            )
            of.write(line)
            if i < limit:
                sys.stdout.write(line)
    finally:
        # Close the file even when writing a row fails.
        of.close()
    print('See {}\n'.format(my_file(filename)))

# Examples

Here are some examples of the flexibility with which you can call the ``lexbase`` function.

In [11]:
# All lexemes of Genesis 2; excluded=None (the default) leaves nothing out.
lexbase('Genesis 2', excluded=None)
# Lexemes of Genesis 2 that do not occur in Genesis 1.
lexbase('Genesis 2', excluded='Genesis 1')
# Chapter spans and lists can be combined: chapters 3, 4 and 10,
# minus the lexemes of chapters 1 and 2.
lexbase('Genesis 3-4,10', excluded='Genesis 1-2')
# Whole books: a book name without chapters selects every chapter.
lexbase('Exodus', excluded='Genesis')
# Several passages joined by '|', verse ranges after ':', and more than one
# excluded passage.
lexbase('Numeri 1-3:10-15|Judices 5:1,3,5,7,9|Ruth 4', excluded='Chronica_I|Chronica_II')

==== Genesis 2 ==== showing 20 of 131 lexemes here ====
"W",51004,ו","וְ","conj","and"
"H",30386,ה","הַ","art","the"
"L",20447,ל","לְ","prep","to"
"B",15767,ב","בְּ","prep","in"
">T",11017,את","אֵת","prep",""
"MN",7681,מן","מִן","prep","from"
"JHWH/",6828,יהוה/","יהוה","nmpr","YHWH"
"L",5521,אל","אֶל","prep","to"
">CR",5500,אשׁר","אֲשֶׁר","conj",""
"KL/",5495,כל/","כֹּל","subs","whole"
">MR[",5378,אמר[","אָמַר","verb","say"
"L>",5249,לא","לֹא","nega","not"
"KJ",4483,כי","כִּי","conj","that"
"HJH[",3561,היה[","הָיָה","verb","be"
"K",2965,כ","כְּ","prep","as"
"LHJM/",2601,אלהים/","אֱלֹהִים","subs","god(s)"
"BW>[",2570,בוא[","בֹּוא","verb","come"
">RY/",2504,ארץ/","אֶרֶץ","subs","earth"
See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Genesis 2.csv

==== Genesis 2 minus Genesis 1 ==== showing 20 of 88 lexemes here ====
"JHWH/",6828,יהוה/","יהוה","nmpr","YHWH"
"L>",5249,לא","לֹא","nega","not"
"BW>[",2570,בוא[","בֹּוא","verb","come"
">JC/",2186,אישׁ/","אִישׁ","subs","man"

# Standard lexeme files

Here we produce several lexeme files for books and chapters.

## Output kind

There are normal and incremental output files.
In a normal output file, you find all lexemes for the indicated chapters and verses.
In an *incremental* file, you find per indicated passage the lexemes that are new with respect to the previous passages (either the previous verses in the chapter, or the previous chapters in the book).

## Output files

* all_lexemes.csv contains a listing of all lexemes, ordered by frequency
* *book*.csv contains a listing of all lexemes in that book
* *book*_per_ch.csv contains a listing of all lexemes in that book, organized by chapter
* *book*_per_ch_inc.csv contains a listing of all lexemes in that book, organized by chapter, where each chapter lists only the lexemes that did not occur in previous chapters of that book
* *book*_per_vs.csv contains a listing of all lexemes in that book, organized by chapter and then by verse
* *book*_per_vs_inc.csv contains a listing of all lexemes in that book, organized by chapter and then by verse, where each verse lists only the lexemes that did not occur in previous verses of that same chapter

## Output location

You can download the files as they have been generated by my LAF-Fabric installation via my SURFdrive:
[version 4](https://surfdrive.surf.nl/files/public.php?service=files&t=dca1e8094a9b3c4c79f07d17306d12bd) 
[version 4b](https://surfdrive.surf.nl/files/public.php?service=files&t=faf643c647abb6bfd052f6c2898efae5)

In [5]:
# Write every lexeme to csv/all_lexemes.csv, most frequent first. Lexemes with
# equal frequency stay in the order in which they were first counted.
# Row layout: "lexeme",frequency,"lex_utf8","g_entry_heb","sp","gloss"
outf = outfile("csv/all_lexemes.csv")
for lexeme, freq in lex_count.most_common():
    utf8_form, vocalized, part_of_speech, gloss = lex_info[lexeme]
    row = '"{}",{},"{}","{}","{}","{}"\n'.format(
        lexeme, freq, utf8_form, vocalized, part_of_speech, gloss,
    )
    outf.write(row)
outf.close()

In [6]:
# For every book write five CSV files:
#   <book>.csv             all lexemes of the book
#   <book>_per_ch.csv      all lexemes per chapter
#   <book>_per_ch_inc.csv  per chapter, only lexemes not seen in earlier
#                          chapters of the book
#   <book>_per_vs.csv      all lexemes per verse
#   <book>_per_vs_inc.csv  per verse, only lexemes not seen in earlier verses
#                          of the same chapter
for bk in sorted(lex_section):
    outfb = outfile("csv/{}.csv".format(bk))
    outfc = outfile("csv/{}_per_ch.csv".format(bk))
    outfci = outfile("csv/{}_per_ch_inc.csv".format(bk))
    outfv = outfile("csv/{}_per_vs.csv".format(bk))
    outfvi = outfile("csv/{}_per_vs_inc.csv".format(bk))
    try:
        bk_lex = set()  # lexemes seen in the chapters handled so far
        # Chapter and verse labels are sorted by their numeric value.
        for ch in sorted(lex_section[bk], key=int):
            ch_lex = set()  # lexemes seen in the verses handled so far
            for vs in sorted(lex_section[bk][ch], key=int):
                for lexeme in sorted(lex_section[bk][ch][vs]):
                    (l_utf8, l_vc, l_sp, l_gl) = lex_info[lexeme]
                    freq = lex_count[lexeme]
                    line = '"{}",{},{},"{}",{},"{}","{}","{}","{}"\n'.format(
                        bk, ch, vs, lexeme, freq, l_utf8, l_vc, l_sp, l_gl,
                    )
                    outfv.write(line)
                    if lexeme not in ch_lex:
                        ch_lex.add(lexeme)
                        outfvi.write(line)
            for lexeme in sorted(ch_lex):
                (l_utf8, l_vc, l_sp, l_gl) = lex_info[lexeme]
                freq = lex_count[lexeme]
                line = '"{}",{},"{}",{},"{}","{}","{}","{}"\n'.format(
                    bk, ch, lexeme, freq, l_utf8, l_vc, l_sp, l_gl,
                )
                outfc.write(line)
                # bk_lex is extended only here. Adding lexemes to it while the
                # verses are processed would make this test always false and
                # leave the incremental chapter file empty.
                if lexeme not in bk_lex:
                    bk_lex.add(lexeme)
                    outfci.write(line)
        for lexeme in sorted(bk_lex):
            (l_utf8, l_vc, l_sp, l_gl) = lex_info[lexeme]
            freq = lex_count[lexeme]
            line = '"{}","{}",{},"{}","{}","{}","{}"\n'.format(
                bk, lexeme, freq, l_utf8, l_vc, l_sp, l_gl,
            )
            outfb.write(line)
    finally:
        # Close all five files even when writing one of them fails.
        outfb.close()
        outfc.close()
        outfci.close()
        outfv.close()
        outfvi.close()