In [1]:
import sys, os
import collections
import subprocess
import time

from copy import deepcopy
from lxml import etree

import laf
from laf.fabric import LafFabric
from etcbc.preprocess import prepare
from etcbc.mql import MQL
# Create the LAF-Fabric processor; the next cell calls fabric.load(...) on it.
fabric = LafFabric()

 0.00s This is LAF-Fabric 4.8.2
API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
Feature doc: https://shebanq.ancient-data.org/static/docs/featuredoc/texts/welcome.html



In [2]:
# Load the 'etcbc4b' data with the node features (first string) and the
# edge feature (second string) requested below.
API = fabric.load('etcbc4b', '--', 'mql', {
 "xmlids": {"node": False, "edge": False},
 "features": ('''
 oid otype monads
 g_word g_lex g_word_utf8 g_cons lex 
 typ code function rela det
 txt prs sp
 book chapter verse label
 ''','''
 functional_parent
 '''),
 "prepare": prepare,
}, verbose='DETAIL')
# NOTE(review): this exec presumably binds the API shortcuts (F, L, NN, ...) as
# notebook globals; later cells use those names without defining them -- confirm
# against the LAF-Fabric API reference.
exec(fabric.localnames.format(var='fabric'))
# MQL query helper built on the loaded API (not used in the cells shown here).
Q = MQL(API)

 0.00s LOADING API: please wait ... 
 0.00s DETAIL: COMPILING m: etcbc4b: UP TO DATE
 0.00s USING main: etcbc4b DATA COMPILED AT: 2015-11-02T15-08-56
 0.01s DETAIL: load main: G.node_anchor_min
 0.08s DETAIL: load main: G.node_anchor_max
 0.15s DETAIL: load main: G.node_sort
 0.20s DETAIL: load main: G.node_sort_inv
 0.69s DETAIL: load main: G.edges_from
 0.76s DETAIL: load main: G.edges_to
 0.84s DETAIL: load main: F.etcbc4_db_monads [node] 
 1.70s DETAIL: load main: F.etcbc4_db_oid [node] 
 3.02s DETAIL: load main: F.etcbc4_db_otype [node] 
 3.76s DETAIL: load main: F.etcbc4_ft_code [node] 
 3.81s DETAIL: load main: F.etcbc4_ft_det [node] 
 4.04s DETAIL: load main: F.etcbc4_ft_function [node] 
 4.17s DETAIL: load main: F.etcbc4_ft_g_cons [node] 
 4.37s DETAIL: load main: F.etcbc4_ft_g_lex [node] 
 4.59s DETAIL: load main: F.etcbc4_ft_g_word [node] 
 4.82s DETAIL: load main: F.etcbc4_ft_g_word_utf8 [node] 
 5.12s DETAIL: load main: F.etcbc4_ft_lex [node] 
 5.32s DETAIL: load main: F.e

In [3]:
def text_per_verse():
    '''
    Walk all nodes, and for every verse node collect its lexeme text and count letters.

    Returns the tuple (verse_dict, letter_count_dict).
    verse_dict is dict, keys are verse names (book_chapter_verse), values are the
    lexemes of the words in the verse with the characters ' ', '/', '=', '[' and '_'
    removed; every lexeme is followed by one space (so the text ends in a space).
    letter_count_dict is defaultdict(int), keys are the retained letters, values are
    their counts over all verses that were visited.
    '''
    dropped_chars = (' ', '/', '=', '[', '_')
    verse_dict = {}
    letter_count_dict = collections.defaultdict(int)

    verse_nodes = (candidate for candidate in NN() if F.otype.v(candidate) == 'verse')
    for verse_node in verse_nodes:
        book_name = F.book.v(L.u('book', verse_node))
        chapter_name = F.chapter.v(L.u('chapter', verse_node))
        verse_name = book_name + '_' + chapter_name + '_' + F.verse.v(verse_node)

        chunks = []
        for word_node in L.d('word', verse_node):
            kept = [char for char in F.lex.v(word_node) if char not in dropped_chars]
            for char in kept:
                letter_count_dict[char] += 1
            chunks.append(''.join(kept) + ' ')
        verse_dict[verse_name] = ''.join(chunks)

    return (verse_dict, letter_count_dict)

In [4]:
def make_short_str(lexem, letter_count_dict):
    '''
    Reduce a lexeme to at most two characters, keeping its rarest letters.

    lexem is string, the (already cleaned) lexeme.
    letter_count_dict is dict, keys are letters, values are their global counts;
    every letter of lexem is looked up in it.

    short_word is string: lexem itself when it has two characters or fewer,
    otherwise the first two occurrences, in word order, of the two distinct
    letters of lexem with the lowest counts. Ties are broken by first appearance
    in lexem. If lexem consists of one repeated letter, that letter is returned
    twice (previously this case raised ValueError).
    '''
    if len(lexem) <= 2:
        return lexem

    # Distinct letters of the word, in order of first appearance, with their counts.
    word_lett_dict = {}
    for lett in lexem:
        word_lett_dict[lett] = letter_count_dict[lett]

    # sorted() is stable, so this picks the same two letters as taking min(),
    # removing it and taking min() again -- but it also works when the word has
    # only one distinct letter (the slice is then one element long).
    low_list = sorted(word_lett_dict, key=word_lett_dict.get)[:2]

    short_word = ''
    for lett in lexem:
        if lett in low_list:
            short_word += lett
            if len(short_word) == 2:
                break

    return short_word
 

In [5]:
def make_num_dict(letter_count_dict):
    '''
    Assign a numerical code to every one-letter and two-letter string.

    letter_count_dict is dict, only its keys (the letters) are used.
    number_dict is dict, key is a one or two letter string (for n letters there are
    n**2 + n keys, e.g. 23**2 + 23 for 23 letters), value is the numerical code of
    that string. Codes are consecutive integers starting at 100, handed out letter
    by letter: first the single letter, then all its pairs with every letter as
    second element.
    '''
    first_code = 100
    alphabet = list(letter_count_dict.keys())
    codes_per_letter = len(alphabet) + 1  # the letter itself plus one pair per letter

    number_dict = {}
    for position, first in enumerate(alphabet):
        letter_code = first_code + position * codes_per_letter
        number_dict[first] = letter_code
        for offset, second in enumerate(alphabet, start=1):
            number_dict[first + second] = letter_code + offset

    return number_dict

In [6]:
def make_skip_grams(number_dict, letter_count_dict=None):
    '''
    Build skip-grams over a sliding window of five consecutive words.

    number_dict is dict, key is reduced word (one or two letter string), value is
    its numerical code (see make_num_dict).
    letter_count_dict is dict, keys are letters, values are counts; it is passed to
    make_short_str. When omitted, the notebook-level global B is used, which is
    what this function always did before the parameter existed.

    five_sam is list of reduced words (the current window).
    sam_order is int, it is place of last word of skipgram, counted over all word
    nodes in the order NN() yields them.
    skips_and_info is dict, key is place (int), value is tuple
    (book, chapter, verse) of the last word of the window.
    sam_dict is dict, first key is numerical code of first word in the window,
    second key is string: the concatenated codes of three of the four following
    words (all four choices are stored), value is list of places of that skip-gram,
    place is place of last word in the window.

    Returns the tuple (skips_and_info, sam_dict).
    '''
    if letter_count_dict is None:
        # Backward compatible fallback on the global the original code read.
        letter_count_dict = B

    dropped_chars = (' ', '/', '=', '[', '_')
    # Positions, within the window, of the three words that follow the first one.
    tail_choices = ((1, 2, 3), (1, 2, 4), (1, 3, 4), (2, 3, 4))

    sam_dict = collections.defaultdict(lambda: collections.defaultdict(list))
    skips_and_info = {}
    five_sam = []
    sam_order = 0

    for node in NN():
        if F.otype.v(node) != 'word':
            continue
        sam_order += 1
        info = (
            F.book.v(L.u('book', node)),
            F.chapter.v(L.u('chapter', node)),
            F.verse.v(L.u('verse', node)),
        )
        redu_lex = ''.join(lett for lett in F.lex.v(node) if lett not in dropped_chars)
        five_sam.append(make_short_str(redu_lex, letter_count_dict))

        if len(five_sam) == 5:
            numb = [str(number_dict[word]) for word in five_sam]
            first_code = number_dict[five_sam[0]]
            for i, j, k in tail_choices:
                sam_dict[first_code][numb[i] + numb[j] + numb[k]].append(sam_order)
            skips_and_info[sam_order] = info
            # Slide the window: drop the oldest word so the next one completes it.
            del five_sam[0]

    return (skips_and_info, sam_dict)

In [7]:
def make_grambi_dict(sam_dict):
    '''
    Invert the skip-gram index: for every place, list the places it shares a skip-gram with.

    sam_dict is dict of dicts, the innermost values are lists of places (int) that
    share one skip-gram (see make_skip_grams).

    grambi_dict is defaultdict(list), key is place (int), value is list of places
    sharing at least one skip-gram with place, without duplicates and in order of
    first encounter. The place itself is part of its own list.
    grambi_list is sorted list of all places (int).

    Returns the tuple (grambi_dict, grambi_list).
    '''
    grambi_dict = collections.defaultdict(list)
    # Same content as grambi_dict but as sets: makes the "already listed?" check
    # constant time instead of a scan of the (possibly long) list.
    already_listed = collections.defaultdict(set)
    grambi_set = set()

    for by_tail in sam_dict.values():
        for places in by_tail.values():
            grambi_set.update(places)
            for place in places:
                partners = grambi_dict[place]
                known = already_listed[place]
                for other_item in places:
                    if other_item not in known:
                        known.add(other_item)
                        partners.append(other_item)

    grambi_list = sorted(grambi_set)
    return (grambi_dict, grambi_list)

In [8]:
def find_clusters(grambi_dict, grambi_list, chap_dict, min_distance=50, min_cluster_size=40):
    '''
    Find runs of consecutive places whose skip-grams recur, in step, at another location.

    Called in this notebook as find_clusters(HK, KL, FG).

    grambi_dict is dict, key is place (int), value is list of places sharing a
    skip-gram with it (see make_grambi_dict). It is not modified.
    grambi_list is list of places (int), processed in the given order.
    chap_dict is dict, key is place, value is (book, chapter, verse) of that place;
    the first two elements identify the chapter.
    min_distance is int, a cluster is only started between two places that lie
    more than this far apart (default 50, the original hard-coded value).
    min_cluster_size is int, minimal length of the member list of a cluster to be
    kept. The list holds two entries per matched place pair, so the default 40
    (the original "> 39") means 20 matched pairs.

    While no cluster is open, a place gram starts a cluster (gram, item) with every
    partner item that is far enough away, lies in a different chapter, and whose
    verse pair is not the starting verse pair of a cluster that was already kept.
    While clusters are open, no new ones are started: each open cluster is extended
    when the current place shares a skip-gram with the place right after the
    cluster's last partner place, and is closed otherwise. Clusters that are still
    open when grambi_list is exhausted are closed as well (they used to be
    silently discarded).

    finished_clusters is defaultdict(list), key is (gram, item) of the start of the
    cluster, value is sorted list of all places in the cluster. The verse info of
    the start of every kept cluster is printed.
    '''
    finished_clusters = collections.defaultdict(list)
    cluster_dict = collections.defaultdict(list)
    # Unordered verse pairs at which a kept cluster started.
    kept_starts = set()

    def _close(pair):
        # Remove the cluster from the open ones; keep and report it if long enough.
        members = cluster_dict.pop(pair)
        if len(members) >= min_cluster_size:
            start, partner = pair
            finished_clusters[pair] = sorted(members)
            kept_starts.add(frozenset((chap_dict[start], chap_dict[partner])))
            print(chap_dict[start], chap_dict[partner])

    for gram in grambi_list:
        if len(cluster_dict) == 0:
            partners = grambi_dict[gram]
            if len(partners) > 1:
                for item in partners:
                    if item == gram or abs(item - gram) <= min_distance:
                        continue
                    same_chapter = (
                        (chap_dict[gram][0], chap_dict[gram][1])
                        == (chap_dict[item][0], chap_dict[item][1])
                    )
                    if same_chapter:
                        continue
                    if frozenset((chap_dict[gram], chap_dict[item])) in kept_starts:
                        continue
                    cluster_dict[(gram, item)].extend((gram, item))
        else:
            # Snapshot the keys: clusters are removed while we walk over them.
            for pair in list(cluster_dict):
                members = cluster_dict[pair]
                # members alternates place, partner place; the last entry is the
                # most recent partner place. .get() avoids inserting empty
                # entries into a defaultdict passed in by the caller.
                if gram in grambi_dict.get(members[-1] + 1, ()):
                    members.append(gram)
                    members.append(members[-2] + 1)
                else:
                    _close(pair)

    # Input exhausted: whatever is still open cannot be extended any further.
    for pair in list(cluster_dict):
        _close(pair)

    return finished_clusters

In [11]:
# Run the whole pipeline once and report the wall-clock time it took.
start_time = time.time()
# A: verse name -> verse text; B: letter -> count.
# NOTE: make_skip_grams reads the global name B, so B must keep this name.
A, B = text_per_verse()
# D: one/two letter string -> numerical code.
D = make_num_dict(B)
# FG: place -> (book, chapter, verse); GH: nested skip-gram index (sam_dict).
FG, GH = make_skip_grams(D)
# HK: place -> places sharing a skip-gram; KL: sorted list of all places.
HK, KL = make_grambi_dict(GH)
# LM: (start place, partner place) -> sorted places of the cluster.
LM = find_clusters(HK, KL, FG)
print("--- %s seconds ---" % (time.time() - start_time))

('Genesis', '10', '2') ('Chronica_I', '1', '5')
('Genesis', '10', '7') ('Chronica_I', '1', '9')
('Genesis', '10', '13') ('Chronica_I', '1', '11')
('Genesis', '10', '24') ('Chronica_I', '1', '18')
('Genesis', '36', '32') ('Chronica_I', '1', '43')
('Genesis', '36', '40') ('Chronica_I', '1', '51')
('Exodus', '20', '2') ('Deuteronomium', '5', '6')
('Exodus', '20', '4') ('Deuteronomium', '5', '8')
('Exodus', '20', '5') ('Deuteronomium', '5', '9')
('Exodus', '20', '9') ('Deuteronomium', '5', '13')
('Exodus', '25', '4') ('Exodus', '35', '6')
('Exodus', '25', '12') ('Exodus', '37', '3')
('Exodus', '25', '17') ('Exodus', '37', '6')
('Exodus', '25', '19') ('Exodus', '37', '8')
('Exodus', '25', '20') ('Exodus', '37', '9')
('Exodus', '25', '23') ('Exodus', '37', '10')
('Exodus', '25', '26') ('Exodus', '37', '13')
('Exodus', '25', '31') ('Exodus', '37', '17')
('Exodus', '25', '33') ('Exodus', '37', '19')
('Exodus', '26', '1') ('Exodus', '36', '8')
('Exodus', '26', '4') ('Exodus', '36', '11')
('Exod

In [12]:
# Number of clusters returned by find_clusters.
len(LM)

281