# Combining SP and MT datasets 

This notebook shows how the SP dataset can be combined with the BHSA dataset, which contains the Masoretic Text (MT). [The first example](#section1) compares the text of all the verses in Genesis; [the second example](#section2) explores spelling variation of proper nouns between SP and MT.

In [10]:
from tf.app import use

# Load the SP dataset and bind its node feature API (F), locality API (L)
# and text API (T) to SP-specific names, so that loading the MT below
# cannot overwrite them.
SP = use('DT-UCPH/sp', version='3.3')
Fsp, Lsp, Tsp = SP.api.F, SP.api.L, SP.api.T

# Do the same for the MT dataset (BHSA), under MT-specific names.
MT = use('etcbc/bhsa', version='2021')
Fmt, Lmt, Tmt = MT.api.F, MT.api.L, MT.api.T

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,5,79878.2,100
chapter,187,2135.78,100
verse,5841,68.38,100
word,114890,3.48,100
sign,399391,1.0,100


**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


<a id='section1'></a> 
## 1. Comparison of texts of SP and MT

In [12]:
# Books that are included in the comparisons below. Only Genesis is enabled;
# add the other book names to the list to find textual variation in those books too.
PENTATEUCH = ['Genesis'] #, 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']

In [13]:
def reconstruct_pentateuchal_verses(F, L, T, text_feature, books=None):
    """Reconstruct the text of every verse in the selected books of a dataset.

    The words of a verse are concatenated in order. A word with an empty text
    is skipped, a word with a non-empty trailer is followed by a single space,
    and a word without trailer is joined directly to the next word. Leading and
    trailing whitespace of the verse text is stripped.

    Input:
    F: node feature API of the dataset (otype, trailer and the text feature are read from it).
    L: locality API of the dataset (used to get the word nodes of a verse).
    T: text API of the dataset (used to get the section label of a verse).
    text_feature: str   Name of the word feature that holds the word text, e.g. 'g_cons'.
    books: iterable of str, optional   Book names to include. Defaults to PENTATEUCH.
    Output:
    verse_texts: dict   Keys are verse labels (tuple with book, chapter, verse), values are reconstructed text (str).
    """
    selected_books = frozenset(PENTATEUCH if books is None else books)

    # Look the feature up by name with getattr instead of evaluating a code
    # string, and do it once rather than once per word.
    word_text_of = getattr(F, text_feature).v
    trailer_of = F.trailer.v

    verse_texts = {}
    for verse_node in F.otype.s('verse'):
        bo, ch, ve = T.sectionFromNode(verse_node)
        if bo not in selected_books:
            continue

        pieces = []
        for word_node in L.d(verse_node, 'word'):
            word_text = word_text_of(word_node)
            if not word_text:
                # Words without text contribute nothing, not even a space.
                continue
            pieces.append(word_text)
            if trailer_of(word_node):
                # Any non-empty trailer is normalised to a single space.
                pieces.append(' ')

        verse_texts[(bo, ch, ve)] = ''.join(pieces).strip()
    return verse_texts
    
# Reconstruct the verse texts of both datasets from the 'g_cons' word feature.
sp_verses = reconstruct_pentateuchal_verses(Fsp, Lsp, Tsp, 'g_cons')
mt_verses = reconstruct_pentateuchal_verses(Fmt, Lmt, Tmt, 'g_cons')

In [14]:
# Walk through the MT verses and print each one whose text is not identical
# to the SP text; a verse that is absent from SP is compared with ''.
for label, mt_verse_text in mt_verses.items():
    sp_verse_text = sp_verses.get(label, '')
    if mt_verse_text == sp_verse_text:
        continue  # identical verses are not reported

    print(label)
    print('SP:', sp_verse_text)
    print('MT:', mt_verse_text)
    print()

('Genesis', 1, 11)
SP: WJ>MR >LHJM TDC> H>RY DC> <FB MZRJ< ZR< W<Y PRJ <FH PRJ LMJNW >CR ZR<W BW <L H>RY WJHJ KN
MT: WJ>MR >LHJM TDC> H>RY DC> <FB MZRJ< ZR< <Y PRJ <FH PRJ LMJNW >CR ZR<W BW <L H>RY WJHJ KN

('Genesis', 1, 14)
SP: WJ>MR >LHJM JHJ M>WRWT BRQJ< HCMJM LH>JR <L H>RY WLHBDJL BJN HJWM WBJN HLJLH WHJW L>TWT WLMW<DJM WLJMJM WCNJM
MT: WJ>MR >LHJM JHJ M>RT BRQJ< HCMJM LHBDJL BJN HJWM WBJN HLJLH WHJW L>TT WLMW<DJM WLJMJM WCNJM

('Genesis', 1, 15)
SP: WHJW LM>WRWT BRQJ< HCMJM LH>JR <L H>RY WJHJ KN
MT: WHJW LM>WRT BRQJ< HCMJM LH>JR <L H>RY WJHJ KN

('Genesis', 1, 16)
SP: WJ<F >LHJM >T CNJ HM>WRWT HGDLJM >T HM>WR HGDWL LMMCLT HJWM W>T HM>WR HQVN LMMCLT HLJLH W>T HKWKBJM
MT: WJ<F >LHJM >T CNJ HM>RT HGDLJM >T HM>WR HGDL LMMCLT HJWM W>T HM>WR HQVN LMMCLT HLJLH W>T HKWKBJM

('Genesis', 1, 20)
SP: WJ>MR >LHJM JCRYW HMJM CRY NPC XJH W<WP J<PP <L H>RY <L PNJ RQJ< HCMJM
MT: WJ>MR >LHJM JCRYW HMJM CRY NPC XJH W<WP J<WPP <L H>RY <L PNJ RQJ< HCMJM

('Genesis', 1, 21)
SP: WJBR> >LHJM >T HTNJNJM 

SP: WJJVBW DBRJHM B<JNJ XMWR WB<JNJ CKM BNW
MT: WJJVBW DBRJHM B<JNJ XMWR WB<JNJ CKM BN XMWR

('Genesis', 34, 21)
SP: H>NCJM H>LH CLMJM HM >TNW JCBW B>RY WJSXRW >TH WH>RY HNH RXBWT JDJM LPNJHM >T BNWTM NQX LNW LNCJM W>T BNTJNW NTN LHM
MT: H>NCJM H>LH CLMJM HM >TNW WJCBW B>RY WJSXRW >TH WH>RY HNH RXBT JDJM LPNJHM >T BNTM NQX LNW LNCJM W>T BNTJNW NTN LHM

('Genesis', 34, 22)
SP: >K BZ>T J>WTW LNW H>NCJM LCBT >TNW LHJWT L<M >XD BHMWL LNW KL ZKR K>CR HM NMLJM
MT: >K BZ>T J>TW LNW H>NCJM LCBT >TNW LHJWT L<M >XD BHMWL LNW KL ZKR K>CR HM NMLJM

('Genesis', 34, 23)
SP: MQNJHM WQNJNM WKL BHMTM HLW> LNW HM >K N>WT LHM WJCBW >TNW
MT: MQNHM WQNJNM WKL BHMTM HLW> LNW HM >K N>WTH LHM WJCBW >TNW

('Genesis', 34, 27)
SP: WBNJ J<QB B>W <L HXLLJM WJBZW H<JR >CR VM>W >XWTM
MT: BNJ J<QB B>W <L HXLLJM WJBZW H<JR >CR VM>W >XWTM

('Genesis', 34, 29)
SP: W>T KL XJLM W>T VPM W>T NCJHM CBW WJBZW >T KL >CR BBJT
MT: W>T KL XJLM W>T KL VPM W>T NCJHM CBW WJBZW W>T KL >CR BBJT

('Genesis', 34, 30)
SP: WJ>MR J<QB >L C

## Compare texts with minimum Levenshtein distance

In [5]:
from Levenshtein import distance

In [6]:
# Only verse pairs whose Levenshtein distance is larger than this are printed.
threshold = 10

for label, mt_verse_text in mt_verses.items():
    sp_verse_text = sp_verses.get(label, '')
    if distance(mt_verse_text, sp_verse_text) <= threshold:
        continue  # small differences are not reported

    print(label)
    print('SP:', sp_verse_text)
    print('MT:', mt_verse_text)
    print()

('Genesis', 1, 14)
SP: WJ>MR >LHJM JHJ M>WRWT BRQJ< HCMJM LH>JR <L H>RY WLHBDJL BJN HJWM WBJN HLJLH WHJW L>TWT WLMW<DJM WLJMJM WCNJM
MT: WJ>MR >LHJM JHJ M>RT BRQJ< HCMJM LHBDJL BJN HJWM WBJN HLJLH WHJW L>TT WLMW<DJM WLJMJM WCNJM

('Genesis', 5, 19)
SP: WJXJ JRD >XRJ HWLJDW >T XNWK XMC WCMNJM CNH WCB< M>WT CNH WJWLJD BNJM WBNWT
MT: WJXJ JRD >XRJ HWLJDW >T XNWK CMNH M>WT CNH WJWLD BNJM WBNWT

('Genesis', 5, 20)
SP: WJHJW KL JMJ JRD CB< W>RB<JM CNH WCMNH M>WT CNH WJMT
MT: WJHJW KL JMJ JRD CTJM WCCJM CNH WTC< M>WT CNH WJMT

('Genesis', 5, 25)
SP: WJXJ MTWCLX CB< WCCJM CNH WJWLJD >T LMK
MT: WJXJ MTWCLX CB< WCMNJM CNH WM>T CNH WJWLD >T LMK

('Genesis', 5, 28)
SP: WJXJ LMK CLC WXMCJM CNH WJWLJD BN
MT: WJXJ LMK CTJM WCMNJM CNH WM>T CNH WJWLD BN

('Genesis', 5, 30)
SP: WJXJ LMK >XRJ HWLJDW >T NX CC M>WT CNH WJWLJD BNJM WBNWT
MT: WJXJ LMK >XRJ HWLJDW >T NX XMC WTC<JM CNH WXMC M>T CNH WJWLD BNJM WBNWT

('Genesis', 6, 20)
SP: WHJH MN H<WP LMJNHW WMN HBHMH LMJNH WMKL >CR RMF <L H>DMH LMJNJHM CNJM M

<a id='section2'></a>
## 2. Comparison of spelling of proper nouns between SP and MT

In [7]:
import collections

In [8]:
def collect_proper_noun_spellings(F, L, T, books=None):
    """Collect the different spellings of proper nouns in a dataset.

    A word counts as a proper noun when its 'sp' feature equals 'nmpr'. Its
    spelling is the value of the 'g_cons' feature, grouped under the value of
    the 'lex' feature.

    Input:
    F: node feature API of the dataset (otype, sp, lex and g_cons are read from it).
    L: locality API of the dataset; not used by this function, kept so existing calls keep working.
    T: text API of the dataset (used to get the book of a word).
    books: iterable of str, optional   Book names to include. Defaults to PENTATEUCH.
    Output:
    proper_nouns_spellings: dict  Keys are lexemes of proper nouns, values are sets with all spellings of the lexeme.
    """
    selected_books = frozenset(PENTATEUCH if books is None else books)

    proper_nouns_spellings = collections.defaultdict(set)
    for w in F.otype.s('word'):
        # Check the part of speech first, so that the section lookup is only
        # done for proper nouns instead of for every word.
        if F.sp.v(w) != 'nmpr':
            continue
        bo, _, _ = T.sectionFromNode(w)
        if bo in selected_books:
            proper_nouns_spellings[F.lex.v(w)].add(F.g_cons.v(w))

    return proper_nouns_spellings
        
# Collect the proper noun spellings per lexeme for both datasets.
sp_spellings = collect_proper_noun_spellings(Fsp, Lsp, Tsp) 
mt_spellings = collect_proper_noun_spellings(Fmt, Lmt, Tmt)

In [9]:
# Walk through the MT lexemes and print each one whose set of spellings is
# not identical to the SP set; a lexeme absent from SP gets an empty set.
for lex, mt_spelling_set in mt_spellings.items():
    sp_spelling_set = sp_spellings.get(lex, set())
    if mt_spelling_set == sp_spelling_set:
        continue  # identical spelling sets are not reported

    print(lex)
    print('MT:', mt_spelling_set)
    print('SP:', sp_spelling_set)
    print()
        

XDQL/
MT: {'XDQL'}
SP: {'HDQL'}

NWD/
MT: {'NWD'}
SP: {'ND'}

XNWK/
MT: {'XNWK', 'XNK'}
SP: {'XNWK'}

MXWJ>L/
MT: {'MXWJ>L'}
SP: {'MJH>L'}

MXJJ>L/
MT: {'MXJJ>L'}
SP: set()

>RRV/
MT: {'>RRV'}
SP: {'HRRV'}

TBL==/
MT: {'TBL'}
SP: {'TWBL'}

MCK=/
MT: {'MCK'}
SP: {'MWCK'}

>LJCH/
MT: {'>LJCH'}
SP: {'>LJC'}

SB>==/
MT: {'SB>'}
SP: {'SBH'}

SBTK>/
MT: {'SBTK>'}
SP: {'SBTKH'}

NMRD/
MT: {'NMRD'}
SP: {'NMRWD'}

RXBT_<JR/
MT: {'RXBT_<JR'}
SP: {'RXBWT_<JR'}

LWDJM/
MT: {'LWDJM'}
SP: {'LDJM'}

<NMJM/
MT: {'<NMJM'}
SP: {'<JNMJM'}

YJDWN/
MT: {'YJDN'}
SP: {'YJDWN', 'YJDN'}

<ZH/
MT: {'<ZH'}
SP: set()

YB>JM/
MT: {'YBJM', 'YBJJM'}
SP: {'YBW>JM'}

LC</
MT: {'LC<'}
SP: set()

LWD/
MT: {'LWD'}
SP: {'LD'}

XWL==/
MT: {'XWL'}
SP: {'XWJL'}

MC/
MT: {'MC'}
SP: {'MC>'}

HDWRM/
MT: {'HDWRM'}
SP: {'>DWRM'}

>WZL/
MT: {'>WZL'}
SP: {'<JZL'}

<WBL/
MT: {'<WBL'}
SP: {'<JBL'}

>WPJR/
MT: {'>WPR'}
SP: {'>PR'}

MWRH===/
MT: {'MWRH'}
SP: {'MWR>'}

Y<R/
MT: {'YW<R', 'Y<R', 'Y<RH'}
SP: {'Y<R', 'Y<RH'}

KDRL<MR/
MT: {