In [20]:
import collections
import json
from collatex import collate
from tf.app import use

In [2]:
BASE = "~/github/among/fusus"
VERSION = "0.7"

In [3]:
LK = "LK"
AF = "AF"

EDITIONS = {
    LK: "Lakhnawi",
    AF: "Afifi",
}

A = {}
F = {}
maxSlot = {}

In [4]:
for (acro, name) in EDITIONS.items():
    A[acro] = use(f"among/fusus/tf/{name}:clone", writing="ara", version=VERSION)
    F[acro] = A[acro].api.F
    maxSlot[acro] = F[acro].otype.maxSlot
maxSlot

This is Text-Fabric 9.1.3
Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html

27 features found and 0 ignored


This is Text-Fabric 9.1.3
Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html

17 features found and 0 ignored


{'LK': 40379, 'AF': 40271}

In [5]:
getTextLK = F[LK].lettersn.v
getTextAF = F[AF].lettersn.v
maxLK = maxSlot[LK]
maxAF = maxSlot[AF]

# Exploring

First a small example.

In [6]:
tokensLK = [dict(t=f"{getTextLK(slot)} ", s=slot) for slot in range(1, 10)]
tokensAF = [dict(t=f"{getTextAF(slot)} ", s=slot) for slot in range(1, 10)]

data = dict(
    witnesses=[
        dict(id=LK, tokens=tokensLK),
        dict(id=AF, tokens=tokensAF),
    ],
)

In [7]:
A[LK].indent(reset=True)
A[LK].info("Run collatex")
result = collate(data, output="json", segmentation=False, near_match=True)
resultAscii = collate(data, output="table", segmentation=False, near_match=True)
A[LK].info("Done")

  0.00s Run collatex
  0.01s Done


In [8]:
print(resultAscii)

+----+-----------+-------+-------+-----+------+-------+-----+------+-------+--------+--------+
| LK | -         | -     | ālḥmd | llh | mnzl | ālḥkm | ʿlá | ḳlwb | ālklm | bāḥdyŧ | ālṭryḳ |
| AF | bnzlylālʿ | ylrʿā | ālḥmd | lh  | mnzl | ālḥk  | ʿlá | ḳlwb | ālklm | -      | -      |
+----+-----------+-------+-------+-----+------+-------+-----+------+-------+--------+--------+


In [9]:
output = json.loads(result)["table"]
outputLK = output[0]
outputAF = output[1]

print(output[0])
print("=========")
print(output[1])

[None, None, [{'_sigil': 'LK', '_token_array_position': 0, 's': 1, 't': 'ālḥmd '}], [{'_sigil': 'LK', '_token_array_position': 1, 's': 2, 't': 'llh '}], [{'_sigil': 'LK', '_token_array_position': 2, 's': 3, 't': 'mnzl '}], [{'_sigil': 'LK', '_token_array_position': 3, 's': 4, 't': 'ālḥkm '}], [{'_sigil': 'LK', '_token_array_position': 4, 's': 5, 't': 'ʿlá '}], [{'_sigil': 'LK', '_token_array_position': 5, 's': 6, 't': 'ḳlwb '}], [{'_sigil': 'LK', '_token_array_position': 6, 's': 7, 't': 'ālklm '}], [{'_sigil': 'LK', '_token_array_position': 7, 's': 8, 't': 'bāḥdyŧ '}], [{'_sigil': 'LK', '_token_array_position': 8, 's': 9, 't': 'ālṭryḳ '}]]
[[{'_sigil': 'AF', '_token_array_position': 10, 's': 1, 't': 'bnzlylālʿ '}], [{'_sigil': 'AF', '_token_array_position': 11, 's': 2, 't': 'ylrʿā '}], [{'_sigil': 'AF', '_token_array_position': 12, 's': 3, 't': 'ālḥmd '}], [{'_sigil': 'AF', '_token_array_position': 13, 's': 4, 't': 'lh '}], [{'_sigil': 'AF', '_token_array_position': 14, 's': 5, 't': 'm

# Postprocessing

We need to turn the output into a clean alignment list.

In [10]:
def makeAlignment(result):
    output = json.loads(result)["table"]
    outputLK = output[0]
    outputAF = output[1]
    
    alignment = []
    for (chunkLK, chunkAF) in zip(outputLK, outputAF):
        if chunkLK is None:
            iLK = ""
            textLK = ""
        else:
            iLK = chunkLK[0]["s"]
            textLK = chunkLK[0]["t"]
        if chunkAF is None:
            iAF = ""
            textAF = ""
        else:
            iAF = chunkAF[0]["s"]
            textAF = chunkAF[0]["t"]
        alignment.append((iLK, textLK, textAF, iAF))
        
    return alignment

In [11]:
alignment = makeAlignment(result)
alignment

[('', '', 'bnzlylālʿ ', 1),
 ('', '', 'ylrʿā ', 2),
 (1, 'ālḥmd ', 'ālḥmd ', 3),
 (2, 'llh ', 'lh ', 4),
 (3, 'mnzl ', 'mnzl ', 5),
 (4, 'ālḥkm ', 'ālḥk ', 6),
 (5, 'ʿlá ', 'ʿlá ', 7),
 (6, 'ḳlwb ', 'ḳlwb ', 8),
 (7, 'ālklm ', 'ālklm ', 9),
 (8, 'bāḥdyŧ ', '', ''),
 (9, 'ālṭryḳ ', '', '')]

# Bigger sizes

How is the performance?

In [35]:
def test(size=None):
    sizeLK = maxLK if size is None else size
    sizeAF = maxAF if size is None else size
    tokensLK = [dict(t=f"{getTextLK(slot)} ", s=slot) for slot in range(1, sizeLK)]
    tokensAF = [dict(t=f"{getTextAF(slot)} ", s=slot) for slot in range(1, sizeAF)]

    data = dict(
        witnesses=[
            dict(id=LK, tokens=tokensLK),
            dict(id=AF, tokens=tokensAF),
        ],
    )
    A[LK].indent(reset=True)
    A[LK].info("Run collatex")
    result = collate(data, output="json", segmentation=False, near_match=True)
    A[LK].info("collation done")
    alignment = makeAlignment(result)
    A[LK].info(f"postprocessing done. {len(alignment)} entries in alignment table")
    return alignment

In [36]:
alignment = test(10)

  0.00s Run collatex
  0.00s collation done
  0.00s postprocessing done. 11 entries in alignment table


In [37]:
alignment = test(100)

  0.00s Run collatex
  0.10s collation done
  0.10s postprocessing done. 102 entries in alignment table


In [38]:
alignment = test(1000)

  0.00s Run collatex
  7.87s collation done
  7.87s postprocessing done. 1039 entries in alignment table


In [40]:
alignment = test(2000)

  0.00s Run collatex
    34s collation done
    34s postprocessing done. 2057 entries in alignment table


In [41]:
alignment = test(4000)

  0.00s Run collatex
 2m 44s collation done
 2m 44s postprocessing done. 4095 entries in alignment table


The performance does not scale well.
Our editions are 40,000 words each, so running Collatex on the full input will require 100 times
as much time as this, probably over 5 hours.

In our case, we are sure that we do not have to compare every part of the one edition
with every part of the other edition, which would require quadratic effort and which
Collatex seems to be needing.
A solution would be to divide the input in 100 word chunks and run Collatex repeatedly
on pairs of chunks.
But that would require quite subtle coding in order to avoid cases where variants occur at 
chunk boundaries.

We also do not get information about the closeness of the variants.

But how is the quality of the matching?

We apply the same method as we did after applying the algorithm of the
[`compareAFLk` notebook](compareAfLk.ipynb), with minor modifications.

We only can do it on the first 10% of the input, because we did not wait for those 5 hours.

In [54]:
def printLines(start=0, end=None):
    if start < 0:
        start = 0
    if end is None or end > len(alignment):
        end = len(alignment)
    lines = []
    for (iLK, left, right, iAF) in alignment[start:end]:
        lines.append(f"{iLK:>5} {left:>20} @{0 if left == right else 1} {right:<20} {iAF:>5}")
    return "\n".join(lines)
        
        
def printDiff(before, after):
    print(printLines(start=len(alignment) - before))
    lastLK = None
    lastAF = None
    for c in range(len(alignment) - 1, -1, -1):
        comp = alignment[c]
        if lastLK is None:
            if comp[0]:
                lastLK = comp[0]
        if lastAF is None:
            if comp[3]:
                lastAF = comp[3]
        if lastLK is not None and lastAF is not None:
            break
    if lastLK is not None and lastAF is not None:
        for i in range(after):
            iLK = lastLK + 1 + i
            iAF = lastAF + 1 + i
            textLK = getTextLK(iLK) if iLK <= maxLK else ""
            textAF = getTextAF(iAF) if iAF <= maxAF else ""
            print(f"{iLK:>5} =  {textLK:>20} @{0 if textLK == textAF else 1} {textAF:<20}  = {iAF:>5}")

In [55]:
# this number of good lines between bad lines will not lead to the
# interruption of bad stretches

LOOKAHEAD = 3


def analyseStretch(start, end):
    total = 0
    onlyLK = 0
    onlyAF = 0
    
    for (iLK, left, right, iAF) in alignment[start:end + 1]:
        total += 1
        if not iLK:
            onlyAF += 1
        if not iAF:
            onlyLK += 1
    
    suspect = onlyAF > 1 and onlyLK > 1 and onlyAF + onlyLK > 5
    return suspect
    
def checkAlignment(lastLK, lastAF):
    errors = {}
    prevILK = 0
    prevIAF = 0
    
    where = collections.Counter()
    agreement = collections.Counter()
    badStretches = collections.defaultdict(lambda: [])
    
    startBad = 0
    
    for (c, (iLK, left, right, iAF)) in enumerate(alignment):
        thisBad = not iLK or not iAF
        # a good line between bad lines is counted as bad
        if not thisBad and startBad:
            nextGood = True
            for j in range(1, LOOKAHEAD + 1):
                if c + j < len(alignment):
                    compJ = alignment[c + j]
                    if not compJ[0] or not compJ[-1]:
                        nextGood = False
                        break
            if not nextGood:
                thisBad = True
        if startBad:
            if not thisBad:
                badStretches[c - startBad].append(startBad)
                startBad = 0
        else:
            if thisBad:
                startBad = c
        
        agreement[0 if left == right else 1] += 1
        
        if iLK:
            if iLK != prevILK + 1:
                errors.setdefault("wrong iLK", []).append(f"{c:>5}: Expected {prevILK + 1}, found {iLK}")
            prevILK = iLK
            if iAF:
                where["both"] += 1
        else:
            where[AF] += 1
        if iAF:
            if iAF != prevIAF + 1:
                errors.setdefault("wrong iAF", []).append(f"{c:>5}: Expected {prevIAF + 1}, found {iAF}")
            prevIAF = iAF
        else:
            where[LK] += 1
            
    if startBad:
        badStretches[len(alignment) - startBad].append(startBad)
            
    if prevILK < lastLK:
        errors.setdefault("missing iLKs at the end", []).append(f"last is {prevILK}, expected {lastLK}")
    elif prevILK > lastLK:
        errors.setdefault("too many iLKs at the end", []).append(f"last is {prevILK}, expected {lastLK}")
    if prevIAF < lastAF:
        errors.setdefault("missing iAFs at the end", []).append(f"last is {prevIAF}, expected {lastAF}")
    elif prevIAF > lastAF:
        errors.setdefault("too many iAFs at the end", []).append(f"last is {prevIAF}, expected {lastAF}")
    
    print("\nSANITY\n")
    if not errors:
        print("All OK")
    else:
        for (kind, msgs) in errors.items():
            print(f"ERROR {kind} ({len(msgs):>5}x):")
            for msg in msgs[0:10]:
                print(f"\t{msg}")
            if len(msgs) > 10:
                print(f"\t ... and {len(msgs) - 10} more ...")
                
    print(f"\nAGREEMENT\n")
    print("Where are the words?\n")
    print(f"\t{LK}-only: {where[LK]:>5} slots")
    print(f"\t{AF}-only: {where[AF]:>5} slots")
    print(f"\tboth:    {where['both']:>5} slots")
    
    print("\nHow well is the agreement?\n")
    for (d, n) in agreement.items():
        print(f"dissimilarity? {d} : {n:>5} words")
    
    print(f"\nBAD STRETCHES\n")
    print("How many of which size?\n")
    allSuspects = []
    someBenigns = []
    for (size, starts) in sorted(badStretches.items(), key=lambda x: (-x[0], x[1])):
        suspects = {start: size for start in starts if analyseStretch(start, start + size)}
        benigns = {start: size for start in starts if start not in suspects}
        allSuspects.extend([(start, start + size) for (start, size) in suspects.items()])
        someBenigns.extend([(start, start + size) for (start, size) in list(benigns.items())[0:3]])
        examples = ", ".join(str(start) for start in list(suspects.keys())[0:3])
        if not suspects:
            examples = ", ".join(str(start) for start in list(benigns.keys())[0:3])
        print(f"bad stretches of size {size:>3} : {len(suspects):>4} suspect of total {len(starts):>4} x see e.g. {examples}")
        
    print(f"\nShowing all {len(allSuspects)} inversion suspects" if len(allSuspects) else "\nNo suspect bad stretches\n")
    for (i, (start, end)) in enumerate(reversed(allSuspects)):
        print(f"\nSUSPECT {i + 1:>2}")
        print(printLines(max((1, start - 5)), min((len(alignment), end + 5))))
    print(f"\nShowing some ({len(someBenigns)}) benign examples" if len(someBenigns) else "\nNo bad stretches\n")
    for (i, (start, end)) in enumerate(someBenigns):
        print(f"\nBENIGN {i + 1:>2}")
        print(printLines(max((1, start - 2)), min((len(alignment), end + 2))))

In [56]:
checkAlignment(4000 - 1, 4000 - 1)


SANITY

All OK

AGREEMENT

Where are the words?

	LK-only:    96 slots
	AF-only:    96 slots
	both:     3903 slots

How well is the agreement?

dissimilarity? 1 :   520 words
dissimilarity? 0 :  3575 words

BAD STRETCHES

How many of which size?

bad stretches of size  48 :    0 suspect of total    1 x see e.g. 4047
bad stretches of size   9 :    0 suspect of total    1 x see e.g. 458
bad stretches of size   8 :    0 suspect of total    1 x see e.g. 3103
bad stretches of size   5 :    0 suspect of total    5 x see e.g. 346, 431, 513
bad stretches of size   4 :    0 suspect of total    5 x see e.g. 636, 2897, 3146
bad stretches of size   3 :    0 suspect of total   10 x see e.g. 332, 356, 380
bad stretches of size   2 :    0 suspect of total    8 x see e.g. 501, 553, 799
bad stretches of size   1 :    0 suspect of total   72 x see e.g. 1, 14, 117

No suspect bad stretches


Showing some (18) benign examples

BENIGN  1
 3998               mtnāh  @0 mtnāh                 3950
 3999      

# A few comparisons

**with Collatex**

In [58]:
print(printLines(start=291, end=309))

  287             ālāḥṣāʾ  @0 ālāḥṣāʾ                288
                           @1 s                      289
  288                  ān  @0 ān                     290
  289                 yrá  @0 yrá                    291
  290             āʿyānhā  @0 āʿyānhā                292
  291                 wān  @1 ān                     293
  292                 šʾt  @1 šʾtḳlt                 294
  293                 ḳlt  @1                           
  294                  ān  @0 ān                     295
  295                 yrá  @0 yrá                    296
  296                ʿynh  @0 ʿynh                   297
  297                  fy  @1 y                      298
  298                 kwn  @1 kwnǧāmʿ                299
  299                ǧāmʿ  @1 yḥṣrālāmr              300
  300                yḥṣr  @1 kh                     301
  301               ālāmr  @1                           
  302               lkwnh  @0 lkwnh                  302
  303               mtṣfā  @0 m

**with my algorithm**

```
287 =               ālāḥṣāʾ @0  ālāḥṣāʾ               =   288
288 +1                   ān @1  s                    2+   289
    ^1                      @1  ān                   2+   290
289 =                   yrá @0  yrá                   =   291
290 =               āʿyānhā @0  āʿyānhā               =   292
291 =                   wān @1  ān                    =   293
292 +2                  šʾt @0  šʾtḳlt               1+   294
293 +2                  ḳlt @0                       1^      
294 =                    ān @0  ān                    =   295
295 =                   yrá @0  yrá                   =   296
296 =                  ʿynh @0  ʿynh                  =   297
297 =                    fy @1  y                     =   298
298 +2                  kwn @0  kwnǧāmʿ              1+   299
299 +2                 ǧāmʿ @0                       1^      
300 +2                 yḥṣr @0  yḥṣrālāmr            1+   300
301 +2                ālāmr @0                       1^      
302 +1                lkwnh @2  kh                   2+   301
    ^1                      @2  lkwnh                2+   302
303 =                 mtṣfā @0  mtṣfā                 =   303
```

(A) Lines 288 are better handled by Collatex than by my algorithm.

(B) The lines 298-301 are better handled by my algorithm than by Collatex.

**with Collatex**

In [60]:
print(printLines(start=457, end=470))

  448 =                ālʿālm @0  ālʿālm                =   441
  449 +2               ālmʿbr @0  ālmʿbrʿnh            1+   442
  450 +2                  ʿnh @0                       1^      
  451 =                    fy @0  fy                    =   443
  452 +2               āṣṭlāḥ @0  āṣṭlāḥālḳwm          1+   444
  453 +2                ālḳwm @0                       1^      
  454 +2             bālānsān @1  ālānsānālkbyr        1+   445
  455 +2               ālkbyr @1                       1^      
  456 =                 fkānt @0  fkānt                 =   446
  457 =              ālmlāʾkŧ @0  ālmlāʾkŧ              =   447
  458 +2                   lh @0  lhkālḳwá             1+   448
  459 +2               kālḳwá @0                       1^      
  460 =             ālrwḥānyŧ @0  ālrwḥānyŧ             =   449


**with my algorithm**

```
448              ālʿālm  @0 ālʿālm                 441
449              ālmʿbr  @1 ālmʿbrʿnh              442
450                 ʿnh  @1                           
451                  fy  @0 fy                     443
452              āṣṭlāḥ  @1 āṣṭlāḥālḳwm            444
453               ālḳwm  @1                           
454            bālānsān  @1 ālānsānālkbyr          445
455              ālkbyr  @1                           
456               fkānt  @0 fkānt                  446
457            ālmlāʾkŧ  @0 ālmlāʾkŧ               447
458                  lh  @1                           
459              kālḳwá  @1 lhkālḳwá               448
460           ālrwḥānyŧ  @0 ālrwḥānyŧ              449
```

(C) Lines 458-459 are better handled by my algorithm than by Collatex.

Note that A and C are similar cases. Sometimes my algorithm chooses the best fit, sometimes Collatex does.
Anyway, this kind of decision is not very important for the dataset we want to build from this table.

Case B is a bit more involved, and there Collatex fails to see a more obvious alignment.

# Conclusion

The performance is the biggest obstacle for using Collatex here.
A rather superficial comparison between the resulting alignments does not show marked differences in quality,
although there is an indication that Collatex will deal a bit less graceful with convoluted situations.

But closer inspection might reveal that Collatex has it right more often than my algorithm.

However, because both are not perfect, it is important to be able to tweak if there are glaring
mistakes.
In Collatex we do not have obvious means to steer the algorithm further.

With my algorithm we have the options to define special cases, to tweak a number of parameters, and to change the orchestration
of the comparisons.

That's why we stick to my algorithm.