<img align="right" src="images/dans-small.png"/>
<img align="right" src="images/tf-small.png"/>
<img align="right" src="images/etcbc.png"/>


# Phrases in versions of the BHSA

In [version Mappings](versionMappings.ipynb)
we have constructed edge features that map the nodes from one version of the data to the next.
In this notebook we are going to use those edges to study what happened to the feature `function`
of `phrases`.

# Overview

We explore:
* how the values of the `function` feature have changed;
* to what degree the boundaries of phrases have changed.

# Discussion
The feature `function` was called `phrase_function` in version `3`.

## Phrase boundaries
In order to see whether phrase boundaries have changed, we follow the `omap@` edges from
phrases in one version to their counterparts in the next version.

We make use of the dissimilarity values that are attached to such edges.
If there is no value, or the value is `0`, we have a match without a boundary change.
All other dissimilarities imply that boundaries have changed.

# Results
For the sake of presentation,
we start with the result cells; **they should be run after the other cells**.
The computation starts [here](#Start).

# Changes in `function` values

In [11]:
# The keys of phraseMapping are (source version, target version) pairs;
# they are visited in reverse order of insertion.
for pair in reversed(phraseMapping):  # noqa F821
    caption(1, "Phrase function change from version {} to {}".format(*pair))  # noqa F821
    featureDiff(*pair, "FUNCTION")  # noqa F821


##############################################################################################
#                                                                                            #
#      1m 13s Phrase function change from version 3 to 2017                                  #
#                                                                                            #
##############################################################################################



0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
3\2017,Adju,Cmpl,Conj,EPPr,ExsS,Exst,Frnt,IntS,Intj,Loca,ModS,Modi,NCoS,NCop,Nega,Objc,PrAd,PrcS,PreC,PreO,PreS,Pred,PtcO,Ques,Rela,Subj,Supp,Time,Voct
Adju,5438,142,15,,,,18,,,17,,169,,,2,171,84,,111,4,,,,9,2,177,3,20,1
Cmpl,186,22005,9,,,,11,,,25,,11,,,,168,27,,110,,,5,1,3,1,160,3,8,3
Conj,101,51,33064,,,,10,,,17,,151,,,,105,3,,10,2,3,19,,,73,29,,10,1
ExsS,,,,,7,,,,,,,,,,,,,,,,,,,,,,,,
Exst,,,2,,2,90,,,,,1,,3,7,,,,,,,,,,,,,,,
Frnt,8,7,,,,,755,,,,,1,,,,9,,,12,,,,1,,,45,,,5
IntS,,,,,,,,161,,,,,,,,,,,1,,,,,,,,,,
Intj,,1,1,,,,,17,1199,,,6,,,,5,,,9,,,3,,5,,5,,12,1
IrpC,1,18,,,,,,,,,,,,,,,,,3,,,,,,,,,,



##############################################################################################
#                                                                                            #
#      1m 15s Phrase function change from version 2016 to 2017                               #
#                                                                                            #
##############################################################################################



0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
2016\2017,Adju,Cmpl,Conj,EPPr,ExsS,Exst,Frnt,IntS,Intj,Loca,ModS,Modi,NCoS,NCop,Nega,Objc,PrAd,PrcS,PreC,PreO,PreS,Pred,PtcO,Ques,Rela,Subj,Supp,Time,Voct
Adju,9508,12,2,,,,5,,,2,,,,,,5,,,4,,,,,,,2,,6,
Cmpl,16,30002,4,,,,,,,,,,,,,13,,,1,,,,,,,1,,,
Conj,1,,46135,,,,,,,,,,,,,3,,,,,,,,,3,1,,,
EPPr,,,,21,,,,,,,,,,,,,,,,,,,,,,,,,
ExsS,,,,,14,,,,,,,,,,,,,,,,,,,,,,,,
Exst,,,,,,143,,,,,,,,,,,,,,,,,,,,,,,
Frnt,,1,,,,,1119,,,,,,,,,,,,1,,,,1,,,9,,,
IntS,,,,,,,,251,,,,,,,,,,,,,,,,,,,,,
Intj,,,,,,,,,1621,,,,,,,,,,,,,,,,,,,,



##############################################################################################
#                                                                                            #
#      1m 16s Phrase function change from version 4b to 2016                                 #
#                                                                                            #
##############################################################################################



0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
4b\2016,Adju,Cmpl,Conj,EPPr,ExsS,Exst,Frnt,IntS,Intj,Loca,ModS,Modi,NCoS,NCop,Nega,Objc,PrAd,PrcS,PreC,PreO,PreS,Pred,PtcO,Ques,Rela,Subj,Supp,Time,Voct
Adju,9477,31,1,,,,5,,,1,,1,,,1,11,1,,6,,,,,,1,8,2,1,
Cmpl,39,29921,1,,,,6,,,8,,1,,,2,41,,,24,,,,,,,11,,7,
Conj,1,,46124,,,,,,,,,,,,,,,,1,,,,,,2,1,,,
EPPr,,,,9,,,,,,,,,,,,,,,,,,,,,,,,,
ExsS,,,,,14,,,,,,,,,,,,,,,,,,,,,,,,
Exst,,,,,,143,,,,,,,,,,,,,,,,,,,,,,,
Frnt,,,,,,,1087,,,,,,,,,,,,,,,,,,,25,,,
IntS,,,,,,,,251,,,,,,,,,,,,,,,,,,,,,
Intj,,,,,,,,,1621,,,,,,,,,,,,,,,,,,,,



##############################################################################################
#                                                                                            #
#      1m 17s Phrase function change from version 4 to 4b                                    #
#                                                                                            #
##############################################################################################



0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
4\4b,Adju,Cmpl,Conj,EPPr,ExsS,Exst,Frnt,IntS,Intj,Loca,ModS,Modi,NCoS,NCop,Nega,Objc,PrAd,PrcS,PreC,PreO,PreS,Pred,PtcO,Ques,Rela,Subj,Supp,Time,Voct
Adju,8061,94,13,,,,7,,,10,,206,,,1,155,82,,65,5,1,,,8,1,186,3,17,
Cmpl,77,27606,9,,,,2,,,10,,8,,,,65,7,,105,,,3,1,1,5,86,,2,3
Conj,44,39,45936,,,,17,,,10,,6,,,,110,1,,19,1,,3,,,74,42,,7,1
EPPr,,,,4,,,,,,,,,,,,,,,,,,,,,,,,,
ExsS,,,,,14,,,,,,,,,,,,,,,,,,,,,,,,
Exst,,,,,,143,,,,,,,,1,,,,,,,,,,,,,,,
Frnt,1,5,,,,,1007,1,,,,,,,,2,,,5,,,,,,,5,,,3
IntS,,,,,,,,250,,,,,,,,,,,,,,,,,,,,,
Intj,,,,,,,,,1624,,,,,,,,,,,,,,,,,1,,,3



##############################################################################################
#                                                                                            #
#      1m 18s Phrase function change from version 3 to 4                                     #
#                                                                                            #
##############################################################################################



0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
3\4,Adju,Cmpl,Conj,EPPr,ExsS,Exst,Frnt,IntS,Intj,Loca,ModS,Modi,NCoS,NCop,Nega,Objc,PrAd,PrcS,PreC,PreO,PreS,Pred,PtcO,Ques,Rela,Subj,Supp,Time,Unkn,Voct
Adju,6067,74,15,,,,6,,,10,,31,,,,43,19,,65,,,,,,1,43,,15,,2
Cmpl,90,22418,12,,,,5,,,14,,6,,,1,79,26,,21,,,3,,1,1,71,3,6,,2
Conj,87,27,33540,,,,6,,,8,,154,,,,36,2,,5,2,3,18,,,39,22,,7,,1
ExsS,,,,,7,,,,,,,,,,,,,,,,,,,,,,,,,
Exst,,,,,2,90,,,,,1,,3,9,,,,,,,,,,,,,,,,
Frnt,8,2,1,,,,785,,,,,,,,,8,,,12,,,,,,,22,,,,5
IntS,,,,,,,,161,,,,,,,,,,,1,,,,,,,,,,,
Intj,,1,2,,,,,17,1199,,,16,,,,5,,,9,,,3,,5,,5,,1,,1
IrpC,1,18,,,,,,,,,,,,,,,,,3,,,,,,,,,,,


# Boundary statistics

In [12]:
# The keys of phraseMapping are (source version, target version) pairs;
# they are visited in reverse order of insertion.
for pair in reversed(phraseMapping):  # noqa F821
    caption(1, "Phrase boundary change from version {} to {}".format(*pair))  # noqa F821
    showStats(*pair)  # noqa F821


##############################################################################################
#                                                                                            #
#      1m 30s Phrase boundary change from version 3 to 2017                                  #
#                                                                                            #
##############################################################################################



0,1
dissimilarity,number of phrases
0,251551
1,29
2,26
3,22
4,13
5,10
6,5
7,6
8,1



##############################################################################################
#                                                                                            #
#      1m 30s Phrase boundary change from version 2016 to 2017                               #
#                                                                                            #
##############################################################################################



0,1
dissimilarity,number of phrases
0,253073
1,29
2,26
3,22
4,13
5,10
6,5
7,6
8,1



##############################################################################################
#                                                                                            #
#      1m 30s Phrase boundary change from version 4b to 2016                                 #
#                                                                                            #
##############################################################################################



0,1
dissimilarity,number of phrases
0,252881
1,128
2,82
3,65
4,26
5,16
6,11
7,14
8,11



##############################################################################################
#                                                                                            #
#      1m 30s Phrase boundary change from version 4 to 4b                                    #
#                                                                                            #
##############################################################################################



0,1
dissimilarity,number of phrases
0,250751
1,750
2,745
3,618
4,372
5,305
6,188
7,141
8,123



##############################################################################################
#                                                                                            #
#      1m 30s Phrase boundary change from version 3 to 4                                     #
#                                                                                            #
##############################################################################################



0,1
dissimilarity,number of phrases
0,250346
1,2837
2,1164
3,788
4,457
5,287
6,166
7,127
8,86


# Start
Start the program here.

In [1]:
import os  # noqa 402
import collections  # noqa 402
from functools import reduce  # noqa 402
from utils import caption  # noqa 402
from tf.fabric import Fabric  # noqa 402

from IPython.display import HTML, display  # noqa 402


We specify our versions and the subtle differences between them as far as they are relevant.

In [2]:
# Location of the local clone of the data repository and the directories below it.
REPO = os.path.expanduser("~/github/etcbc/bhsa")
baseDir = REPO + "/tf"
tempDir = REPO + "/_temp"

# The versions, in the order in which the mappings between them are chained.
versions = ["3", "4", "4b", "2016", "2017"]

# Feature names per version; the entry under "" holds the defaults,
# every other entry lists the names that deviate from them.
versionInfoSpec = {
    "": {
        "OCC": "g_word",
        "LEX": "lex",
        "FUNCTION": "function",
    },
    "3": {
        "OCC": "text_plain",
        "LEX": "lexeme",
        "FUNCTION": "phrase_function",
    },
}

versionInfo = {}

defaults = versionInfoSpec[""].items()

for (i, v) in enumerate(versions):
    # The first version has no predecessor, hence no mapping feature.
    versionInfo[v] = {
        "OMAP": "omap@{}-{}".format(versions[i - 1], v) if i else "",
    }
    # Defaults first, version specific names override them.
    versionInfo[v].update(defaults)
    versionInfo[v].update(versionInfoSpec.get(v, {}))

Load all versions in one go, with the version mapping feature if present.

In [3]:
# One Fabric instance and one loaded API per version, both keyed by version.
TF = {}
api = {}
for v in versions:
    # Publish the feature names of this version as the global names
    # OCC, LEX, FUNCTION and OMAP, so that the load call below can use them.
    globals().update(versionInfo[v])
    caption(4, "Version -> {} <- loading ...".format(v))
    TF[v] = Fabric(locations="{}/{}".format(baseDir, v), modules=[""])
    api[v] = TF[v].load(" ".join((OCC, LEX, FUNCTION, OMAP)))  # noqa F821

..............................................................................................
.       0.00s Version -> 3 <- loading ...                                                    .
..............................................................................................
This is Text-Fabric 3.0.9
Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

118 features found and 0 ignored
  0.00s loading features ...
   |     0.12s B lexeme               from /Users/dirk/github/etcbc/bhsa/tf/3
   |     0.22s B text_plain           from /Users/dirk/github/etcbc/bhsa/tf/3
   |     0.08s B phrase_function      from /Users/dirk/github/etcbc/bhsa/tf/3
   |     0.00s Feature overview: 115 for nodes; 2 for edges; 1 configs; 7 computed
  4.99s All features loaded/computed - for details use loadLog()
...........................

# Utility function: tables in your cells

In [4]:
def tableText(table):
    """Show a table as an HTML table in the output of the cell.

    Parameters
    ----------
    table: iterable
        The rows of the table; every row is an iterable of cells.
        Cells are converted with `str()` and put into the markup as they are,
        without HTML escaping.

    Returns
    -------
    Whatever `display()` returns.
    """
    htmlRows = []
    for row in table:
        cells = "</td><td>".join(map(str, row))
        htmlRows.append("<td>{}</td>".format(cells))
    markup = "<table><tr>{}</tr></table>".format("</tr><tr>".join(htmlRows))
    return display(HTML(markup))

# Get counterparts

Here is a function that gets the counterparts of phrases between versions, and classifies them according to dissimilarity.

`phraseMapping` is keyed by a (source version, target version) pair,
then by node in the source version; the value is the sequence of
(node in the target version, dissimilarity) pairs that the `omap@` edges give for that node.

Source nodes for which the edge lookup yields `None` are left out of the mapping.

In [5]:
# Mappings of phrase nodes between versions.
# Keys are (source version, target version) pairs; each value is a dict that
# maps a phrase node of the source version to what the `omap@` edge lookup
# returned for it.
# The result cells walk through the keys in reverse order of insertion.
phraseMapping = collections.OrderedDict()

In [6]:
def getPhrases(v, w):
    """Collect the counterparts in version `w` of the phrases of version `v`.

    The result is stored in `phraseMapping[(v, w)]` as a dict that maps every
    phrase node `n` of version `v` to the value of the edge feature
    `omap@v-w` for `n`. Judging by how the other functions unpack these
    values, they are sequences of (target node, dissimilarity) pairs.
    Nodes for which the lookup gives `None` are left out.

    Parameters
    ----------
    v: string
        The source version.
    w: string
        The target version; the edge feature is taken from the API of this version.

    Returns
    -------
    None
        The function works by filling `phraseMapping`.
    """
    V = api[v]
    W = api[w]

    # Register the (still empty) mapping first, so that it exists
    # even if something goes wrong below.
    phrases = {}
    phraseMapping[(v, w)] = phrases

    # The edge feature is the same for all phrases: look it up once,
    # not again for each of the phrase nodes.
    counterparts = W.Es("omap@{}-{}".format(v, w)).f

    for n in V.F.otype.s("phrase"):
        ms = counterparts(n)
        if ms is not None:
            phrases[n] = ms

We also want to see the evolution in one big leap, so we construct a mapping from the first version to the last,
just by composing the individual `omap@`s into a stride.

Picking a phrase, and following it through the versions might lead to multiple counterparts.
When that happens, we choose the one with the highest similarity, and ignore the rest.

In [7]:
def composeMap(curMap, newStep):
    """Extend a mapping between versions with the step to the next version.

    For every source node the best counterpart in the intermediate version is
    chosen: the one with the lowest dissimilarity and, among equals, the
    lowest node number. A missing dissimilarity (`None`) counts as `0`,
    which is also how the boundary statistics treat it.

    Parameters
    ----------
    curMap: dict
        Maps a source node to a sequence of (intermediate node, dissimilarity)
        pairs.
    newStep: dict
        Maps an intermediate node to a sequence of (target node, dissimilarity)
        pairs.

    Returns
    -------
    dict
        Maps a source node to the entry of `newStep` for its best
        intermediate counterpart. Source nodes without any counterpart, or
        whose best counterpart does not occur in `newStep`, are left out.

    Note
    ----
    The dissimilarities in the result are those of `newStep` only;
    the dissimilarities of the earlier steps are not carried along.
    """
    resultMap = {}
    for (n, ms) in curMap.items():
        if not ms:
            # no counterpart at all: nothing to follow
            continue
        # `x[1] or 0` avoids comparing None with a number
        theM = min(ms, key=lambda x: (x[1] or 0, x[0]))[0]
        if theM in newStep:
            resultMap[n] = newStep[theM]
    return resultMap


def getFirstLastMapping():
    """Chain the mappings between successive versions into one big stride.

    The mapping from the first version to the second one is composed,
    step by step, with the mappings to each following version.
    The outcome is stored in `phraseMapping` under the key
    (first version, last version).

    When there are no more than two versions, nothing is stored
    and an empty dict is returned; otherwise the function returns `None`.
    """
    if len(versions) <= 2:
        return {}

    first = versions[0]
    stride = phraseMapping[(first, versions[1])]

    # pairs of successive versions, starting with (second, third)
    for (prev, cur) in zip(versions[1:], versions[2:]):
        caption(0, "mapping from {} to {}".format(first, cur))
        stride = composeMap(stride, phraseMapping[(prev, cur)])

    phraseMapping[(first, versions[-1])] = stride

# Table of boundary changes

In [8]:
def showStats(v, w):
    """Show how many phrases have which dissimilarity between two versions.

    The mapping in `phraseMapping[(v, w)]` is inspected. For every
    dissimilarity the distinct target nodes that are reached with that
    dissimilarity are counted; a missing dissimilarity counts as `0`.
    The table has a row for every dissimilarity from `0` up to the highest
    one that occurs; dissimilarities that do not occur get an empty cell.
    If the mapping is empty, only the header row is shown.

    Parameters
    ----------
    v: string
        The source version.
    w: string
        The target version.
    """
    dists = {}
    for ms in phraseMapping[(v, w)].values():
        for (m, dis) in ms:
            dists.setdefault(dis or 0, set()).add(m)
    stats = {dis: len(ms) for (dis, ms) in dists.items()}

    table = [["dissimilarity", "number of phrases"]]
    # default=-1 gives an empty range when there is nothing to count,
    # where a bare max() would raise a ValueError
    for dis in range(max(stats, default=-1) + 1):
        table.append([dis, stats.get(dis, "")])
    tableText(table)

# Table of old and new values
We visualize the changes in the values of the `function` feature,
by generating a matrix, with old values in the row headers
and new values in the column headers, and the number of times that this old feature has changed into that new
feature in the corresponding matrix cells.

In [9]:
def featureDiff(v, w, feat):
    """Show a matrix of the value changes of a feature between two versions.

    For every (source node, target node) pair in `phraseMapping[(v, w)]`
    the value of the feature in version `v` is combined with the value of the
    feature in version `w`, and the combinations are counted.
    The table has the old values in the row headers, the new values in the
    column headers, and the counts in the cells; combinations that do not
    occur get an empty cell.

    Parameters
    ----------
    v: string
        The source version.
    w: string
        The target version.
    feat: string
        The key in `versionInfo` under which the name of the feature in
        each of the versions is found, e.g. `"FUNCTION"`.
    """
    # The feature objects do not depend on the node: look them up once.
    vFeature = api[v].Fs(versionInfo[v][feat])
    wFeature = api[w].Fs(versionInfo[w][feat])

    combis = {}
    for (n, ms) in phraseMapping[(v, w)].items():
        vVal = vFeature.v(n)
        for (m, _dis) in ms:
            combis.setdefault(vVal, collections.Counter())[wFeature.v(m)] += 1

    vValues = sorted(combis)
    wValues = sorted(set().union(*combis.values()))

    # Distinct names for the values keep the parameters v and w intact.
    table = [["{}\\{}".format(v, w)] + wValues]
    for vVal in vValues:
        counts = combis[vVal]
        table.append([vVal] + [str(counts.get(wVal, "")) for wVal in wValues])
    tableText(table)

# Collect
We collect all data in a big data structure.

In [10]:
caption(4, "Collecting data")
# First the mappings from every version to its immediate successor ...
for (v, w) in zip(versions, versions[1:]):
    caption(0, "\t{:<4} => {:<4}".format(v, w))
    getPhrases(v, w)

# ... then the stride from the first version to the last one.
caption(0, "\t{:<4} => {:<4}".format(versions[0], versions[-1]))
getFirstLastMapping()
caption(0, "Done")

..............................................................................................
.         55s Collecting data                                                                .
..............................................................................................
|         55s 	3    => 4   
|         57s 	4    => 4b  
|         58s 	4b   => 2016
|      1m 00s 	2016 => 2017
|      1m 02s 	3    => 2017
|      1m 02s mapping from 3 to 4b
|      1m 02s mapping from 3 to 2016
|      1m 02s mapping from 3 to 2017
|      1m 02s Done
