{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "from itertools import chain\n", "\n", "from tf.app import use\n", "from tf.core.files import dirMake" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "**Locating corpus resources ...**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "app: ~/github/ETCBC/bhsa/app" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/github/ETCBC/bhsa/tf/2021" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/github/ETCBC/phono/tf/2021" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/github/ETCBC/parallels/tf/2021" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", " Text-Fabric: Text-Fabric API 12.0.5, ETCBC/bhsa/app v3, Search Reference
\n", " Data: ETCBC - bhsa 2021, Character table, Feature docs
\n", "
Node types\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "
Name# of nodes# slots/node% coverage
book3910938.21100
chapter929459.19100
lex923046.22100
verse2321318.38100
half_verse451799.44100
sentence637176.70100
sentence_atom645146.61100
clause881314.84100
clause_atom907044.70100
phrase2532031.68100
phrase_atom2675321.59100
subphrase1138501.4238
word4265901.00100
\n", " Sets: no custom sets
\n", " Features:
\n", "
Parallel Passages\n", "
\n", "\n", "
\n", "
\n", "crossref\n", "
\n", "
int
\n", "\n", " 🆗 links between similar passages\n", "\n", "
\n", "\n", "
\n", "
\n", "\n", "
BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis\n", "
\n", "\n", "
\n", "
\n", "book\n", "
\n", "
str
\n", "\n", " ✅ book name in Latin (Genesis; Numeri; Reges1; ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "book@ll\n", "
\n", "
str
\n", "\n", " ✅ book name in amharic (ኣማርኛ)\n", "\n", "
\n", "\n", "
\n", "
\n", "chapter\n", "
\n", "
int
\n", "\n", " ✅ chapter number (1; 2; 3; ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "code\n", "
\n", "
int
\n", "\n", " ✅ identifier of a clause atom relationship (0; 74; 367; ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "det\n", "
\n", "
str
\n", "\n", " ✅ determinedness of phrase(atom) (det; und; NA.)\n", "\n", "
\n", "\n", "
\n", "
\n", "domain\n", "
\n", "
str
\n", "\n", " ✅ text type of clause (? (Unknown); N (narrative); D (discursive); Q (Quotation).)\n", "\n", "
\n", "\n", "
\n", "
\n", "freq_lex\n", "
\n", "
int
\n", "\n", " ✅ frequency of lexemes\n", "\n", "
\n", "\n", "
\n", "
\n", "function\n", "
\n", "
str
\n", "\n", " ✅ syntactic function of phrase (Cmpl; Objc; Pred; ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "g_cons\n", "
\n", "
str
\n", "\n", " ✅ word consonantal-transliterated (B R>CJT BR> >LHJM ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "g_cons_utf8\n", "
\n", "
str
\n", "\n", " ✅ word consonantal-Hebrew (ב ראשׁית ברא אלהים)\n", "\n", "
\n", "\n", "
\n", "
\n", "g_lex\n", "
\n", "
str
\n", "\n", " ✅ lexeme pointed-transliterated (B.:- R;>CIJT B.@R@> >:ELOH ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "g_lex_utf8\n", "
\n", "
str
\n", "\n", " ✅ lexeme pointed-Hebrew (בְּ רֵאשִׁית בָּרָא אֱלֹה)\n", "\n", "
\n", "\n", "
\n", "
\n", "g_word\n", "
\n", "
str
\n", "\n", " ✅ word pointed-transliterated (B.:- R;>CI73JT B.@R@74> >:ELOHI92JM)\n", "\n", "
\n", "\n", "
\n", "
\n", "g_word_utf8\n", "
\n", "
str
\n", "\n", " ✅ word pointed-Hebrew (בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים)\n", "\n", "
\n", "\n", "
\n", "
\n", "gloss\n", "
\n", "
str
\n", "\n", " 🆗 english translation of lexeme (beginning create god(s))\n", "\n", "
\n", "\n", "
\n", "
\n", "gn\n", "
\n", "
str
\n", "\n", " ✅ grammatical gender (m; f; NA; unknown.)\n", "\n", "
\n", "\n", "
\n", "
\n", "label\n", "
\n", "
str
\n", "\n", " ✅ (half-)verse label (half verses: A; B; C; verses: GEN 01,02)\n", "\n", "
\n", "\n", "
\n", "
\n", "language\n", "
\n", "
str
\n", "\n", " ✅ of word or lexeme (Hebrew; Aramaic.)\n", "\n", "
\n", "\n", "
\n", "
\n", "lex\n", "
\n", "
str
\n", "\n", " ✅ lexeme consonantal-transliterated (B R>CJT/ BR>[ >LHJM/)\n", "\n", "
\n", "\n", "
\n", "
\n", "lex_utf8\n", "
\n", "
str
\n", "\n", " ✅ lexeme consonantal-Hebrew (ב ראשׁית֜ ברא אלהים֜)\n", "\n", "
\n", "\n", "
\n", "
\n", "ls\n", "
\n", "
str
\n", "\n", " ✅ lexical set, subclassification of part-of-speech (card; ques; mult)\n", "\n", "
\n", "\n", "
\n", "
\n", "nametype\n", "
\n", "
str
\n", "\n", " ⚠️ named entity type (pers; mens; gens; topo; ppde.)\n", "\n", "
\n", "\n", "
\n", "
\n", "nme\n", "
\n", "
str
\n", "\n", " ✅ nominal ending consonantal-transliterated (absent; n/a; JM, ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "nu\n", "
\n", "
str
\n", "\n", " ✅ grammatical number (sg; du; pl; NA; unknown.)\n", "\n", "
\n", "\n", "
\n", "
\n", "number\n", "
\n", "
int
\n", "\n", " ✅ sequence number of an object within its context\n", "\n", "
\n", "\n", "
\n", "
\n", "otype\n", "
\n", "
str
\n", "\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", "pargr\n", "
\n", "
str
\n", "\n", " 🆗 hierarchical paragraph number (1; 1.2; 1.2.3.4; ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "pdp\n", "
\n", "
str
\n", "\n", " ✅ phrase dependent part-of-speech (art; verb; subs; nmpr, ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "pfm\n", "
\n", "
str
\n", "\n", " ✅ preformative consonantal-transliterated (absent; n/a; J, ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "prs\n", "
\n", "
str
\n", "\n", " ✅ pronominal suffix consonantal-transliterated (absent; n/a; W; ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "prs_gn\n", "
\n", "
str
\n", "\n", " ✅ pronominal suffix gender (m; f; NA; unknown.)\n", "\n", "
\n", "\n", "
\n", "
\n", "prs_nu\n", "
\n", "
str
\n", "\n", " ✅ pronominal suffix number (sg; du; pl; NA; unknown.)\n", "\n", "
\n", "\n", "
\n", "
\n", "prs_ps\n", "
\n", "
str
\n", "\n", " ✅ pronominal suffix person (p1; p2; p3; NA; unknown.)\n", "\n", "
\n", "\n", "
\n", "
\n", "ps\n", "
\n", "
str
\n", "\n", " ✅ grammatical person (p1; p2; p3; NA; unknown.)\n", "\n", "
\n", "\n", "
\n", "
\n", "qere\n", "
\n", "
str
\n", "\n", " ✅ word pointed-transliterated masoretic reading correction\n", "\n", "
\n", "\n", "
\n", "
\n", "qere_trailer\n", "
\n", "
str
\n", "\n", " ✅ interword material -pointed-transliterated (Masoretic correction)\n", "\n", "
\n", "\n", "
\n", "
\n", "qere_trailer_utf8\n", "
\n", "
str
\n", "\n", " ✅ interword material -pointed-transliterated (Masoretic correction)\n", "\n", "
\n", "\n", "
\n", "
\n", "qere_utf8\n", "
\n", "
str
\n", "\n", " ✅ word pointed-Hebrew masoretic reading correction\n", "\n", "
\n", "\n", "
\n", "
\n", "rank_lex\n", "
\n", "
int
\n", "\n", " ✅ ranking of lexemes based on freqnuecy\n", "\n", "
\n", "\n", "
\n", "
\n", "rela\n", "
\n", "
str
\n", "\n", " ✅ linguistic relation between clause/(sub)phrase(atom) (ADJ; MOD; ATR; ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "sp\n", "
\n", "
str
\n", "\n", " ✅ part-of-speech (art; verb; subs; nmpr, ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "st\n", "
\n", "
str
\n", "\n", " ✅ state of a noun (a (absolute); c (construct); e (emphatic).)\n", "\n", "
\n", "\n", "
\n", "
\n", "tab\n", "
\n", "
int
\n", "\n", " ✅ clause atom: its level in the linguistic embedding\n", "\n", "
\n", "\n", "
\n", "
\n", "trailer\n", "
\n", "
str
\n", "\n", " ✅ interword material pointed-transliterated (& 00 05 00_P ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "trailer_utf8\n", "
\n", "
str
\n", "\n", " ✅ interword material pointed-Hebrew (־ ׃)\n", "\n", "
\n", "\n", "
\n", "
\n", "txt\n", "
\n", "
str
\n", "\n", " ✅ text type of clause and surrounding (repetion of ? N D Q as in feature domain)\n", "\n", "
\n", "\n", "
\n", "
\n", "typ\n", "
\n", "
str
\n", "\n", " ✅ clause/phrase(atom) type (VP; NP; Ellp; Ptcp; WayX)\n", "\n", "
\n", "\n", "
\n", "
\n", "uvf\n", "
\n", "
str
\n", "\n", " ✅ univalent final consonant consonantal-transliterated (absent; N; J; ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "vbe\n", "
\n", "
str
\n", "\n", " ✅ verbal ending consonantal-transliterated (n/a; W; ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "vbs\n", "
\n", "
str
\n", "\n", " ✅ root formation consonantal-transliterated (absent; n/a; H; ...)\n", "\n", "
\n", "\n", "
\n", "
\n", "verse\n", "
\n", "
int
\n", "\n", " ✅ verse number\n", "\n", "
\n", "\n", "
\n", "
\n", "voc_lex\n", "
\n", "
str
\n", "\n", " ✅ vocalized lexeme pointed-transliterated (B.: R;>CIJT BR> >:ELOHIJM)\n", "\n", "
\n", "\n", "
\n", "
\n", "voc_lex_utf8\n", "
\n", "
str
\n", "\n", " ✅ vocalized lexeme pointed-Hebrew (בְּ רֵאשִׁית ברא אֱלֹהִים)\n", "\n", "
\n", "\n", "
\n", "
\n", "vs\n", "
\n", "
str
\n", "\n", " ✅ verbal stem (qal; piel; hif; apel; pael)\n", "\n", "
\n", "\n", "
\n", "
\n", "vt\n", "
\n", "
str
\n", "\n", " ✅ verbal tense (perf; impv; wayq; infc)\n", "\n", "
\n", "\n", "
\n", "
\n", "mother\n", "
\n", "
none
\n", "\n", " ✅ linguistic dependency between textual objects\n", "\n", "
\n", "\n", "
\n", "
\n", "oslots\n", "
\n", "
none
\n", "\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", "\n", "
Phonetic Transcriptions\n", "
\n", "\n", "
\n", "
\n", "phono\n", "
\n", "
str
\n", "\n", " 🆗 phonological transcription (bᵊ rēšˌîṯ bārˈā ʔᵉlōhˈîm)\n", "\n", "
\n", "\n", "
\n", "
\n", "phono_trailer\n", "
\n", "
str
\n", "\n", " 🆗 interword material in phonological transcription\n", "\n", "
\n", "\n", "
\n", "
\n", "\n", " Settings:
specified
  1. apiVersion: 3
  2. appName: ETCBC/bhsa
  3. appPath: /Users/me/github/ETCBC/bhsa/app
  4. commit: no value
  5. css: ''
  6. dataDisplay:
    • exampleSectionHtml:<code>Genesis 1:1</code> (use <a href=\"https://github.com/{org}/{repo}/blob/master/tf/{version}/book%40en.tf\" target=\"_blank\">English book names</a>)
    • excludedFeatures:
      • g_uvf_utf8
      • g_vbs
      • kq_hybrid
      • languageISO
      • g_nme
      • lex0
      • is_root
      • g_vbs_utf8
      • g_uvf
      • dist
      • root
      • suffix_person
      • g_vbe
      • dist_unit
      • suffix_number
      • distributional_parent
      • kq_hybrid_utf8
      • crossrefSET
      • instruction
      • g_prs
      • lexeme_count
      • rank_occ
      • g_pfm_utf8
      • freq_occ
      • crossrefLCS
      • functional_parent
      • g_pfm
      • g_nme_utf8
      • g_vbe_utf8
      • kind
      • g_prs_utf8
      • suffix_gender
      • mother_object_type
    • noneValues:
      • absent
      • n/a
      • none
      • unknown
      • no value
      • NA
  7. docs:
    • docBase: {docRoot}/{repo}
    • docExt: ''
    • docPage: ''
    • docRoot: https://{org}.github.io
    • featurePage: 0_home
  8. interfaceDefaults: {}
  9. isCompatible: True
  10. local: clone
  11. localDir: /Users/me/github/ETCBC/bhsa/_temp
  12. provenanceSpec:
    • corpus: BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis
    • doi: 10.5281/zenodo.1007624
    • moduleSpecs:
      • :
        • backend: no value
        • corpus: Phonetic Transcriptions
        • docUrl:https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb
        • doi: 10.5281/zenodo.1007636
        • org: ETCBC
        • relative: /tf
        • repo: phono
      • :
        • backend: no value
        • corpus: Parallel Passages
        • docUrl:https://nbviewer.jupyter.org/github/ETCBC/parallels/blob/master/programs/parallels.ipynb
        • doi: 10.5281/zenodo.1007642
        • org: ETCBC
        • relative: /tf
        • repo: parallels
    • org: ETCBC
    • relative: /tf
    • repo: bhsa
    • version: 2021
    • webBase: https://shebanq.ancient-data.org/hebrew
    • webHint: Show this on SHEBANQ
    • webLang: la
    • webLexId: True
    • webUrl:{webBase}/text?book=<1>&chapter=<2>&verse=<3>&version={version}&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt
    • webUrlLex: {webBase}/word?version={version}&id=<lid>
  13. release: no value
  14. typeDisplay:
    • clause:
      • label: {typ} {rela}
      • style: ''
    • clause_atom:
      • hidden: True
      • label: {code}
      • level: 1
      • style: ''
    • half_verse:
      • hidden: True
      • label: {label}
      • style: ''
      • verselike: True
    • lex:
      • featuresBare: gloss
      • label: {voc_lex_utf8}
      • lexOcc: word
      • style: orig
      • template: {voc_lex_utf8}
    • phrase:
      • label: {typ} {function}
      • style: ''
    • phrase_atom:
      • hidden: True
      • label: {typ} {rela}
      • level: 1
      • style: ''
    • sentence:
      • label: {number}
      • style: ''
    • sentence_atom:
      • hidden: True
      • label: {number}
      • level: 1
      • style: ''
    • subphrase:
      • hidden: True
      • label: {number}
      • style: ''
    • word:
      • features: pdp vs vt
      • featuresBare: lex:gloss
  15. writing: hbo
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Text-Fabric API: names N F E L T S C TF Fs Fall Es Eall Cs Call directly usable

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "A = use(\"ETCBC/bhsa:clone\", checkout=\"clone\", hoist=globals())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Find multiple word phrases with equal `nu` on first and last word\n", "\n", "### In TF, by means of a query:" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.90s 15522 results\n" ] } ], "source": [ "query = \"\"\"\n", "phrase\n", " =: fi:word\n", " # la:word\n", " :=\n", "\n", "fi .nu. la\n", "\"\"\"\n", "\n", "resultsQ = A.search(query)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

result 1" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "

clause WXQt NA
phrase CP Conj
phrase NP Subj
phrase VP Pred
phrase NP PreC
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "

result 2" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "

clause WayX NA
phrase CP Conj
phrase VP Pred
phrase NP Subj
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "

result 3" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "

clause NmCl NA
phrase NP PreC
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "\n", "A.show(resultsQ, end=3, condenseType=\"clause\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### In TF, by means of hand-coding:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "15522" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "resultsH = []\n", "\n", "for p in F.otype.s(\"phrase\"):\n", " ws = L.d(p, otype=\"word\")\n", " if len(ws) < 2:\n", " continue\n", " fi = ws[0]\n", " la = ws[-1]\n", " if F.nu.v(fi) != F.nu.v(la):\n", " continue\n", " resultsH.append((p, fi, la))\n", "\n", "len(resultsH)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set(resultsQ) == set(resultsH)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## In STAM\n", "\n", "Challenges:\n", "\n", "### Find the first and last word in each phrase\n", "\n", "Given a phrase, we need to find its words.\n", "Well, a phrase is an annotation with key `otype` and value `phrase` and target some word annotations.\n", "These target annotations are easy to get, by means of the `annotations()` method on annotations.\n", "\n", "Then we have to find the first and last words among these targets.\n", "\n", "**That is currently difficult!**\n", "\n", "You need a concept of order between annotations.\n", "One possibility is to put sequence numbers in the data, as annotations.\n", "But that is very cumbersome, because you need to refer to yet another level of annotation.\n", "And it will inflate the data.\n", "\n", "The other possibility is \"canonical ordering\".\n", "Annotations that target the text can be ordered by their targets.\n", "A target is a subset of textual positions. Two such subsets can be ordered as follows:\n", "\n", "* if A is a subset of B then B <= A\n", "* if B is a subset of A then A <= B\n", "* if A and B are no subsets of each other, the set that has the smallest element that does not belong to the other set, is the smallest.\n", "\n", "As part of the index building, you could create the rank of each annotation in this ordering.\n", "\n", "Annotations that target annotations that are already canonically ordered, can themselves be canonically ordered w.r.t. their targets.\n", "\n", "Without this, the user will need to implement sorting in ad-hoc ways.\n", "\n", "### Retrieve values for the first and last word\n", "\n", "Given the annotations for the first and last word in a phrase, we have to find annotations with key `nu`\n", "and target these words, and read off their value.\n", "\n", "**That is currently difficult!**\n", "\n", "A way out is this:\n", "\n", "As preparation, before looping through the phrases:\n", "Make a dict that associates word annotation identifiers with `nu`-values:\n", "\n", "* retrieve all annotations that have key `nu`, for each annotation:\n", "* pick the target, it is a word annotation, pick its id and use that as key in the dict\n", "* pick the data and from that the value and use that as value in the dict\n", "\n", "Then, for each phrase with at least two words:\n", "\n", "* retrieve the first word and from there the `nu`-value for that word\n", "* retrieve the second word and from there the `nu`-value for that word\n", "* compare the two values. If they are equal, we have a hit.\n", "\n", "This can be improved if the API offers an efficient function to look up values.\n", "That could be a pre computation of all those dictionaries.\n", "\n", "Even better: those dictionaries could be the primary data!" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Current: 3.01 GB\n", "Delta: 3.01 GB\n" ] } ], "source": [ "import stam\n", "\n", "from memutil import memUsage\n", "memUsage()\n", "\n", "workDir = f\"{A.tempDir}/stam\"\n", "storeC = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.csv\")" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "aDataSet = list(storeC.annotationsets())[0]" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "def stamOtype(otype):\n", " otypeData = aDataSet.find_data(\"otype\", otype)\n", " otypeAnnos = otypeData[0].annotations()\n", " return otypeAnnos\n", "\n", "def idsOf(annos):\n", " return {a.id() for a in annos}" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "# get the word annotations, sorted, and the phrase annotations\n", "\n", "def getPos(wordAnno):\n", " t = wordAnno.textselections()[0]\n", " return (t.begin(), t.end())\n", "\n", "wordAnnos = stamOtype(\"word\")\n", "wordIds = idsOf(wordAnnos)\n", "phraseAnnos = stamOtype(\"phrase\")" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "wordAnnos = sorted(wordAnnos, key=getPos)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "# make a rank of the word annos\n", "\n", "wordRank = {anno.id(): i for (i, anno) in enumerate(wordAnnos)}" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "78754" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get the phrase annotations together with their first and last word\n", "\n", "phrases = []\n", "\n", "for pAnno in phraseAnnos:\n", " words = pAnno.annotations()\n", " if len(words) < 2:\n", " continue\n", " sortedWords = sorted(words, key=lambda x: wordRank[x.id()])\n", " phrases.append((p, words[0], words[-1]))\n", "\n", "len(phrases)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.90s 78754 results\n" ] } ], "source": [ "# intermediate check with TF\n", "\n", "query = \"\"\"\n", "phrase\n", " =: word\n", " # word\n", " :=\n", "\"\"\"\n", "results = A.search(query)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "# get the `nu` information ready\n", "# we collect a dict keyed by word id with values the grammatical number of the word\n", "\n", "nuKey = aDataSet.key(\"nu\")\n", "nuAnnos = nuKey.annotations()\n", "\n", "nuValue = {}\n", "\n", "for nuAnno in nuAnnos:\n", " value = nuAnno.data()[0].value()\n", " word = list(nuAnno.annotations())[0]\n", "\n", " nuValue[word.id()] = value\n" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "בְּ NA\n", "רֵאשִׁ֖ית sg\n", "בָּרָ֣א sg\n", "אֱלֹהִ֑ים pl\n", "אֵ֥ת NA\n", "הַ NA\n", "שָּׁמַ֖יִם pl\n", "וְ NA\n", "אֵ֥ת NA\n", "הָ NA\n", "אָֽרֶץ00 sg\n" ] } ], "source": [ "# check some values\n", "\n", "for wordAnno in wordAnnos[0:11]:\n", " print(f\"{wordAnno} {nuValue[wordAnno.id()]}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So far so good!" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "15522" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# now compute the final result\n", "\n", "resultsSTAM = [x for x in phrases if nuValue[x[1].id()] == nuValue[x[2].id()]]\n", "len(resultsSTAM)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Perfect!\n", "\n", "## Now in one go\n", "\n", "In order to see the performance, let's do this again in one go." ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "# The complete task in one go\n", "\n", "def getNicePhrases():\n", " aDataSet = list(storeC.annotationsets())[0]\n", " wordAnnos = sorted(stamOtype(\"word\"), key=getPos)\n", " wordIds = idsOf(wordAnnos)\n", " wordRank = {anno.id(): i for (i, anno) in enumerate(wordAnnos)}\n", "\n", " phraseAnnos = stamOtype(\"phrase\")\n", " phrases = []\n", "\n", " for pAnno in phraseAnnos:\n", " words = pAnno.annotations()\n", " if len(words) < 2:\n", " continue\n", " sortedWords = sorted(words, key=lambda x: wordRank[x.id()])\n", " phrases.append((p, words[0], words[-1]))\n", "\n", " nuKey = aDataSet.key(\"nu\")\n", " nuAnnos = nuKey.annotations()\n", "\n", " nuValue = {}\n", "\n", " for nuAnno in nuAnnos:\n", " value = nuAnno.data()[0].value()\n", " word = list(nuAnno.annotations())[0]\n", "\n", " nuValue[word.id()] = value\n", "\n", " results = [x for x in phrases if nuValue[x[1].id()] == nuValue[x[2].id()]]\n", " print(len(results))\n", " return results" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "15522\n" ] } ], "source": [ "resultsSTAM = getNicePhrases()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The execution times for this task were\n", "\n", "TF query | TF hand coding | STAM\n", "--- | --- | ---\n", "0.9 | 0.2 | 2.0\n", "\n", "But in STAM we can move quite a bit out of the task:\n", "\n", "1. sorting the words should be taken care of during the index building/loading when loading the STAM dataset (saves 0.3 sec)\n", "2. retrieving the `nu` value should be optimized (could save 0.9 sec)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.1" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }