{ "cells": [ { "cell_type": "markdown", "id": "5306bdc6-1d2d-4d2b-80ab-5d64efe92003", "metadata": {}, "source": [ "# Swapping of wordorders in XML source files (N1904LFT)" ] }, { "cell_type": "markdown", "id": "c9bf60c4-73e3-43b4-a60a-314ebbbf1426", "metadata": {}, "source": [ "## Table of content \n", "* 1 - Introduction\n", "* 2 - Load Text-Fabric app and data\n", "* 3 - Performing the queries\n", " * 3.1 - Swaps in wordorder" ] }, { "cell_type": "markdown", "id": "fc1cd1c6-7d00-47b4-9749-8b7f2f703fbf", "metadata": {}, "source": [ "# 1 - Introduction \n", "##### [Back to TOC](#TOC)\n", "\n", "This notebook investigate the occurence of wordorder changes in the XML source files." ] }, { "cell_type": "markdown", "id": "31b88ba9-ee5e-4dc9-ae32-83c92c6a51b6", "metadata": {}, "source": [ "# 2 - Load Text-Fabric app and data \n", "##### [Back to TOC](#TOC)" ] }, { "cell_type": "code", "execution_count": 1, "id": "4c667b6f-cdfa-4fde-87e9-4ac0c82cfe3b", "metadata": { "tags": [] }, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "id": "26fbe0c2-5a35-4ae1-8fee-f0e3553c08aa", "metadata": {}, "outputs": [], "source": [ "# Loading the Text-Fabric code\n", "# Note: it is assumed Text-Fabric is installed in your environment\n", "from tf.fabric import Fabric\n", "from tf.app import use" ] }, { "cell_type": "code", "execution_count": 3, "id": "be4fe0bd-ddc5-4c28-833a-0114c3e5ce8e", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "data": { "text/markdown": [ "**Locating corpus resources ...**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "app: ~/text-fabric-data/github/tonyjurg/Nestle1904LFT/app" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/text-fabric-data/github/tonyjurg/Nestle1904LFT/tf/0.6" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", " TF: TF API 12.2.2, tonyjurg/Nestle1904LFT/app v3, Search Reference
\n", " Data: tonyjurg - Nestle1904LFT 0.6, Character table, Feature docs
\n", "
Node types\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "
Name# of nodes# slots / node% coverage
book275102.93100
chapter260529.92100
verse794317.35100
sentence801117.20100
wg1054306.85524
word1377791.00100
\n", " Sets: no custom sets
\n", " Features:
\n", "
Nestle 1904 (Low Fat Tree)\n", "
\n", "\n", "
\n", "
\n", "after\n", "
\n", "
str
\n", "\n", " ✅ Characters (eg. punctuations) following the word\n", "\n", "
\n", "\n", "
\n", "
\n", "book\n", "
\n", "
str
\n", "\n", " ✅ Book name (in English language)\n", "\n", "
\n", "\n", "
\n", "
\n", "booknumber\n", "
\n", "
int
\n", "\n", " ✅ NT book number (Matthew=1, Mark=2, ..., Revelation=27)\n", "\n", "
\n", "\n", "
\n", "
\n", "bookshort\n", "
\n", "
str
\n", "\n", " ✅ Book name (abbreviated)\n", "\n", "
\n", "\n", "
\n", "
\n", "case\n", "
\n", "
str
\n", "\n", " ✅ Gramatical case (Nominative, Genitive, Dative, Accusative, Vocative)\n", "\n", "
\n", "\n", "
\n", "
\n", "chapter\n", "
\n", "
int
\n", "\n", " ✅ Chapter number inside book\n", "\n", "
\n", "\n", "
\n", "
\n", "clausetype\n", "
\n", "
str
\n", "\n", " ✅ Clause type details (e.g. Verbless, Minor)\n", "\n", "
\n", "\n", "
\n", "
\n", "containedclause\n", "
\n", "
str
\n", "\n", " 🆗 Contained clause (WG number)\n", "\n", "
\n", "\n", "
\n", "
\n", "degree\n", "
\n", "
str
\n", "\n", " ✅ Degree (e.g. Comparitative, Superlative)\n", "\n", "
\n", "\n", "
\n", "
\n", "gloss\n", "
\n", "
str
\n", "\n", " ✅ English gloss\n", "\n", "
\n", "\n", "
\n", "
\n", "gn\n", "
\n", "
str
\n", "\n", " ✅ Gramatical gender (Masculine, Feminine, Neuter)\n", "\n", "
\n", "\n", "
\n", "
\n", "headverse\n", "
\n", "
str
\n", "\n", " ✅ Start verse number of a sentence\n", "\n", "
\n", "\n", "
\n", "
\n", "junction\n", "
\n", "
str
\n", "\n", " ✅ Junction data related to a wordgroup\n", "\n", "
\n", "\n", "
\n", "
\n", "lemma\n", "
\n", "
str
\n", "\n", " ✅ Lexeme (lemma)\n", "\n", "
\n", "\n", "
\n", "
\n", "lex_dom\n", "
\n", "
str
\n", "\n", " ✅ Lexical domain according to Semantic Dictionary of Biblical Greek, SDBG (not present everywhere?)\n", "\n", "
\n", "\n", "
\n", "
\n", "ln\n", "
\n", "
str
\n", "\n", " ✅ Lauw-Nida lexical classification (not present everywhere?)\n", "\n", "
\n", "\n", "
\n", "
\n", "markafter\n", "
\n", "
str
\n", "\n", " 🆗 Text critical marker after word\n", "\n", "
\n", "\n", "
\n", "
\n", "markbefore\n", "
\n", "
str
\n", "\n", " 🆗 Text critical marker before word\n", "\n", "
\n", "\n", "
\n", "
\n", "markorder\n", "
\n", "
str
\n", "\n", "  Order of punctuation and text critical marker\n", "\n", "
\n", "\n", "
\n", "
\n", "monad\n", "
\n", "
int
\n", "\n", " ✅ Monad (smallest token matching word order in the corpus)\n", "\n", "
\n", "\n", "
\n", "
\n", "mood\n", "
\n", "
str
\n", "\n", " ✅ Gramatical mood of the verb (passive, etc)\n", "\n", "
\n", "\n", "
\n", "
\n", "morph\n", "
\n", "
str
\n", "\n", " ✅ Morphological tag (Sandborg-Petersen morphology)\n", "\n", "
\n", "\n", "
\n", "
\n", "nodeID\n", "
\n", "
str
\n", "\n", " ✅ Node ID (as in the XML source data)\n", "\n", "
\n", "\n", "
\n", "
\n", "normalized\n", "
\n", "
str
\n", "\n", " ✅ Surface word with accents normalized and trailing punctuations removed\n", "\n", "
\n", "\n", "
\n", "
\n", "nu\n", "
\n", "
str
\n", "\n", " ✅ Gramatical number (Singular, Plural)\n", "\n", "
\n", "\n", "
\n", "
\n", "number\n", "
\n", "
str
\n", "\n", " ✅ Gramatical number of the verb (e.g. singular, plural)\n", "\n", "
\n", "\n", "
\n", "
\n", "otype\n", "
\n", "
str
\n", "\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", "person\n", "
\n", "
str
\n", "\n", " ✅ Gramatical person of the verb (first, second, third)\n", "\n", "
\n", "\n", "
\n", "
\n", "punctuation\n", "
\n", "
str
\n", "\n", " ✅ Punctuation after word\n", "\n", "
\n", "\n", "
\n", "
\n", "ref\n", "
\n", "
str
\n", "\n", " ✅ Value of the ref ID (taken from XML sourcedata)\n", "\n", "
\n", "\n", "
\n", "
\n", "reference\n", "
\n", "
str
\n", "\n", " ✅ Reference (to nodeID in XML source data, not yet post-processes)\n", "\n", "
\n", "\n", "
\n", "
\n", "roleclausedistance\n", "
\n", "
str
\n", "\n", " ⚠️ Distance to the wordgroup defining the syntactical role of this word\n", "\n", "
\n", "\n", "
\n", "
\n", "sentence\n", "
\n", "
int
\n", "\n", " ✅ Sentence number (counted per chapter)\n", "\n", "
\n", "\n", "
\n", "
\n", "sp\n", "
\n", "
str
\n", "\n", " ✅ Part of Speech (abbreviated)\n", "\n", "
\n", "\n", "
\n", "
\n", "sp_full\n", "
\n", "
str
\n", "\n", " ✅ Part of Speech (long description)\n", "\n", "
\n", "\n", "
\n", "
\n", "strongs\n", "
\n", "
str
\n", "\n", " ✅ Strongs number\n", "\n", "
\n", "\n", "
\n", "
\n", "subj_ref\n", "
\n", "
str
\n", "\n", " 🆗 Subject reference (to nodeID in XML source data, not yet post-processes)\n", "\n", "
\n", "\n", "
\n", "
\n", "tense\n", "
\n", "
str
\n", "\n", " ✅ Gramatical tense of the verb (e.g. Present, Aorist)\n", "\n", "
\n", "\n", "
\n", "
\n", "type\n", "
\n", "
str
\n", "\n", " ✅ Gramatical type of noun or pronoun (e.g. Common, Personal)\n", "\n", "
\n", "\n", "
\n", "
\n", "unicode\n", "
\n", "
str
\n", "\n", " ✅ Word as it apears in the text in Unicode (incl. punctuations)\n", "\n", "
\n", "\n", "
\n", "
\n", "verse\n", "
\n", "
int
\n", "\n", " ✅ Verse number inside chapter\n", "\n", "
\n", "\n", "
\n", "
\n", "voice\n", "
\n", "
str
\n", "\n", " ✅ Gramatical voice of the verb (e.g. active,passive)\n", "\n", "
\n", "\n", "
\n", "
\n", "wgclass\n", "
\n", "
str
\n", "\n", " ✅ Class of the wordgroup (e.g. cl, np, vp)\n", "\n", "
\n", "\n", "
\n", "
\n", "wglevel\n", "
\n", "
int
\n", "\n", " 🆗 Number of the parent wordgroups for a wordgroup\n", "\n", "
\n", "\n", "
\n", "
\n", "wgnum\n", "
\n", "
int
\n", "\n", " ✅ Wordgroup number (counted per book)\n", "\n", "
\n", "\n", "
\n", "
\n", "wgrole\n", "
\n", "
str
\n", "\n", " ✅ Syntactical role of the wordgroup (abbreviated)\n", "\n", "
\n", "\n", "
\n", "
\n", "wgrolelong\n", "
\n", "
str
\n", "\n", " ✅ Syntactical role of the wordgroup (full)\n", "\n", "
\n", "\n", "
\n", "
\n", "wgrule\n", "
\n", "
str
\n", "\n", " ✅ Wordgroup rule information (e.g. Np-Appos, ClCl2, PrepNp)\n", "\n", "
\n", "\n", "
\n", "
\n", "wgtype\n", "
\n", "
str
\n", "\n", " ✅ Wordgroup type details (e.g. group, apposition)\n", "\n", "
\n", "\n", "
\n", "
\n", "word\n", "
\n", "
str
\n", "\n", " ✅ Word as it appears in the text (excl. punctuations)\n", "\n", "
\n", "\n", "
\n", "
\n", "wordlevel\n", "
\n", "
str
\n", "\n", " 🆗 Number of the parent wordgroups for a word\n", "\n", "
\n", "\n", "
\n", "
\n", "wordrole\n", "
\n", "
str
\n", "\n", " ✅ Syntactical role of the word (abbreviated)\n", "\n", "
\n", "\n", "
\n", "
\n", "wordrolelong\n", "
\n", "
str
\n", "\n", " ✅ Syntactical role of the word (full)\n", "\n", "
\n", "\n", "
\n", "
\n", "wordtranslit\n", "
\n", "
str
\n", "\n", " 🆗 Transliteration of the text (in latin letters, excl. punctuations)\n", "\n", "
\n", "\n", "
\n", "
\n", "wordunacc\n", "
\n", "
str
\n", "\n", " ✅ Word without accents (excl. punctuations)\n", "\n", "
\n", "\n", "
\n", "
\n", "oslots\n", "
\n", "
none
\n", "\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", "\n", " Settings:
specified
  1. apiVersion: 3
  2. appName: tonyjurg/Nestle1904LFT
  3. appPath:C:/Users/tonyj/text-fabric-data/github/tonyjurg/Nestle1904LFT/app
  4. commit: e68bd68c7c4c862c1464d995d51e27db7691254f
  5. css: ''
  6. dataDisplay:
    • excludedFeatures:
      • orig_order
      • verse
      • book
      • chapter
    • noneValues:
      • none
      • unknown
      • no value
      • NA
      • ''
    • showVerseInTuple: 0
    • textFormat: text-orig-full
  7. docs:
    • docBase: https://github.com/tonyjurg/Nestle1904LFT/blob/main/docs/
    • docPage: about
    • docRoot: https://github.com/tonyjurg/Nestle1904LFT
    • featureBase:https://github.com/tonyjurg/Nestle1904LFT/blob/main/docs/features/<feature>.md
  8. interfaceDefaults: {fmt: layout-orig-full}
  9. isCompatible: True
  10. local: local
  11. localDir:C:/Users/tonyj/text-fabric-data/github/tonyjurg/Nestle1904LFT/_temp
  12. provenanceSpec:
    • corpus: Nestle 1904 (Low Fat Tree)
    • doi: 10.5281/zenodo.10182594
    • org: tonyjurg
    • relative: /tf
    • repo: Nestle1904LFT
    • repro: Nestle1904LFT
    • version: 0.6
    • webBase: https://learner.bible/text/show_text/nestle1904/
    • webHint: Show this on the Bible Online Learner website
    • webLang: en
    • webUrl:https://learner.bible/text/show_text/nestle1904/<1>/<2>/<3>
    • webUrlLex: {webBase}/word?version={version}&id=<lid>
  13. release: v0.6
  14. typeDisplay:
    • book:
      • condense: True
      • hidden: True
      • label: {book}
      • style: ''
    • chapter:
      • condense: True
      • hidden: True
      • label: {chapter}
      • style: ''
    • sentence:
      • hidden: 0
      • label: #{sentence} (start: {book} {chapter}:{headverse})
      • style: ''
    • verse:
      • condense: True
      • excludedFeatures: chapter verse
      • label: {book} {chapter}:{verse}
      • style: ''
    • wg:
      • hidden: 0
      • label:#{wgnum}: {wgtype} {wgclass} {clausetype} {wgrole} {wgrule} {junction}
      • style: ''
    • word:
      • base: True
      • features: lemma
      • featuresBare: gloss
      • surpress: chapter verse
  15. writing: grc
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
TF API: names N F E L T S C TF Fs Fall Es Eall Cs Call directly usable

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# load the N1904 app and data\n", "N1904 = use (\"tonyjurg/Nestle1904LFT\", version=\"0.6\", hoist=globals())" ] }, { "cell_type": "code", "execution_count": 4, "id": "d023ea5a-aa1e-417c-a331-a980097d4c81", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# The following will push the Text-Fabric stylesheet to this notebook (to facilitate proper display with notebook viewer)\n", "N1904.dh(N1904.getCss())" ] }, { "cell_type": "markdown", "id": "39fec5a4-f3d5-4bdf-a4eb-44c0420ea6f4", "metadata": {}, "source": [ "# 3 - Performing the queries " ] }, { "cell_type": "markdown", "id": "c9de05f1-95b1-4fad-a99c-45358b8bab7c", "metadata": {}, "source": [ "## 3.1 - Swaps in wordorder\n", "##### [Back to TOC](#TOC)" ] }, { "cell_type": "markdown", "id": "3f372314-d93a-442e-afeb-f81b58460f2a", "metadata": {}, "source": [ "Differences in wordorder between TF and XML source data" ] }, { "cell_type": "code", "execution_count": 65, "id": "a5c06c66-49cd-4e8c-a1b7-3ee8ddcdd588", "metadata": { "tags": [] }, "outputs": [ { "ename": "AttributeError", "evalue": "'NodeFeatures' object has no attribute 'orig_order'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[65], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m monad\u001b[38;5;241m=\u001b[39mF\u001b[38;5;241m.\u001b[39mmonad\u001b[38;5;241m.\u001b[39mv(node)\n\u001b[0;32m 6\u001b[0m \u001b[38;5;66;03m# too bad... it is not in the dataset anymore :(\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m orig_order\u001b[38;5;241m=\u001b[39mF\u001b[38;5;241m.\u001b[39morig_order\u001b[38;5;241m.\u001b[39mv(node)\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m monad \u001b[38;5;241m!=\u001b[39m orig_order:\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m (node)\n", "\u001b[1;31mAttributeError\u001b[0m: 'NodeFeatures' object has no attribute 'orig_order'" ] } ], "source": [ "#range:\n", "#print (F.otype.sInterval(\"word\"))\n", "\n", "for node in F.otype.s(\"word\"):\n", " monad=F.monad.v(node)\n", " orig_order=F.orig_order.v(node)\n", " # too bad... it is not in the dataset anymore :( -> needs to be added in next release again\n", " if monad != orig_order:\n", " print (node)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "6c65cb16-d0c4-43b0-b496-036086a16b29", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }