{ "cells": [ { "cell_type": "markdown", "id": "cb10925c-cccb-420e-be3b-b5ca49ad5cf5", "metadata": { "tags": [] }, "source": [ "# Compare class with morph where morph=ADV (N1904LFT)" ] }, { "cell_type": "markdown", "id": "a5e4bdf3-b108-4c6a-b99b-a4bf4723afee", "metadata": {}, "source": [ "The following script reads the XML attributes class and morph of every w tag and compares them. It analyses the words whose morph attribute is 'adv' while the class attribute is not 'adv'. It first prints a number of examples (with verse/word location) and finishes with a table showing the frequency of the cases where morph=adv does not match class=adv." ] }, { "cell_type": "code", "execution_count": 16, "id": "aa00a05d-0251-4510-87b3-94a79dbb4b9f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Comparing atributes class morph for file xml/20230628/01-matthew.xml\n", "\n", "Result:\n", "\n", "At ref=MAT 1:17!7 found class=prep and morph=ADV for lemma=ἕως\n", "At ref=MAT 1:17!14 found class=prep and morph=ADV for lemma=ἕως\n", "At ref=MAT 1:17!25 found class=prep and morph=ADV for lemma=ἕως\n", "At ref=MAT 1:24!9 found class=conj and morph=ADV for lemma=ὡς\n", "At ref=MAT 1:25!5 found class=prep and morph=ADV for lemma=ἕως\n", "At ref=MAT 2:8!18 found class=conj and morph=ADV for lemma=ὅπως\n", "At ref=MAT 2:9!18 found class=conj and morph=ADV for lemma=ἕως\n", "At ref=MAT 2:9!21 found class=prep and morph=ADV for lemma=ἐπάνω\n", "At ref=MAT 2:13!28 found class=conj and morph=ADV for lemma=ἕως\n", "At ref=MAT 2:15!4 found class=prep and morph=ADV for lemma=ἕως\n", "╒═══════════════════════════════════════╤═════════════╕\n", "│ lemma, morph, class │ frequency │\n", "╞═══════════════════════════════════════╪═════════════╡\n", "│ lemma=ὡς, morph=ADV, class=conj │ 40 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ἕως, morph=ADV, class=prep │ 35 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ 
lemma=ὅπως, morph=ADV, class=conj │ 17 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ἕως, morph=ADV, class=conj │ 14 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ὅτε, morph=ADV, class=conj │ 12 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ὥσπερ, morph=ADV, class=conj │ 10 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ἐπάνω, morph=ADV, class=prep │ 8 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ὕστερος, morph=ADV, class=adj │ 7 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=μόνος, morph=ADV, class=adj │ 7 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ὅπου, morph=ADV, class=conj │ 6 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ὀπίσω, morph=ADV, class=prep │ 5 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=εὐθύς, morph=ADV, class=adj │ 5 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=πλήν, morph=ADV, class=conj │ 5 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=πέραν, morph=ADV, class=prep │ 3 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ταχύς, morph=ADV, class=adj │ 3 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ἔξω, morph=ADV, class=prep │ 3 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=χωρίς, morph=ADV, class=prep │ 3 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ὅθεν, morph=ADV, class=conj │ 3 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=καθώς, morph=ADV, class=conj │ 3 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ὡσεί, morph=ADV, class=conj │ 2 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=μέχρι, morph=ADV, class=prep │ 2 │\n", 
"├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ἅμα, morph=ADV, class=prep │ 2 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=μεταξύ, morph=ADV, class=prep │ 2 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=παρεκτός, morph=ADV, class=prep │ 1 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=μακράν, morph=ADV, class=adj │ 1 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=πῶς, morph=ADV, class=conj │ 1 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ὄπισθεν, morph=ADV, class=prep │ 1 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=οὗ, morph=ADV, class=conj │ 1 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ὑποκάτω, morph=ADV, class=prep │ 1 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ἄχρι, morph=ADV, class=prep │ 1 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=καθά, morph=ADV, class=conj │ 1 │\n", "├───────────────────────────────────────┼─────────────┤\n", "│ lemma=ὀψέ, morph=ADV, class=prep │ 1 │\n", "╘═══════════════════════════════════════╧═════════════╛\n" ] } ], "source": [
 "import os\n",
 "import xml.etree.ElementTree as ET\n",
 "from collections import Counter\n",
 "\n",
 "from tabulate import tabulate\n",
 "\n",
 "# Frequency of every lemma/morph/class combination found so far.\n",
 "# It is re-created on every run of this cell, so re-running does not double the counts.\n",
 "ResultDict = Counter()\n",
 "\n",
 "\n",
 "def compare_class_and_morph(file_path, number_examples=None):\n",
 "    \"\"\"Count the words whose morph is 'ADV' while their class is not 'adv'.\n",
 "\n",
 "    Every w element of the XML file is inspected. Each mismatch is counted in\n",
 "    the module-level ResultDict under the key 'lemma=..., morph=..., class=...'.\n",
 "    The first mismatches are also printed together with their ref attribute.\n",
 "\n",
 "    Parameters:\n",
 "        file_path: path of the XML file to parse.\n",
 "        number_examples: maximum number of mismatches to print; when None the\n",
 "            module-level NumberExamples is used.\n",
 "\n",
 "    Returns:\n",
 "        ResultDict, the Counter holding the accumulated frequencies.\n",
 "\n",
 "    Raises:\n",
 "        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.\n",
 "    \"\"\"\n",
 "    if number_examples is None:\n",
 "        number_examples = NumberExamples\n",
 "    ExampleNumber = 0\n",
 "    root = ET.parse(file_path).getroot()\n",
 "\n",
 "    for w_tag in root.iter('w'):\n",
 "        # Extract attributes class and morph for the tag w\n",
 "        class_attr = w_tag.get('class')\n",
 "        morph_attr = w_tag.get('morph')\n",
 "\n",
 "        # get() returns None when the attribute is absent; such a word is skipped\n",
 "        # instead of crashing on None.lower(). morph is compared case-insensitively,\n",
 "        # class is compared case-sensitively.\n",
 "        if morph_attr is None or morph_attr.lower() != 'adv' or class_attr == 'adv':\n",
 "            continue\n",
 "\n",
 "        lemma_attr = w_tag.get('lemma')\n",
 "        ref_attr = w_tag.get('ref')\n",
 "        ExampleNumber += 1\n",
 "\n",
 "        # A Counter starts missing keys at 0, so no existence check is needed\n",
 "        Mapping = f\"lemma={lemma_attr}, morph={morph_attr}, class={class_attr}\"\n",
 "        ResultDict[Mapping] += 1\n",
 "        if ExampleNumber <= number_examples:\n",
 "            print(f\"At ref={ref_attr} found class={class_attr} and morph={morph_attr} for lemma={lemma_attr}\")\n",
 "\n",
 "    return ResultDict\n",
 "\n",
 "\n",
 "# Following variable should contain the relative path and name of file to check\n",
 "InputFile = \"xml/20230628/01-matthew.xml\"\n",
 "# How many differences to show prior to table\n",
 "NumberExamples = 10\n",
 "\n",
 "# First check that the path is an existing file, then analyze its content\n",
 "if os.path.isfile(InputFile):\n",
 "    print(f\"Comparing atributes class morph for file {InputFile}\\n\\nResult:\\n\\n\", end=\"\")\n",
 "    differences = compare_class_and_morph(InputFile)\n",
 "\n",
 "    # most_common() orders by descending frequency; equal counts keep first-seen order\n",
 "    TableData = [[key, value] for key, value in differences.most_common()]\n",
 "\n",
 "    # Produce the table\n",
 "    headers = [\"lemma, morph, class\", \"frequency\"]\n",
 "    print(tabulate(TableData, headers=headers, tablefmt='fancy_grid'))\n",
 "else:\n",
 "    print(f\"Could not find file {InputFile}.\")"
 ] }, { "cell_type": "code", "execution_count": null, "id": "e6095ea9-0f56-4526-a49a-547180f64b46", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }