{
"cells": [
{
"cell_type": "markdown",
"id": "9a869eb5-7a9e-4e22-b933-7bdbfdc6974a",
"metadata": {},
"source": [
"# Calculate V, S, O order (N1904GBI)"
]
},
{
"cell_type": "markdown",
"id": "8f61a637-f07e-4c3e-a76c-1b2019bd83c9",
"metadata": {
"tags": []
},
"source": [
"## Table of contents <a id=\"TOC\"></a>\n",
"* 1 - Introduction\n",
"* 2 - Create sum of orders\n",
"* 3 - References"
]
},
{
"cell_type": "markdown",
"id": "fc5ded12-6f05-442c-8bb5-541d0468723b",
"metadata": {},
"source": [
"# 1 - Introduction \n",
"##### [Back to TOC](#TOC)"
]
},
{
"cell_type": "markdown",
"id": "c6aa9f28-7c84-4dc6-b5a0-2c507b395e94",
"metadata": {},
"source": [
"This notebook investigates the order in which clause-level constituents (e.g. verb, subject and object: V, S, O) occur.\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "4ed069ce-4af6-40c9-bdb6-2737f8742fda",
"metadata": {},
"source": [
"Testing dataset: N1904 treebank (GBI)\n"
]
},
{
"cell_type": "markdown",
"id": "9c38fa20-5e1a-44d5-98f4-34d62d42c0ae",
"metadata": {},
"source": [
"# 2 - Create sum of orders\n",
"##### [Back to TOC](#TOC) "
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d2024bb2-4728-4810-abfd-726499c74430",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Standard library\n",
"import os\n",
"import pickle\n",
"import re # used for regular expressions\n",
"import sys\n",
"import time\n",
"import xml.etree.ElementTree as ET\n",
"from collections import Counter\n",
"from os import listdir\n",
"from os.path import isfile, join\n",
"\n",
"# Third-party\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "581c1806-99b1-42ec-874a-fa7b3cd97086",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Folder that contains the 'inputfiles' directory with the treebank XML files.\n",
"# Defaults to the original author's local folder; set the N1904_BASE_DIR\n",
"# environment variable to use another location without editing the notebook.\n",
"BaseDir = os.environ.get(\n",
"    'N1904_BASE_DIR',\n",
"    'C:\\\\Users\\\\tonyj\\\\my_new_Jupyter_folder\\\\test_of_xml_etree\\\\',\n",
")\n",
"# os.path.join copes with BaseDir values with or without a trailing separator\n",
"InputDir = os.path.join(BaseDir, 'inputfiles')\n",
"\n",
"# Parse one book up front (presumably a quick check that the path is valid -- TODO confirm)\n",
"bo = '26-jude'\n",
"InputFile = os.path.join(InputDir, f'{bo}.xml')\n",
"tree = ET.parse(InputFile)\n",
"root = tree.getroot()\n",
"\n",
"# Dictionary to store transition frequencies\n",
"transition_frequencies = {}"
]
},
{
"cell_type": "markdown",
"id": "d9f1d9fb-aa7f-4014-9ec6-111e9db5c79f",
"metadata": {},
"source": [
"Multiple sets of books are defined here, making it possible to determine variations between them."
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "a544f78c-6be8-4a13-b26d-f17eecfea8af",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"booklist = ['01-matthew', '02-mark', '03-luke', '04-john', '05-acts', '06-romans',\n",
" '07-1corinthians','08-2corinthians', '09-galatians', '10-ephesians',\n",
" '11-philippians', '12-colossians', '13-1thessalonians', '14-2thessalonians',\n",
" '15-1timothy', '16-2timothy', '17-titus', '18-philemon', '19-hebrews', \n",
" '20-james', '21-1peter', '22-2peter', '23-1john', '24-2john', '25-3john',\n",
" '26-jude', '27-revelation']\n",
"paullist= ['06-romans', '07-1corinthians','08-2corinthians', '09-galatians', '10-ephesians',\n",
" '11-philippians', '12-colossians', '13-1thessalonians', '14-2thessalonians',\n",
" '15-1timothy', '16-2timothy', '17-titus', '18-philemon']\n",
"peterlist= ['21-1peter', '22-2peter']\n",
"lukelist= ['03-luke','05-acts']\n",
"johnlist = ['23-1john', '24-2john', '25-3john']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10c4098e-3ced-4fdb-9cc7-d7771ba16dea",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Print the category and rule of every clause node (Cat == \"CL\") in the books of paullist.\n",
"# Relies on os, ET, InputDir and paullist from the cells above.\n",
"for bo in paullist:\n",
"    InputFile = os.path.join(InputDir, f'{bo}.xml')\n",
"    print (f'Reading file {InputFile}')\n",
"\n",
"    # Load the XML file\n",
"    tree = ET.parse(InputFile)\n",
"    root = tree.getroot()\n",
"\n",
"    # Visit every 'Node' found at any depth below every 'Tree' element.\n",
"    # The loop variable is not called 'tree', so the parsed document is not shadowed.\n",
"    for tree_element in root.findall('.//Tree'):\n",
"        for node in tree_element.findall('.//Node'):\n",
"            node_cat = node.get('Cat')\n",
"            node_rule = node.get('Rule')\n",
"            if node_cat == \"CL\":\n",
"                print (node_cat,node_rule)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a3d39d-99fe-4050-b1cf-e6f3d5c60fba",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Averages for each separate transition (i.e. all rules sum up to p=1 per starting condition)\n",
"\n",
"def addParentInfo(parent, element):\n",
"    \"\"\"Store a reference to the parent element on every descendant of `element`.\n",
"\n",
"    Each direct child of `element` gets `parent` stored under attrib['parent'];\n",
"    every deeper descendant gets its own direct parent element.  The stored\n",
"    value is the Element object itself (or None), not a string.\n",
"    \"\"\"\n",
"    for child in element:\n",
"        child.attrib['parent'] = parent\n",
"        addParentInfo(child, child)\n",
"\n",
"def getParent(element):\n",
"    \"\"\"Return the value stored by addParentInfo, or None when there is none.\"\"\"\n",
"    return element.attrib.get('parent')\n",
"\n",
"# Dictionary to store transition frequencies, keyed by (parent category, node category)\n",
"transition_frequencies = {}\n",
"total_transitions = 0\n",
"\n",
"# Dictionary to store transitions grouped by their 'from' value\n",
"grouped_transitions = {}\n",
"print('loading books ',end='')\n",
"\n",
"for bo in johnlist:\n",
"    InputFile = os.path.join(InputDir, f'{bo}.xml')\n",
"    print ('.',end='')\n",
"\n",
"    # Load the XML file\n",
"    tree = ET.parse(InputFile)\n",
"    root = tree.getroot()\n",
"\n",
"    # Add 'parent' attribute to each child element\n",
"    addParentInfo(None, root)\n",
"\n",
"    # Visit every 'Node' found at any depth below every 'Tree' element\n",
"    for tree_element in root.findall('.//Tree'):\n",
"        for node in tree_element.findall('.//Node'):\n",
"            # A node without child elements is counted as a terminal ('Term')\n",
"            node_cat = node.get('Cat') if len(node) > 0 else 'Term'\n",
"\n",
"            parent_node = getParent(node)\n",
"            if parent_node is None:\n",
"                continue\n",
"\n",
"            parent_cat = parent_node.get('Cat')\n",
"            # Skip nodes whose parent carries no 'Cat' attribute: no transition\n",
"            # is recorded for them.\n",
"            if parent_cat is None and node_cat is not None:\n",
"                continue\n",
"\n",
"            # Combine parent and current category to form the transition\n",
"            transition = (parent_cat, node_cat)\n",
"\n",
"            # Update the frequency count in the dictionary\n",
"            total_transitions += 1\n",
"            transition_frequencies[transition] = transition_frequencies.get(transition, 0) + 1\n",
"\n",
"print (f'\\nFinished\\tNumber of transitions: {total_transitions}\\n')\n",
"\n",
"# Group transitions by their 'from' value\n",
"for (from_value, to_value), frequency in transition_frequencies.items():\n",
"    grouped_transitions.setdefault(from_value, []).append((from_value, to_value, frequency))\n",
"\n",
"# Print one table per starting condition, most frequent transition first\n",
"for from_value, transitions in grouped_transitions.items():\n",
"    print(f\"Transition table for starting condition: {from_value}\")\n",
"    print(\"From\\tTo\\tOcc.\\tWeigth\")\n",
"\n",
"    # Sort transitions based on frequency in descending order\n",
"    sorted_transitions = sorted(transitions, key=lambda x: x[2], reverse=True)\n",
"\n",
"    # Total occurrences for the current table; every group holds at least one\n",
"    # transition, so this is never zero\n",
"    total_occurrences = sum(occurrence for _, _, occurrence in sorted_transitions)\n",
"\n",
"    for from_val, to_val, frequency in sorted_transitions:\n",
"        # Share of this transition within its starting condition\n",
"        average_occurrence = frequency / total_occurrences\n",
"        print(f'{from_val}\\t{to_val}\\t{frequency}\\t{average_occurrence:.4}')\n",
"\n",
"    print('\\n')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "192d6936-d9ed-40fb-a0e8-9f22f8c8fa30",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading the inputfiles ...........................\n",
"\n",
"Frequency Table:\n",
"Node Rule Frequency \n",
"------------------------------\n",
"V-O 2964 \n",
"V-ADV 1870 \n",
"ADV-V 1371 \n",
"O-V 1158 \n",
"that-VP 978 \n",
"S-V 863 \n",
"V-S 767 \n",
"ADV-V-O 743 \n",
"V-IO 651 \n",
"V-O-ADV 599 \n",
"S-V-O 572 \n",
"S-V-ADV 531 \n",
"ADV-V-ADV 491 \n",
"S-ADV-V 480 \n",
"S-VC-P 427 \n",
"V-S-ADV 352 \n",
"S-P 345 \n",
"V-ADV-ADV 314 \n",
"P-VC 312 \n",
"O-V-ADV 306 \n",
"VC-P 282 \n",
"S-P-VC 270 \n",
"ADV-V-S 268 \n",
"P-S 263 \n",
"P-VC-S 236 \n",
"S-O-V 216 \n",
"S-ADV-V-O 212 \n",
"ADV-S-V 211 \n",
"ADV-ADV-V 200 \n",
"O-ADV-V 197 \n",
"V-IO-S 196 \n",
"S-ADV-V-ADV 185 \n",
"S-V-O-ADV 181 \n",
"ADV-O-V 174 \n",
"V-IO-O 168 \n",
"V-S-O 165 \n",
"V-ADV-O 152 \n",
"ADV-V-O-ADV 143 \n",
"V-ADV-S 143 \n",
"S-V-ADV-ADV 137 \n",
"S-V-IO 128 \n",
"O-V-IO 113 \n",
"V-O-S 110 \n",
"O-V-S 109 \n",
"ADV-V-IO 106 \n",
"S-ADV 103 \n",
"ADV-S-V-ADV 100 \n",
"VC-S-P 99 \n",
"ADV-V-ADV-ADV 97 \n",
"O-S-V 91 \n",
"ADV-V-S-ADV 91 \n",
"ADV-ADV-V-O 90 \n",
"ADV-ADV-V-ADV 89 \n",
"V-O-ADV-ADV 88 \n",
"ADV-S 84 \n",
"ADV-ADV 76 \n",
"V-O-IO 72 \n",
"ADV-S-V-O 71 \n",
"P-VC-ADV 70 \n",
"S-P-ADV 66 \n",
"V-S-ADV-ADV 66 \n",
"ADV-P 66 \n",
"IO-V 65 \n",
"ADV-VC-P 62 \n",
"ADV-S-ADV-V 61 \n",
"S-IO 61 \n",
"ADV-V-S-O 59 \n",
"ADV-O 58 \n",
"V-IO-ADV 58 \n",
"S-P-VC-ADV 57 \n",
"S-ADV-VC-P 56 \n",
"VC-P-ADV 54 \n",
"V-O-O2 52 \n",
"ADV-S-V-IO 52 \n",
"O-V-ADV-ADV 52 \n",
"S-ADV-ADV-V 51 \n",
"S-O-V-ADV 50 \n",
"ADV-P-VC 49 \n",
"P-ADV 48 \n",
"V-ADV-ADV-ADV 46 \n",
"VC-P-S 46 \n",
"S-ADV-V-O-ADV 45 \n",
"S-VC-P-ADV 45 \n",
"P-S-VC 44 \n",
"S-ADV-V-IO 43 \n",
"S-O-ADV-V 43 \n",
"O-ADV 41 \n",
"V-ADV-S-ADV 40 \n",
"S-ADV-V-ADV-ADV 39 \n",
"ADV-O-V-ADV 37 \n",
"S-ADV-P 37 \n",
"ADV-V-IO-O 36 \n",
"ADV-V-ADV-S 35 \n",
"V-S-IO 34 \n",
"S-ADV-O-V 34 \n",
"V-S-O-ADV 33 \n",
"O-V-S-ADV 33 \n",
"S-V-IO-O 32 \n",
"O-ADV-V-ADV 31 \n",
"O-IO-V 30 \n",
"ADV-VC-S-P 29 \n",
"P-ADV-VC 29 \n",
"ADV-S-ADV-V-ADV 28 \n",
"S-O 27 \n",
"ADV-S-V-ADV-ADV 27 \n",
"IO-V-O 27 \n",
"ADV-S-V-O-ADV 27 \n",
"ADV-V-ADV-O 26 \n",
"ADV-ADV-V-S 26 \n",
"ADV-ADV-ADV-V 26 \n",
"ADV-V-O-S 25 \n",
"S-IO-V 25 \n",
"O-S-ADV-V 25 \n",
"S-ADV-ADV-V-O 24 \n",
"S-VC-ADV-P 24 \n",
"V-O-S-ADV 24 \n",
"ADV-ADV-O-V 24 \n",
"ADV-ADV-S-V 23 \n",
"ADV-S-P-VC 23 \n",
"ADV-S-ADV-V-O 23 \n",
"S-P-ADV-VC 22 \n",
"O-V-IO-ADV 22 \n",
"ADV-V-IO-ADV 22 \n",
"IO-S-ADV 22 \n",
"O2-V-O 21 \n",
"V-ADV-O-ADV 21 \n",
"S-V-ADV-O 21 \n",
"O-S-V-ADV 21 \n",
"ADV-V-IO-S 20 \n",
"O-S-V-IO 20 \n",
"VC-S-P-ADV 20 \n",
"S-VC 20 \n",
"ADV-S-P 20 \n",
"IO-O 19 \n",
"ADV-V-O-IO 18 \n",
"ADV-ADV-V-O-ADV 18 \n",
"P-S-ADV 18 \n",
"S-ADV-P-VC 18 \n",
"ADV-V-ADV-ADV-ADV 18 \n",
"ADV-ADV-V-ADV-ADV 18 \n",
"V-S-ADV-ADV-ADV 18 \n",
"ADV-V-S-O-ADV 17 \n",
"P-ADV-S 17 \n",
"S-V-ADV-ADV-ADV 17 \n",
"ADV-O-ADV-V 17 \n",
"V-IO-S-ADV 17 \n",
"ADV-VC-P-S 17 \n",
"O-V-IO-S 17 \n",
"V-IO-O-ADV 16 \n",
"S-V-IO-ADV 16 \n",
"P-VC-S-ADV 16 \n",
"VC-ADV-P 16 \n",
"ADV-ADV-ADV 15 \n",
"V-S-ADV-O 15 \n",
"ADV-V-O-ADV-ADV 15 \n",
"O-ADV-ADV-V 15 \n",
"IO-O-V 15 \n",
"S-ADV-ADV-V-ADV 15 \n",
"ADV-P-S 15 \n",
"ADV-V-S-ADV-ADV 14 \n",
"ADV-P-VC-S 14 \n",
"ADV-P-VC-ADV 14 \n",
"ADV-ADV-V-IO 13 \n",
"ADV-S-O-V 12 \n",
"ADV-S-VC-P 12 \n",
"IO-V-S 12 \n",
"V-O-IO-ADV 12 \n",
"S-V-O-ADV-ADV 12 \n",
"O2-O-V 12 \n",
"V-O-ADV-ADV-ADV 11 \n",
"O-O2-V 11 \n",
"V-IO-S-O 11 \n",
"V-IO-ADV-O 11 \n",
"ADV-IO-V 11 \n",
"O-V-ADV-ADV-ADV 11 \n",
"V-ADV-ADV-S 11 \n",
"S-P-ADV-ADV 11 \n",
"S-ADV-ADV 10 \n",
"ADV-V-O-S-ADV 10 \n",
"ADV-VC-P-ADV 10 \n",
"O-V-O2 10 \n",
"P-VC-ADV-ADV 10 \n",
"S-V-O-O2 10 \n",
"ADV-ADV-ADV-V-O 10 \n",
"ADV-ADV-V-S-ADV 10 \n",
"ADV-P-ADV 10 \n",
"S-O-V-ADV-ADV 9 \n",
"O-V-S-ADV-ADV 9 \n",
"ADV-V-ADV-S-ADV 9 \n",
"ADV-O-V-IO 9 \n",
"S-IO-ADV 9 \n",
"P-ADV-VC-S 9 \n",
"S-O-V-O2 9 \n",
"S-V-O-IO 9 \n",
"ADV-ADV-ADV-V-ADV 9 \n",
"V-ADV-IO 9 \n",
"IO-S 9 \n",
"P-VC-ADV-S 9 \n",
"ADV-V-O-O2 9 \n",
"ADV-VC-S-P-ADV 8 \n",
"ADV-V-ADV-O-ADV 8 \n",
"ADV-ADV-S-V-IO 8 \n",
"O-ADV-V-S 8 \n",
"ADV-S-ADV 8 \n",
"V-IO-ADV-ADV 8 \n",
"ADV-IO-V-O 8 \n",
"ADV-O-V-O2 8 \n",
"S-ADV-ADV-V-O-ADV 8 \n",
"IO-S-V 8 \n",
"S-ADV-ADV-ADV-V 8 \n",
"ADV-V-IO-O-ADV 7 \n",
"O-O2 7 \n",
"V-O2-O 7 \n",
"S-ADV-V-IO-O 7 \n",
"S-IO-V-O 7 \n",
"V-O-S-O2 7 \n",
"S-ADV-ADV-V-ADV-ADV 7 \n",
"S-ADV-V-ADV-O 7 \n",
"O-IO 7 \n",
"O-ADV-ADV 7 \n",
"ADV-ADV-P-VC 7 \n",
"ADV-ADV-S-V-ADV 7 \n",
"ADV-VC 7 \n",
"ADV-ADV-VC-P 7 \n",
"O-ADV-V-IO 7 \n",
"V-O-O2-ADV 7 \n",
"ADV-S-ADV-V-ADV-ADV 7 \n",
"ADV-ADV-P 7 \n",
"S-IO-V-ADV 6 \n",
"ADV-S-O-V-ADV 6 \n",
"IO-ADV-V 6 \n",
"S-O-V-IO 6 \n",
"ADV-ADV-S-V-O 6 \n",
"O-ADV-S-V 6 \n",
"S-P-VC-ADV-ADV 6 \n",
"VC-P-ADV-ADV 6 \n",
"ADV-O-V-ADV-ADV 6 \n",
"ADV-O-V-S 6 \n",
"ADV-S-V-IO-O 6 \n",
"S-ADV-ADV-O-V 6 \n",
"ADV-ADV-V-O-S 6 \n",
"P-S-VC-ADV 6 \n",
"O-V-ADV-S 6 \n",
"ADV-IO 6 \n",
"O-IO-V-ADV 6 \n",
"IO-V-S-ADV 6 \n",
"S-ADV-V-IO-ADV 5 \n",
"S-ADV-P-VC-ADV 5 \n",
"ADV-S-V-ADV-ADV-ADV 5 \n",
"IO-V-ADV 5 \n",
"ADV-ADV-V-ADV-O 5 \n",
"O-ADV-V-ADV-ADV 5 \n",
"VC-P-ADV-S 5 \n",
"S-ADV-ADV-V-IO 5 \n",
"ADV-S-P-VC-ADV 5 \n",
"ADV-O-ADV 5 \n",
"S-V-IO-O-ADV 5 \n",
"VC-S-ADV-P 5 \n",
"ADV-P-S-ADV 5 \n",
"ADV-S-ADV-ADV-V 5 \n",
"V-ADV-O-ADV-ADV 5 \n",
"S-ADV-VC-P-ADV 5 \n",
"S-ADV-O-V-ADV 5 \n",
"ADV-V-O-ADV-ADV-ADV 5 \n",
"ADV-O-IO-V 5 \n",
"ADV-ADV-V-S-O 5 \n",
"S-IO-O-V 5 \n",
"O-V-S-IO 5 \n",
"ADV-O-S-V 5 \n",
"V-S-O-ADV-ADV 5 \n",
"ADV-ADV-O-V-ADV 5 \n",
"ADV-S-ADV-V-ADV-O 4 \n",
"ADV-ADV-VC-P-ADV 4 \n",
"V-ADV-ADV-O 4 \n",
"O-V-S-O2 4 \n",
"S-ADV-IO-V 4 \n",
"ADV-S-V-ADV-O 4 \n",
"ADV-S-V-IO-ADV 4 \n",
"ADV-V-ADV-ADV-S 4 \n",
"ADV-ADV-S-ADV-V 4 \n",
"Conj2P 4 \n",
"S-ADV-V-O-ADV-ADV 4 \n",
"VC-ADV-S-P 4 \n",
"O-S-IO-V 4 \n",
"ADV-IO-O-V 4 \n",
"O-IO-ADV-V 4 \n",
"ADV-V-O-S-ADV-ADV 4 \n",
"V-ADV-S-O 4 \n",
"S-ADV-O-ADV-V 4 \n",
"S-ADV-V-O-IO 4 \n",
"ADV-ADV-S 4 \n",
"ADV-O-V-S-ADV 4 \n",
"O2-V-O-ADV 4 \n",
"S-O-ADV-V-ADV 4 \n",
"IO-S-V-O 4 \n",
"O-S-V-ADV-ADV 4 \n",
"VC-ADV-ADV-P 4 \n",
"V-O-S-ADV-ADV-ADV 4 \n",
"P-ADV-ADV 4 \n",
"ADV-VC-ADV-P 3 \n",
"V-IO-ADV-S 3 \n",
"S-ADV-VC-ADV-P 3 \n",
"ADV-S-ADV-O-V 3 \n",
"ADV-V-IO-S-ADV 3 \n",
"V-IO-O-ADV-ADV 3 \n",
"ADV-ADV-IO-V 3 \n",
"ADV-IO-S 3 \n",
"ADV-S-V-O-ADV-ADV 3 \n",
"S-ADV-ADV-ADV-V-ADV 3 \n",
"IO-V-O-ADV 3 \n",
"O2-O-V-ADV 3 \n",
"V-ADV-ADV-S-ADV 3 \n",
"V-ADV-S-ADV-ADV 3 \n",
"V-S-ADV-ADV-IO 3 \n",
"ADV-S-IO-V 3 \n",
"S-ADV-ADV-V-ADV-O 3 \n",
"IO-ADV 3 \n",
"ADV-V-S-IO 3 \n",
"V-O-ADV-O2 3 \n",
"ADV-V-O-ADV-S 3 \n",
"S-ADV-O-V-IO 3 \n",
"ADV-IO-S-V 3 \n",
"ADV-ADV-ADV-O-V 3 \n",
"O-IO-V-S 3 \n",
"ADV-ADV-S-ADV-V-O 3 \n",
"S-V-ADV-O-ADV 3 \n",
"P-S-ADV-VC 3 \n",
"ADV-V-ADV-IO 3 \n",
"ADV-V-S-ADV-ADV-ADV 3 \n",
"S-ADV-V-ADV-ADV-ADV 3 \n",
"V-IO-O-S 3 \n",
"ADV-VC-P-S-ADV 3 \n",
"O-ADV-V-O2 3 \n",
"ADV-O-ADV-V-ADV 3 \n",
"ADV-P-ADV-VC 3 \n",
"S-O-IO-V 3 \n",
"S-ADV-V-O-O2 3 \n",
"O-V-IO-S-ADV 3 \n",
"O-S-ADV-V-ADV 3 \n",
"O-S-V-IO-ADV 3 \n",
"ADV-ADV-VC-P-S 3 \n",
"ADV-S-ADV-ADV-V-O-ADV 3 \n",
"IO-S-O-V 3 \n",
"V-IO-S-O-ADV 3 \n",
"IO-ADV-V-S 3 \n",
"ADV-IO-V-ADV 3 \n",
"O-ADV-ADV-V-ADV 3 \n",
"ADV-S-ADV-V-ADV-ADV-ADV 3 \n",
"IO-S-V-ADV 3 \n",
"ADV-S-IO 3 \n",
"VC-S-P-ADV-ADV 3 \n",
"P-ADV-S-ADV 3 \n",
"V-O-ADV-ADV-ADV-ADV 3 \n",
"ADV-P-VC-ADV-S 2 \n",
"O-ADV-S-ADV-V 2 \n",
"ADV-S-V-O-IO 2 \n",
"S-V-O-O2-ADV-ADV 2 \n",
"ADV-P-VC-S-ADV 2 \n",
"ADV-V-S-IO-ADV 2 \n",
"S-O2-V-O 2 \n",
"ADV-ADV-V-S-ADV-ADV 2 \n",
"ADV-ADV-ADV-S-V-ADV 2 \n",
"ADV-ADV-S-O-V 2 \n",
"S-ADV-IO-V-O 2 \n",
"ADV-S-ADV-V-IO 2 \n",
"V-O-IO-O2 2 \n",
"V-IO-O-O2 2 \n",
"S-ADV-ADV-ADV-V-O 2 \n",
"ADV-ADV-S-V-O-ADV 2 \n",
"ADV-V-IO-ADV-O 2 \n",
"ADV-S-ADV-ADV-V-O 2 \n",
"S-V-O-O2-ADV 2 \n",
"ADV-O2-V-O 2 \n",
"S-O-ADV 2 \n",
"V-O-ADV-S 2 \n",
"ADV-ADV-V-IO-S 2 \n",
"ADV-ADV-P-VC-S 2 \n",
"VC-P-S-ADV-ADV 2 \n",
"ADV-IO-V-S-ADV 2 \n",
"V-O-IO-ADV-ADV 2 \n",
"V-IO-ADV-S-ADV 2 \n",
"ADV-ADV-S-VC-P 2 \n",
"IO-V-ADV-ADV 2 \n",
"IO-V-S-O 2 \n",
"ADV-ADV-ADV-V-ADV-ADV 2 \n",
"O-IO-ADV-ADV-V 2 \n",
"V-S-ADV-O-ADV 2 \n",
"ADV-ADV-V-IO-O 2 \n",
"S-V-O-ADV-O2 2 \n",
"V-S-O-IO 2 \n",
"S-O-O2-V 2 \n",
"ADV-ADV-V-ADV-S 2 \n",
"V-O-ADV-IO 2 \n",
"V-IO-S-ADV-ADV 2 \n",
"V-O-S-ADV-ADV 2 \n",
"ADV-ADV-VC-S-P 2 \n",
"ADV-ADV-ADV-V-S 2 \n",
"O-ADV-V-S-ADV 2 \n",
"ADV-ADV-S-P-VC 2 \n",
"S-O-ADV-V-IO 2 \n",
"O-V-ADV-S-ADV 2 \n",
"O-IO-V-S-ADV 2 \n",
"ADV-V-ADV-IO-O 2 \n",
"S-V-IO-ADV-ADV 2 \n",
"ADV-S-P-ADV-VC 2 \n",
"S-VC-ADV-ADV-P-ADV 2 \n",
"ADV-S-ADV-ADV-V-ADV 2 \n",
"ADV-ADV-V-O-IO 2 \n",
"IO-ADV-V-ADV 2 \n",
"V-ADV-ADV-ADV-S 2 \n",
"ADV-S-ADV-V-O-ADV 2 \n",
"S-ADV-ADV-ADV-V-ADV-ADV 2 \n",
"ADV-V-O-O2-ADV 2 \n",
"S-IO-V-ADV-O 2 \n",
"ADV-S-ADV-ADV-V-ADV-ADV 2 \n",
"S-VC-P-ADV-ADV-ADV 2 \n",
"ADV-V-S-ADV-O 2 \n",
"O2-S-V-O 2 \n",
"ADV-ADV-V-O-ADV-ADV 2 \n",
"S-IO-V-O-ADV 2 \n",
"ADV-V-IO-ADV-ADV 2 \n",
"VC-ADV-P-S-ADV 2 \n",
"S-IO-ADV-V 2 \n",
"ADV-P-ADV-VC-S 2 \n",
"VC-P-S-ADV 2 \n",
"S-V-O-ADV-ADV-ADV 2 \n",
"S-V-ADV-ADV-ADV-ADV 2 \n",
"S-P-ADV-VC-ADV 2 \n",
"P-ADV-ADV-ADV 2 \n",
"V-O-ADV-ADV-ADV-ADV-ADV 2 \n",
"O-V-ADV-ADV-ADV-ADV 2 \n",
"V-ADV-ADV-ADV-ADV 2 \n",
"IO-S-ADV-ADV 2 \n",
"ADV-S-ADV-O-V-ADV 2 \n",
"VC-ADV-P-ADV 2 \n",
"P-VC-S-ADV-ADV 2 \n",
"S-ADV-ADV-P 2 \n",
"S-ADV-ADV-P-VC 2 \n",
"ADV-P-ADV-S 2 \n",
"V-ADV-IO-ADV 2 \n",
"S-V-ADV-IO-ADV 1 \n",
"S-V-ADV-IO-ADV-ADV 1 \n",
"ADV-V-S-ADV-ADV-ADV-ADV 1 \n",
"ADV-ADV-O-V-S-IO 1 \n",
"ADV-O2-O-V 1 \n",
"ADV-ADV-ADV-V-ADV-O 1 \n",
"O-V-S-ADV-IO 1 \n",
"V-IO-ADV-ADV-O 1 \n",
"ADV-S-V-ADV-ADV-ADV-ADV 1 \n",
"ADV-V-ADV-ADV-S-ADV 1 \n",
"ADV-IO-ADV-S-ADV 1 \n",
"ADV-ADV-VC-P-ADV-S 1 \n",
"ADV-S-V-O-IO-ADV 1 \n",
"ADV-V-ADV-S-ADV-ADV 1 \n",
"ADV-S-ADV-V-ADV-IO-ADV 1 \n",
"ADV-S-ADV-V-O-O2-ADV 1 \n",
"O2-ADV-V-ADV 1 \n",
"Conj6P 1 \n",
"S-O2-O-V 1 \n",
"ADV-S-ADV-ADV-V-O-IO-ADV 1 \n",
"S-ADV-ADV-V-O-ADV-ADV 1 \n",
"ADV-ADV-ADV-S-V-O 1 \n",
"ADV-O-V-IO-ADV 1 \n",
"ADV-S-ADV-ADV-O-V-ADV 1 \n",
"ADV-O-O2-V 1 \n",
"V-O-S-ADV-O2 1 \n",
"VC-ADV-P-S 1 \n",
"ADV-ADV-O 1 \n",
"ADV-ADV-S-ADV-V-ADV 1 \n",
"ADV-V-ADV-ADV-O-ADV 1 \n",
"ADV-ADV-ADV-S-O-V 1 \n",
"P-ADV-S-ADV-VC 1 \n",
"S-ADV-IO-O 1 \n",
"S-ADV-ADV-ADV-ADV-O-V 1 \n",
"O-S 1 \n",
"V-S-O-IO-ADV 1 \n",
"ADV-ADV-S-ADV-ADV-V 1 \n",
"ADV-ADV-ADV-V-S-ADV-ADV 1 \n",
"ADV-ADV-ADV-V-O-ADV-ADV-ADV 1 \n",
"V-O-S-IO-ADV 1 \n",
"P-ADV-ADV-S-VC 1 \n",
"P-ADV-S-VC 1 \n",
"IO-V-ADV-O 1 \n",
"O-S-ADV-V-IO 1 \n",
"ADV-ADV-V-IO-ADV-ADV 1 \n",
"ADV-S-O-ADV 1 \n",
"IO-ADV-V-O 1 \n",
"S-P-ADV-ADV-VC 1 \n",
"ADV-ADV-ADV-V-O-ADV 1 \n",
"S-ADV-O 1 \n",
"ADV-P-S-VC 1 \n",
"O-ADV-ADV-ADV 1 \n",
"S-V-IO-ADV-O 1 \n",
"S-ADV-ADV-V-O-IO 1 \n",
"V-ADV-S-ADV-O 1 \n",
"S-ADV-V-O-IO-ADV 1 \n",
"ADV-S-O-V-O2 1 \n",
"O-O2-V-ADV 1 \n",
"ADV-S-IO-V-O 1 \n",
"O-ADV-ADV-V-IO 1 \n",
"ADV-V-S-ADV-O-ADV 1 \n",
"V-ADV-O-S 1 \n",
"ADV-S-ADV-ADV-ADV-V-O 1 \n",
"ADV-S-O-ADV-V 1 \n",
"ADV-O-ADV-V-O2 1 \n",
"S-V-O-IO-ADV 1 \n",
"V-O-S-O2-ADV 1 \n",
"ADV-S-ADV-VC-P 1 \n",
"ADV-V-O-ADV-S-IO-ADV 1 \n",
"V-S-IO-ADV 1 \n",
"IO-ADV-V-O-O2-ADV-ADV-ADV 1 \n",
"ADV-VC-S-ADV-P 1 \n",
"ADV-V-IO-S-O 1 \n",
"O2-V-S-O 1 \n",
"IO-ADV-S-ADV-V-O-ADV 1 \n",
"V-ADV-ADV-ADV-S-ADV-ADV 1 \n",
"O-S-O2-V-ADV-ADV 1 \n",
"O-V-S-ADV-ADV-ADV 1 \n",
"O-S-O2-V-ADV 1 \n",
"O-ADV-V-ADV-S-ADV-ADV-ADV 1 \n",
"S-ADV-V-IO-O-ADV 1 \n",
"P-VC-ADV-S-ADV 1 \n",
"S-V-ADV-ADV-ADV-O 1 \n",
"ADV-ADV-V-S-ADV-ADV-ADV 1 \n",
"S-ADV-ADV-ADV 1 \n",
"ADV-S-V-IO-ADV-ADV 1 \n",
"V-S-IO-O 1 \n",
"ADV-ADV-IO 1 \n",
"VC-ADV-ADV-S-P 1 \n",
"ADV-V-O-IO-ADV 1 \n",
"S-ADV-ADV-V-IO-O 1 \n",
"V-S-ADV-O-IO 1 \n",
"VC-S 1 \n",
"ADV-O2-O-V-ADV 1 \n",
"S-ADV-ADV-O-V-ADV 1 \n",
"S-O-V-IO-ADV 1 \n",
"ADV-ADV-O-V-S-ADV 1 \n",
"V-ADV-O-ADV-ADV-ADV 1 \n",
"ADV-S-ADV-V-IO-O 1 \n",
"ADV-ADV-P-ADV-VC-ADV 1 \n",
"ADV-O-S-V-O2 1 \n",
"O-ADV-V-ADV-S 1 \n",
"V-O2-O-ADV 1 \n",
"S-ADV-V-ADV-IO 1 \n",
"S-VC-ADV 1 \n",
"ADV-P-VC-ADV-S-ADV 1 \n",
"O2-ADV-V-O 1 \n",
"ADV-S-V-O-O2 1 \n",
"ADV-VC-P-ADV-S 1 \n",
"S-ADV-ADV-ADV-O-V 1 \n",
"ADV-V-S-O-ADV-ADV 1 \n",
"O-V-S-O2-ADV 1 \n",
"IO-S-V-O-ADV 1 \n",
"ADV-ADV-ADV-ADV 1 \n",
"V-O-O2-IO 1 \n",
"O-O2-IO 1 \n",
"IO-V-O-O2-ADV 1 \n",
"V-O-O2-IO-ADV 1 \n",
"ADV-ADV-V-O-O2-IO-ADV 1 \n",
"O-ADV-V-O2-ADV 1 \n",
"S-ADV-ADV-ADV-O-IO-V 1 \n",
"ADV-IO-ADV-V 1 \n",
"P-S-ADV-ADV 1 \n",
"ADV-S-P-ADV 1 \n",
"V-ADV-IO-O 1 \n",
"IO-O-ADV-V 1 \n",
"S-IO-O-ADV-V 1 \n",
"O-ADV-V-S-ADV-ADV-ADV-ADV-ADV 1 \n",
"ADV-ADV-ADV-ADV-V 1 \n",
"ADV-IO-S-ADV 1 \n",
"S-ADV-V-O-ADV-O2-ADV 1 \n",
"ADV-ADV-V-S-ADV-O 1 \n",
"S-VC-ADV-ADV-P 1 \n",
"S-ADV-ADV-V-ADV-O-ADV 1 \n",
"ADV-ADV-P-ADV-VC-S 1 \n",
"P-ADV-ADV-S 1 \n",
"S-O-IO-V-ADV 1 \n",
"ADV-ADV-S-P 1 \n",
"ADV-ADV-ADV-S-V 1 \n",
"V-O-ADV-ADV-S-ADV 1 \n",
"ADV-S-IO-ADV-ADV 1 \n",
"S-O2-O-V-ADV 1 \n",
"O-O2-ADV 1 \n",
"S-V-ADV-O-IO-ADV 1 \n",
"O-ADV-O2-V 1 \n",
"ADV-ADV-O-V-S 1 \n",
"O-V-IO-ADV-ADV 1 \n",
"ADV-ADV-P-ADV 1 \n",
"S-V-ADV-O-IO 1 \n",
"O-V-IO-S-O2-ADV 1 \n",
"ADV-O2-O-IO-V 1 \n",
"ADV-S-ADV-P 1 \n",
"ADV-ADV-ADV-V-ADV-ADV-ADV 1 \n",
"ADV-ADV-S-O-V-IO 1 \n",
"IO-ADV-S-V-O 1 \n",
"ADV-VC-P-ADV-ADV 1 \n",
"ADV-O-IO-S-V 1 \n",
"ADV-S-ADV-ADV-V-ADV-ADV-ADV 1 \n",
"O-V-O2-IO 1 \n",
"ADV-ADV-S-VC-P-ADV 1 \n",
"V-O-ADV-IO-ADV-ADV 1 \n",
"ADV-V-IO-ADV-S 1 \n",
"V-ADV-IO-ADV-S-ADV 1 \n",
"ADV-V-ADV-ADV-ADV-ADV 1 \n",
"S-ADV-O-V-ADV-ADV 1 \n",
"V-ADV-ADV-ADV-O 1 \n",
"ADV-S-IO-ADV 1 \n",
"V-S-IO-O-O2 1 \n",
"IO-V-S-ADV-ADV 1 \n",
"VC-P-ADV-S-ADV 1 \n",
"V-S-ADV-O-ADV-ADV 1 \n",
"ADV-ADV-ADV-ADV-V-S-ADV-ADV 1 \n",
"S-ADV-ADV-O2-V-O 1 \n",
"O-V-ADV-O2 1 \n",
"V-O2 1 \n",
"S-ADV-VC-P-ADV-ADV 1 \n",
"ADV-ADV-V-ADV-ADV-ADV 1 \n",
"S-ADV-P-ADV-ADV 1 \n",
"S-VC-P-ADV-ADV 1 \n",
"O-IO-V-ADV-ADV 1 \n",
"O-S-V-ADV-ADV-ADV 1 \n",
"V-O-O2-ADV-ADV-ADV 1 \n",
"ADV-ADV-S-O-V-ADV 1 \n",
"S-O-ADV-IO-V 1 \n",
"IO-O-ADV-ADV-ADV 1 \n",
"ADV-V-IO-S-O-ADV 1 \n",
"O-O2-V-IO-ADV 1 \n",
"S-V-IO-O-ADV-ADV 1 \n",
"O2-O-V-ADV-ADV 1 \n",
"O-ADV-IO-V 1 \n",
"V-O-S-IO 1 \n",
"ADV-ADV-V-IO-S-O-ADV 1 \n",
"ADV-IO-V-O-ADV 1 \n",
"S-ADV-ADV-IO 1 \n",
"S-VC-ADV-P-ADV 1 \n",
"ADV-O-V-ADV-O2 1 \n",
"ADV-IO-ADV 1 \n",
"ADV-S-ADV-V-IO-ADV 1 \n",
"ADV-ADV-V-IO-ADV 1 \n",
"ADV-ADV-O-ADV-V 1 \n",
"ADV-O-V-ADV-S 1 \n",
"S-P-ADV-ADV-ADV 1 \n",
"ADV-ADV-VC-P-S-ADV 1 \n",
"ADV-O-ADV-ADV-V 1 \n",
"ADV-VC-ADV-P-ADV 1 \n",
"ADV-O-O2-V-ADV 1 \n",
"S-ADV-ADV-VC-P-ADV 1 \n",
"IO-O-ADV-V-S 1 \n",
"IO-O-S-V-ADV 1 \n",
"ADV-ADV-O-S-V 1 \n",
"S-ADV-V-ADV-O-ADV-ADV 1 \n",
"S-O-V-O2-ADV 1 \n",
"ADV-ADV-ADV-S-ADV 1 \n",
"S-ADV-O-V-O2-IO 1 \n",
"ADV-ADV-ADV-ADV-ADV-V 1 \n",
"ADV-S-ADV-ADV-ADV-V-IO-ADV 1 \n",
"S-ADV-ADV-ADV-ADV-V-O 1 \n",
"ADV-V-ADV-O-S-ADV 1 \n",
"ADV-S-O-V-IO 1 \n",
"ADV-ADV-O-ADV-V-ADV 1 \n",
"ADV-S-ADV-V-O-ADV-ADV 1 \n",
"O2-V-ADV-O 1 \n",
"S-ADV-ADV-ADV-V-O-ADV 1 \n",
"S-IO-O 1 \n",
"ADV-P-ADV-ADV 1 \n",
"ADV-S-ADV-P-VC-ADV 1 \n",
"ADV-ADV-S-ADV-P-VC 1 \n",
"ADV-ADV-O-V-ADV-ADV 1 \n",
"IO-V-O-ADV-ADV 1 \n",
"ADV-O-ADV-V-S-ADV 1 \n",
"IO-ADV-ADV-V-ADV 1 \n",
"IO-S-O-V-ADV 1 \n",
"S-ADV-O2-V-ADV 1 \n",
"O2-V-ADV-O-ADV 1 \n",
"O2-V 1 \n",
"ADV-S-ADV-ADV-O-ADV-V 1 \n",
"ADV-S-ADV-ADV-V-ADV-ADV-O 1 \n",
"VC-ADV-S-P-ADV 1 \n",
"P-ADV-VC-ADV 1 \n",
"O-ADV-ADV-ADV-V 1 \n",
"IO-S-ADV-V 1 \n",
"S-ADV-O-ADV-V-ADV 1 \n",
"IO-ADV-S-ADV 1 \n",
"V-ADV-O2-ADV 1 \n",
"V-S-ADV-ADV-O 1 \n",
"S-IO-V-ADV-ADV 1 \n"
]
}
],
"source": [
"# Count how often each 'Rule' value occurs on clause nodes (Cat == \"CL\") in all books\n",
"# of booklist, and print the counts as a table sorted by descending frequency.\n",
"# Relies on os, ET and Counter from the imports cell, and on InputDir and booklist.\n",
"rule_frequencies = Counter()\n",
"print('Reading the inputfiles ',end='')\n",
"\n",
"for bo in booklist:\n",
"    InputFile = os.path.join(InputDir, f'{bo}.xml')\n",
"    print('.',end='')\n",
"\n",
"    # Load the XML file\n",
"    tree = ET.parse(InputFile)\n",
"    root = tree.getroot()\n",
"\n",
"    # Visit every 'Node' found at any depth below every 'Tree' element\n",
"    for tree_element in root.findall('.//Tree'):\n",
"        for node in tree_element.findall('.//Node'):\n",
"            if node.get('Cat') != \"CL\":\n",
"                continue\n",
"\n",
"            node_rule = node.get('Rule')\n",
"            # A clause node without a 'Rule' attribute has nothing to count; the\n",
"            # substring tests below would raise a TypeError on None\n",
"            if node_rule is None:\n",
"                continue\n",
"\n",
"            # Skip rules whose name contains 'CL' or 'Cl' (presumably rules that\n",
"            # combine clauses rather than order constituents -- TODO confirm)\n",
"            if 'CL' in node_rule or 'Cl' in node_rule:\n",
"                continue\n",
"\n",
"            rule_frequencies[node_rule] += 1\n",
"\n",
"# Print the table of frequencies\n",
"print(\"\\n\\nFrequency Table:\")\n",
"print(\"{:<20} {:<10}\".format(\"Node Rule\", \"Frequency\"))\n",
"print(\"-\" * 30)\n",
"\n",
"# Highest frequency first; rules with equal counts keep first-encountered order\n",
"sorted_frequencies = rule_frequencies.most_common()\n",
"\n",
"# Print the sorted table\n",
"for rule, frequency in sorted_frequencies:\n",
"    print(\"{:<20} {:<10}\".format(rule, frequency))\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "9799b56d-1151-4332-bae2-67ac9f714d9f",
"metadata": {},
"source": [
"# 3 - References\n",
"##### [Back to TOC](#TOC) "
]
},
{
"cell_type": "markdown",
"id": "22877556-5c1f-4576-9e03-5c452cc1cacb",
"metadata": {},
"source": [
"#### Footnotes:\n",
"\n",
"1 Porter, Stanley E. \"Greek Word Order, Still an Unexplored Area in New Testament Studies?\" in Stanley E. Porter, *Linguistic Analysis of the Greek New Testament, Studies in Tools, Methods, and Practices* (Grand Rapids: Baker Academic, 2015), 347-363."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "79d0622e-46c9-44cb-9216-e85297aede81",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}