{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Accent patterns\n", "\n", "Request by Robert Voogdgeert.\n", "\n", "Make a CSV of half verses in a representation that only shows accents and word boundaries." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import re\n", "\n", "from tf.app import use\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "TF-app: ~/github/annotation/app-bhsa/code" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/text-fabric-data/etcbc/bhsa/tf/c" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/text-fabric-data/etcbc/phono/tf/c" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/text-fabric-data/etcbc/parallels/tf/c" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "A = use(\"ETCBC/bhsa:clone\", hoist=globals(), silent=\"deep\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Chunks\n", "\n", "You can configure a chunk to be `half_verse` or `clause`.\n", "\n", "If the chunk is `half_verse`, we use the feature `label` to identify it within the verse.\n", "\n", "If the chunk is `clause`, we use the sentence number and the clause number to identify it.\n", "\n", "In `chunkTypes` we store a mapping of all chunk types we support to functions that provide a label for such chunks." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "chunkTypes = dict(\n", " half_verse=F.label.v,\n", " clause=lambda n: f'{F.number.v(L.u(n, otype=\"sentence\")[0])}.{F.number.v(n)}',\n", " clause_atom=F.number.v,\n", ")" ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "Here is a function that shows chunks." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def showChunks(chunks):\n", " for c in chunks:\n", " cType = F.otype.v(c)\n", " headFunc = chunkTypes.get(cType, None)\n", " head = \"?\" if headFunc is None else headFunc(c)\n", " passage = T.sectionFromNode(c)\n", " heading = \"{} {}:{} {}\".format(*passage, head)\n", " text = T.text(c, fmt=\"text-trans-full\")\n", " print(f\"{heading}\\n\\t{text}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's inspect a few half verses (the first and second ones and one which contains\n", "a word with an in-word space):" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Genesis 1:1 A\n", "\tB.:-R;>CI73JT B.@R@74> >:ELOHI92JM \n", "Genesis 1:1 B\n", "\t>;71T HA-C.@MA73JIM W:->;71T H@->@75REY00 \n", "1_Chronicles 2:54 A\n", "\tB.:N;74J FAL:M@81> B.;71JT_LE33XEM03 W.-N:VO74WP@TI80J @92B \n" ] } ], "source": [ "chunkType = \"half_verse\"\n", "\n", "(h1, h2) = F.otype.s(chunkType)[0:2]\n", "v = T.nodeFromSection((\"1_Chronicles\", 2, 54))\n", "h3 = L.d(v, otype=chunkType)[0]\n", "\n", "showChunks((h1, h2, h3))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's inspect a few clauses (the first ten)." 
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Genesis 1:1 1.1\n", "\tB.:-R;>CI73JT B.@R@74> >:ELOHI92JM >;71T HA-C.@MA73JIM W:->;71T H@->@75REY00 \n", "Genesis 1:2 2.1\n", "\tW:-H@->@81REY H@J:T@71H TO33HW.03 W@-BO80HW. \n", "Genesis 1:2 3.1\n", "\tW:-XO73CEK: :ELOHI80JM M:RAXE73PET MER >:ELOHI73JM \n", "Genesis 1:3 6.1\n", "\tJ:HI74J >O92WR \n", "Genesis 1:3 7.1\n", "\tWA45-J:HIJ&>O75WR00 \n", "Genesis 1:4 8.1\n", "\tWA-J.A94R:> >:ELOHI91JM >ET&H@->O73WR \n", "Genesis 1:4 8.2\n", "\tK.IJ&VO92WB \n", "Genesis 1:4 9.1\n", "\tWA-J.AB:D.;74L >:ELOHI80JM B.;71JN H@->O73WR W.-B;71JN HA-XO75CEK:00 \n" ] } ], "source": [ "chunkType = \"clause\"\n", "\n", "chunks = F.otype.s(chunkType)[0:10]\n", "\n", "showChunks(chunks)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Pattern from a chunk\n", "\n", "We define a function to get the accent pattern from a chunk." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The function works by stripping all non-digit-non-space material, then splitting on space, then\n", "dividing the numbers into pairs, and then joining everything together.\n", "\n", "We exclude some marks, because they are not proper cantillation accents." 
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "excludedAccents = {\n", " \"35\",\n", " \"45\",\n", " \"75\",\n", " \"95\", # meteg\n", " \"52\",\n", " \"53\", # upper and lower dots\n", "}" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "stripPat = re.compile(r\"[^0-9 ]\")\n", "accentPat = re.compile(r\"[0-9]{2}\")\n", "\n", "\n", "def getAccents(chunk):\n", " trans = T.text(chunk, fmt=\"text-trans-full\").replace(\"_\", \" \")\n", " words = stripPat.sub(\"\", trans).split()\n", " items = []\n", " for word in words:\n", " accents = [ac for ac in accentPat.findall(word) if ac not in excludedAccents]\n", " items.append(\"_\".join(accents))\n", " return \" \".join(items)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "lines_to_next_cell": 2 }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "73 74 92\n", "71 73 71 00\n", "74 81 71 33_03 74_80 73 74 92\n", "73 74 92 71 73 71 00\n", "81 71 33_03 80\n", "73 74 92\n", "74 80 73 71 00\n", "71 73\n", "74 92\n", "00\n", "94 91 73\n", "92\n", "74 80 71 73 71 00\n" ] } ], "source": [ "for c in (h1, h2, h3, *chunks):\n", " print(getAccents(c))" ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "# Process the selection\n", "\n", "We define a function to process a given selection with a given chunk type.\n", "\n", "The file is saved to the `destination`, by default your Downloads folder." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def process(selection, chunkType, destination=\"~/Downloads\"):\n", " A.indent(reset=True)\n", " A.info(f\"Gather all {chunkType}s ...\")\n", " rows = []\n", "\n", " headFunc = chunkTypes.get(chunkType, None)\n", " if not headFunc:\n", " A.error(f\"Chunk type {chunkType} not supported\")\n", " return\n", "\n", " for v in F.otype.s(\"verse\"):\n", " (book, chapter, verse) = T.sectionFromNode(v)\n", " if selection is not None and book not in selection:\n", " continue\n", " for chunk in L.d(v, otype=chunkType):\n", " head = headFunc(chunk)\n", " accents = getAccents(chunk)\n", " rows.append((book, chapter, verse, head, accents))\n", " A.info(f\"{len(rows)} {chunkType}s done\")\n", "\n", " csvRaw = f\"{destination}/accents-{chunkType}.csv\"\n", " csv = os.path.expanduser(csvRaw)\n", "\n", " with open(csv, \"w\") as fh:\n", " for row in rows:\n", " fh.write(\",\".join(str(f) for f in row) + \"\\n\")\n", "\n", " A.info(f\"Results written to {csvRaw}\")\n", " return rows" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Selection\n", "\n", "You may choose to do all books or selected books only." 
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# tweak this cell by specifying the set of books you want done (English book names)\n", "# books = None means: all books\n", "\n", "books = None\n", "# books = {'Numbers', 'Ruth'}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Half verses" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.00s Gather all half_verses ...\n", " 2.84s 45180 half_verses done\n", " 2.93s Results written to ~/Downloads/accents-half_verse.csv\n" ] } ], "source": [ "rows = process(books, \"half_verse\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Genesis', 1, 1, 'A', '73 74 92'),\n", " ('Genesis', 1, 1, 'B', '71 73 71 00'),\n", " ('Genesis', 1, 2, 'A', '81 71 33_03 80 73 74 92'),\n", " ('Genesis', 1, 2, 'B', '74 80 73 71 00'),\n", " ('Genesis', 1, 3, 'A', '71 73 74 92'),\n", " ('Genesis', 1, 3, 'B', '00'),\n", " ('Genesis', 1, 4, 'A', '94 91 73 92'),\n", " ('Genesis', 1, 4, 'B', '74 80 71 73 71 00'),\n", " ('Genesis', 1, 5, 'A', '63 70_05 03 80 73 74 92'),\n", " ('Genesis', 1, 5, 'B', '71 73 71 00')]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rows[0:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Clauses" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.00s Gather all clauses ...\n", " 3.53s 88071 clauses done\n", " 3.68s Results written to ~/Downloads/accents-clause.csv\n" ] } ], "source": [ "rows = process(books, \"clause\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Genesis', 1, 1, '1.1', '73 74 92 71 73 71 00'),\n", " ('Genesis', 1, 2, '2.1', '81 71 33_03 80'),\n", " ('Genesis', 1, 2, '3.1', 
'73 74 92'),\n", " ('Genesis', 1, 2, '4.1', '74 80 73 71 00'),\n", " ('Genesis', 1, 3, '5.1', '71 73'),\n", " ('Genesis', 1, 3, '6.1', '74 92'),\n", " ('Genesis', 1, 3, '7.1', '00'),\n", " ('Genesis', 1, 4, '8.1', '94 91 73'),\n", " ('Genesis', 1, 4, '8.2', '92'),\n", " ('Genesis', 1, 4, '9.1', '74 80 71 73 71 00')]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rows[0:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Clause atoms" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.00s Gather all clause_atoms ...\n", " 2.79s 90688 clause_atoms done\n", " 2.94s Results written to ~/Downloads/accents-clause_atom.csv\n" ] } ], "source": [ "rows = process(books, \"clause_atom\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Genesis', 1, 1, 1, '73 74 92 71 73 71 00'),\n", " ('Genesis', 1, 2, 2, '81 71 33_03 80'),\n", " ('Genesis', 1, 2, 3, '73 74 92'),\n", " ('Genesis', 1, 2, 4, '74 80 73 71 00'),\n", " ('Genesis', 1, 3, 5, '71 73'),\n", " ('Genesis', 1, 3, 6, '74 92'),\n", " ('Genesis', 1, 3, 7, '00'),\n", " ('Genesis', 1, 4, 8, '94 91 73'),\n", " ('Genesis', 1, 4, 9, '92'),\n", " ('Genesis', 1, 4, 10, '74 80 71 73 71 00')]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rows[0:10]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 4 }