# Accent patterns

Request by Robert Voogdgeert.

Make a CSV of half verses in a representation that only shows accents and word boundaries.

In [1]:
import os
import re

from tf.app import use


In [2]:
A = use("ETCBC/bhsa:clone", hoist=globals(), silent="deep")

# Chunks

You can configure a chunk to be `half_verse`, `clause`, or `clause_atom`.

If the chunk is `half_verse`, we use the feature `label` to identify it within the verse.

If the chunk is `clause`, we use the sentence number and the clause number to identify it.

If the chunk is `clause_atom`, we use its number to identify it.

In `chunkTypes` we store a mapping of all chunk types we support to functions that provide a label for such chunks.

In [3]:
def _clauseLabel(n):
    """Label a clause node as `<sentence number>.<clause number>`."""
    # The first sentence node that L.u reports for the clause supplies the sentence number.
    sentence = L.u(n, otype="sentence")[0]
    return f"{F.number.v(sentence)}.{F.number.v(n)}"


# Supported chunk types, each mapped to the function that labels a chunk node of that type.
chunkTypes = {
    "half_verse": F.label.v,
    "clause": _clauseLabel,
    "clause_atom": F.number.v,
}

Here is a function that shows chunks.

In [4]:
def showChunks(chunks):
    """Print every chunk as a heading line followed by its transcription.

    The heading is built from the section of the chunk plus the label that
    `chunkTypes` provides for the node type of the chunk; a node type that is
    not in `chunkTypes` gets the label "?".
    The transcription is printed on the next line, preceded by a tab.

    Parameters
    ----------
    chunks: iterable of int
        The nodes to show.
    """
    for node in chunks:
        labelOf = chunkTypes.get(F.otype.v(node))
        label = labelOf(node) if labelOf is not None else "?"
        section = T.sectionFromNode(node)
        title = "{} {}:{} {}".format(*section, label)
        transcription = T.text(node, fmt="text-trans-full")
        print(f"{title}\n\t{transcription}")

Let's inspect a few half verses (the first and second ones and one which contains
a word with an in-word space):

In [5]:
# Use half verses as the chunk type in this cell.
chunkType = "half_verse"

# h1 and h2: the first two half verse nodes; they are reused by the accent demo further down.
(h1, h2) = F.otype.s(chunkType)[0:2]
# h3: the first half verse found in 1 Chronicles 2:54.
v = T.nodeFromSection(("1_Chronicles", 2, 54))
h3 = L.d(v, otype=chunkType)[0]

showChunks((h1, h2, h3))

Genesis 1:1 A
	B.:-R;>CI73JT B.@R@74> >:ELOHI92JM 
Genesis 1:1 B
	>;71T HA-C.@MA73JIM W:->;71T H@->@75REY00 
1_Chronicles 2:54 A
	B.:N;74J FAL:M@81> B.;71JT_LE33XEM03 W.-N:VO74WP@TI80J @92B 


Let's inspect a few clauses (the first ten).

In [6]:
# Use clauses as the chunk type in this cell.
chunkType = "clause"

# Keep the first ten clause nodes; `chunks` is reused by the accent demo further down.
chunks = F.otype.s(chunkType)[0:10]

showChunks(chunks)

Genesis 1:1 1.1
	B.:-R;>CI73JT B.@R@74> >:ELOHI92JM >;71T HA-C.@MA73JIM W:->;71T H@->@75REY00 
Genesis 1:2 2.1
	W:-H@->@81REY H@J:T@71H TO33HW.03 W@-BO80HW. 
Genesis 1:2 3.1
	W:-XO73CEK: :ELOHI80JM M:RAXE73PET MER >:ELOHI73JM 
Genesis 1:3 6.1
	J:HI74J >O92WR 
Genesis 1:3 7.1
	WA45-J:HIJ&>O75WR00 
Genesis 1:4 8.1
	WA-J.A94R:> >:ELOHI91JM >ET&H@->O73WR 
Genesis 1:4 8.2
	K.IJ&VO92WB 
Genesis 1:4 9.1
	WA-J.AB:D.;74L >:ELOHI80JM B.;71JN H@->O73WR W.-B;71JN HA-XO75CEK:00 


# Pattern from a chunk

We define a function to get the accent pattern from a chunk.

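The function first turns in-word spaces (`_`) into real spaces and strips all material that is neither a digit nor a space.
It then splits the result on spaces into words, cuts the digits of each word into two-digit accent codes,
and finally joins the codes of a word with `_` and the words with a space.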

We exclude some marks, because they are not proper cantillation accents.

In [7]:
# Two-digit codes that getAccents leaves out of every word when it builds the pattern.
# NOTE(review): the `meteg` remark presumably covers 35, 45, 75 and 95, and the
# dots remark 52 and 53 - confirm against the transcription table.
excludedAccents = {
 "35",
 "45",
 "75",
 "95", # meteg
 "52",
 "53", # upper and lower dots
}

In [8]:
# Matches every character that is neither a digit nor a space.
stripPat = re.compile(r"[^0-9 ]")
# Matches one two-digit code.
accentPat = re.compile(r"[0-9]{2}")


def getAccents(chunk):
    """Give the accent pattern of a chunk as a string.

    The transcription of the chunk is reduced to its digits and spaces,
    after `_` has been turned into a space.
    Each space-separated run of digits is cut into two-digit codes,
    the codes in `excludedAccents` are dropped,
    the remaining codes of a word are joined with `_`
    and the words are joined with a single space.

    A word without any digit disappears from the result,
    whereas a word with only excluded codes contributes an empty item.

    Parameters
    ----------
    chunk: int
        The node whose text is analysed.

    Returns
    -------
    str
    """
    transcription = T.text(chunk, fmt="text-trans-full")
    digitWords = stripPat.sub("", transcription.replace("_", " ")).split()
    return " ".join(
        "_".join(
            code for code in accentPat.findall(digits) if code not in excludedAccents
        )
        for digits in digitWords
    )

In [9]:
# Print the accent pattern of the three half verses and of the clauses inspected above.
for c in (h1, h2, h3, *chunks):
 print(getAccents(c))

73 74 92
71 73 71 00
74 81 71 33_03 74_80 73 74 92
73 74 92 71 73 71 00
81 71 33_03 80
73 74 92
74 80 73 71 00
71 73
74 92
00
94 91 73
92
74 80 71 73 71 00


# Process the selection

We define a function to process a given selection with a given chunk type.

The file is saved to the `destination`, by default your Downloads folder.

In [10]:
def process(selection, chunkType, destination="~/Downloads"):
    """Collect the accent patterns of all chunks of one type and write them to CSV.

    Every row has the fields book, chapter, verse, chunk label and accent pattern.
    The rows are written, comma-separated and without a header line, to the file
    `accents-{chunkType}.csv` inside `destination`.

    Parameters
    ----------
    selection: collection of str | None
        The book names to include; `None` means all books.
    chunkType: str
        One of the keys of `chunkTypes`.
    destination: str
        The directory that receives the file; a leading `~` is expanded.
        The directory is created when it does not exist yet.

    Returns
    -------
    list of tuple | None
        The rows that were written,
        or `None` (after an error message) when `chunkType` is not supported.
    """
    A.indent(reset=True)
    A.info(f"Gather all {chunkType}s ...")
    rows = []

    headFunc = chunkTypes.get(chunkType)
    if headFunc is None:
        A.error(f"Chunk type {chunkType} not supported")
        return None

    for v in F.otype.s("verse"):
        (book, chapter, verse) = T.sectionFromNode(v)
        if selection is not None and book not in selection:
            continue
        # NOTE(review): chunks are looked up per verse; a chunk that crosses a
        # verse boundary is presumably not delivered by L.d and would then be
        # missing from the output - confirm against the Text-Fabric docs.
        for chunk in L.d(v, otype=chunkType):
            rows.append((book, chapter, verse, headFunc(chunk), getAccents(chunk)))
    A.info(f"{len(rows)} {chunkType}s done")

    # Keep the unexpanded path for the log message, as before.
    csvRaw = f"{destination}/accents-{chunkType}.csv"
    csvPath = os.path.expanduser(csvRaw)

    # Writing used to fail when the destination directory was absent.
    os.makedirs(os.path.dirname(csvPath), exist_ok=True)

    # Explicit encoding: do not depend on the platform default.
    with open(csvPath, "w", encoding="utf8") as fh:
        fh.writelines(",".join(str(f) for f in row) + "\n" for row in rows)

    A.info(f"Results written to {csvRaw}")
    return rows

# Selection

You may choose to do all books or selected books only.

In [11]:
# Tweak this cell by specifying the set of books you want done (English book names).
# books = None means: all books.
# The value is passed as the selection to process() in the cells below.

books = None
# books = {'Numbers', 'Ruth'}

# Half verses

In [12]:
# Write the half verse CSV and keep the rows for inspection in the next cell.
rows = process(books, "half_verse")

 0.00s Gather all half_verses ...
 2.84s 45180 half_verses done
 2.93s Results written to ~/Downloads/accents-half_verse.csv


In [13]:
# Show the first ten rows of the most recent process() run.
rows[0:10]

[('Genesis', 1, 1, 'A', '73 74 92'),
 ('Genesis', 1, 1, 'B', '71 73 71 00'),
 ('Genesis', 1, 2, 'A', '81 71 33_03 80 73 74 92'),
 ('Genesis', 1, 2, 'B', '74 80 73 71 00'),
 ('Genesis', 1, 3, 'A', '71 73 74 92'),
 ('Genesis', 1, 3, 'B', '00'),
 ('Genesis', 1, 4, 'A', '94 91 73 92'),
 ('Genesis', 1, 4, 'B', '74 80 71 73 71 00'),
 ('Genesis', 1, 5, 'A', '63 70_05 03 80 73 74 92'),
 ('Genesis', 1, 5, 'B', '71 73 71 00')]

# Clauses

In [14]:
# Write the clause CSV; `rows` now holds the clause rows instead of the previous result.
rows = process(books, "clause")

 0.00s Gather all clauses ...
 3.53s 88071 clauses done
 3.68s Results written to ~/Downloads/accents-clause.csv


In [15]:
# Show the first ten rows of the most recent process() run.
rows[0:10]

[('Genesis', 1, 1, '1.1', '73 74 92 71 73 71 00'),
 ('Genesis', 1, 2, '2.1', '81 71 33_03 80'),
 ('Genesis', 1, 2, '3.1', '73 74 92'),
 ('Genesis', 1, 2, '4.1', '74 80 73 71 00'),
 ('Genesis', 1, 3, '5.1', '71 73'),
 ('Genesis', 1, 3, '6.1', '74 92'),
 ('Genesis', 1, 3, '7.1', '00'),
 ('Genesis', 1, 4, '8.1', '94 91 73'),
 ('Genesis', 1, 4, '8.2', '92'),
 ('Genesis', 1, 4, '9.1', '74 80 71 73 71 00')]

# Clause atoms

In [16]:
# Write the clause atom CSV; `rows` now holds the clause atom rows instead of the previous result.
rows = process(books, "clause_atom")

 0.00s Gather all clause_atoms ...
 2.79s 90688 clause_atoms done
 2.94s Results written to ~/Downloads/accents-clause_atom.csv


In [17]:
# Show the first ten rows of the most recent process() run.
rows[0:10]

[('Genesis', 1, 1, 1, '73 74 92 71 73 71 00'),
 ('Genesis', 1, 2, 2, '81 71 33_03 80'),
 ('Genesis', 1, 2, 3, '73 74 92'),
 ('Genesis', 1, 2, 4, '74 80 73 71 00'),
 ('Genesis', 1, 3, 5, '71 73'),
 ('Genesis', 1, 3, 6, '74 92'),
 ('Genesis', 1, 3, 7, '00'),
 ('Genesis', 1, 4, 8, '94 91 73'),
 ('Genesis', 1, 4, 9, '92'),
 ('Genesis', 1, 4, 10, '74 80 71 73 71 00')]