# X-Word Latin Hexameters

Seen on Facebook tonight from C. Dozier:
"Is there a list of three-word Latin hexameters?"

- Last updated 11/1/17 1:14pm

In [1]:
# Imports

import os
import string
import re
import html 

from pprint import pprint

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.line import LineTokenizer
from cltk.stem.latin.j_v import JVReplacer

Arabic not supported. Install `pyarabic` library to tokenize Arabic.


In [2]:
# Instantiate the CLTK helpers that the rest of the notebook shares

replacer = JVReplacer()  # used during preprocessing to normalize u/v & i/j
line_tokenizer = LineTokenizer('latin')  # used to tokenize each text by line

In [3]:
# Begin with a known case: there is at least one three-word
# hexameter in Horace's *Satires*

# Pick the two Satires files out of the Latin Library and read them...
files = latinlibrary.fileids()
hor_sat_files = list(filter(lambda fileid: 'horace/serm' in fileid, files))
hor_sat_raw = list(map(latinlibrary.raw, hor_sat_files))

In [4]:
# Preprocess texts

def preprocess(text):
    """Strip Latin Library paratext from a Satires file and normalize the text.

    Steps, in order: delete known header/footer phrases, decode HTML
    entities, turn non-breaking spaces and NUL characters into plain
    spaces, lowercase, normalize u/v & i/j with the notebook-level
    ``replacer``, blank out punctuation and digits, and finally collapse
    runs of spaces and trim whitespace around line breaks.

    Args:
        text: Raw text of one Latin Library file.

    Returns:
        The cleaned, lowercased text with its line breaks preserved.
    """
    # These patterns are case-sensitive, so they have to run before lower().
    remove_list = [r'\bHorace\b',
                   r'\bThe Latin Library\b',
                   r'\bThe Classics Page',
                   r'\bSermonum Liber .+\b',
                   r'\bSERMONVM Q. HORATI FLACCI LIBER .+?\b'
                   ]

    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    text = html.unescape(text)  # Handle html entities

    # unescape() yields U+00A0 for a non-breaking-space entity, while a
    # double-escaped entity survives as literal text; map both to a space.
    # The pattern must never be able to match the empty string, otherwise
    # re.sub() inserts a space between every character. The '&' is spelled
    # as a character class so an HTML export cannot decode the entity here.
    text = re.sub(r'[&]nbsp;?|\xa0', ' ', text)
    text = text.replace('\x00', ' ')  # Another space problem?

    text = text.lower()
    text = replacer.replace(text)  # Normalize u/v & i/j

    # Punctuation (including the backslash) and digits all become spaces.
    punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»"
    translator = str.maketrans({key: " " for key in punctuation + '0123456789'})
    text = text.translate(translator)

    text = re.sub(r' +', ' ', text)  # Remove double spaces
    # Remove blank lines and trim spaces on either side of every line break.
    text = re.sub(r'\s*\n\s*', '\n', text)

    return text

In [5]:
# ...clean up each Satires text and show the first 500 characters
# of the first one...

hor_sat_edit = list(map(preprocess, hor_sat_raw))
print(hor_sat_edit[0][:500])


qui fit maecenas ut nemo quam sibi sortem 
seu ratio dederit seu fors obiecerit illa 
contentus uiuat laudet diuersa sequentis
o fortunati mercatores grauis annis 
miles ait multo iam fractus membra labore 
contra mercator nauim iactantibus austris
militia est potior quid enim concurritur horae 
momento cita mors uenit aut uictoria laeta 
agricolam laudat iuris legumque peritus 
sub galli cantum consultor ubi ostia pulsat 
ille datis uadibus qui rure extractus in urbem est 
solos felicis uiuent


In [6]:
# ...break every cleaned poem into a list of its lines...

hor_sat_lines = list(map(line_tokenizer.tokenize, hor_sat_edit))

In [7]:
# ...and keep every line that consists of exactly three words
tlh = [
    " ".join(words)
    for lines in hor_sat_lines
    for words in (line.split() for line in lines)
    if len(words) == 3
]

print(tlh)

['ambubaiarum collegia pharmacopolae']


Let's abstract this so that we can return all matches for *any* number of words per line for *any* poem in the Latin Library.

In [8]:
# A function for getting hexameters of a certain length.

def return_x_word_hexameters(text, word_count):
    """Return the lines of ``text`` that contain exactly ``word_count`` words.

    Args:
        text: Iterable of line strings.
        word_count: Required number of whitespace-separated words.

    Returns:
        A list of the matching lines in their original order, each one
        re-joined with single spaces between its words.
    """
    split_lines = (line.split() for line in text)
    return [" ".join(words) for words in split_lines if len(words) == word_count]

In [9]:
# Example: rerun the three-word search from above through the new
# function, on the first Satires file

print(return_x_word_hexameters(text=hor_sat_lines[0], word_count=3))

['ambubaiarum collegia pharmacopolae']


In [10]:
# We need a list of the hexameter poems in the Latin Library. This may
# be worth automating in the future, but for now it takes a bit of
# knowledge of Latin poetry and some hard-coded filename fragments.
# The list starts with the basics and will be extended over time. The
# hexameter lines of elegy are skipped for now, but they really should
# be included in a later update.

# The first pass is based on Table 1 of Bernstein, Gervais & Lin (2015):
# http://www.digitalhumanities.org/dhq/vol/9/3/000237/000237.html

# Each list holds the corpus file ids that contain the given fragment.
lucretius = [fileid for fileid in files if 'lucr' in fileid]
vergil = [fileid for fileid in files if 'vergil/' in fileid]
horace = (
    [fileid for fileid in files if 'horace/ars' in fileid]
    + [fileid for fileid in files if 'horace/epi' in fileid]
    + [fileid for fileid in files if 'horace/serm' in fileid]
)
ovid = [fileid for fileid in files if 'ovid.m' in fileid]
manilius = [fileid for fileid in files if 'manil' in fileid]
persius = [fileid for fileid in files if 'persius.txt' in fileid]
lucan = [fileid for fileid in files if 'lucan/' in fileid]
ilias = [fileid for fileid in files if 'ilias' in fileid]
statius = (
    [fileid for fileid in files if 'statius/ac' in fileid]
    + [fileid for fileid in files if 'statius/th' in fileid]
)
silius = [fileid for fileid in files if 'silius' in fileid]
valerius = [fileid for fileid in files if 'valer' in fileid]
juvenal = [fileid for fileid in files if 'juv' in fileid]
# juvencus -- Not in LL?
ausonius = [fileid for fileid in files if 'aus.mos' in fileid]
claudian = [fileid for fileid in files if 'claudian.pros' in fileid]
# other works of Claudian?
# corippus -- Not in LL?

# Concatenate the per-author lists, keeping the author order used above.
hexameter_files = [
    fileid
    for author_files in (
        lucretius, vergil, horace, ovid, manilius,
        persius, lucan, ilias, statius, silius,
        valerius, juvenal, ausonius, claudian,
    )
    for fileid in author_files
]

In [11]:
# Preprocess texts

def preprocess(text):
    """Strip Latin Library paratext from a hexameter file and normalize the text.

    This redefines the earlier, Satires-only ``preprocess`` with a longer
    list of header/footer phrases and an extra step for partial-line
    numbers such as "12a".

    Steps, in order: delete known header/footer phrases, decode HTML
    entities, turn non-breaking spaces and NUL characters into plain
    spaces, blank out partial-line numbers, lowercase, normalize u/v & i/j
    with the notebook-level ``replacer``, blank out punctuation and digits,
    and finally collapse runs of spaces and trim whitespace around line
    breaks.

    Args:
        text: Raw text of one Latin Library file.

    Returns:
        The cleaned, lowercased text with its line breaks preserved.
    """
    # These patterns are case-sensitive, so they have to run before lower().
    # NOTE(review): 'Metamorposes' looks like a typo for 'Metamorphoses';
    # confirm against the corpus headers before changing the pattern.
    remove_list = [r'\bHorace: .+',
                   r'\bAppendix Vergiliana\b',
                   r'\bThe Miscellany\b',
                   r'\bThe Latin Library\b',
                   r'\bThe Classics Page',
                   r'\bThe Classics Homepage',
                   r'\bSermonum Liber .+\b',
                   r'\bVergil: Aeneid .+\b',
                   r'\bManilius, Liber .+\b',
                   r'\bMetamorposes\b',
                   r'\bLucan Liber .+',
                   r'\bStatius: Thebaid .+',
                   r'\bStatius: Achilleid .+',
                   r'\bValerius Flaccus: Liber .+',
                   r'\bSERMONVM Q. HORATI FLACCI LIBER .+?\b',
                   r'PVBLIVS PAPINIVS STATIVS',
                   r'Silius, Liber .+'
                   ]

    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    text = html.unescape(text)  # Handle html entities

    # unescape() yields U+00A0 for a non-breaking-space entity, while a
    # double-escaped entity survives as literal text; map both to a space.
    # The pattern must never be able to match the empty string, otherwise
    # re.sub() inserts a space between every character. The '&' is spelled
    # as a character class so an HTML export cannot decode the entity here.
    text = re.sub(r'[&]nbsp;?|\xa0', ' ', text)
    text = text.replace('\x00', ' ')  # Another space problem?

    # Fix partial lines: drop numbers carrying an a/b suffix
    text = re.sub(r'\d+(a|b)', ' ', text)

    text = text.lower()
    text = replacer.replace(text)  # Normalize u/v & i/j

    # Punctuation (including the backslash) and digits all become spaces.
    punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»"
    translator = str.maketrans({key: " " for key in punctuation + '0123456789'})
    text = text.translate(translator)

    text = re.sub(r' +', ' ', text)  # Remove double spaces
    # Remove blank lines and trim spaces on either side of every line break.
    text = re.sub(r'\s*\n\s*', '\n', text)

    return text

In [12]:
# Read, clean and line-tokenize every hexameter file, as was done
# for the Satires above

hexameter_raw = list(map(latinlibrary.raw, hexameter_files))
hexameter_edit = list(map(preprocess, hexameter_raw))  # the LL paratexts have to be preprocessed away
hexameter_lines = list(map(line_tokenizer.tokenize, hexameter_edit))

In [13]:
# Gather the three-word lines of every hexameter file into one flat list
three_word_hexameters = [
    match
    for lines in hexameter_lines
    for match in return_x_word_hexameters(lines, 3)
]
pprint(three_word_hexameters)

['insatiabiliter defleuimus aeternumque',
 'hic cursus fuit',
 'munera laetitiamque dii',
 'audentis fortuna iuuat',
 'incipias conferre manum',
 'turnus ad haec',
 'ferro accincta uocat',
 'numina magna deum',
 'uos agitate fugam',
 'infabricata fugae studio',
 'ergo iussa parat',
 'carpathium libycumque secant',
 'externique iterum thalami',
 'maius opus moueo',
 'exitiis positura modum',
 'iliadumque labor uestes',
 'immemor est nostri',
 'rex prior haec',
 'tum sic effatur',
 'euryali et nisi',
 'egit in aduersos',
 'ambubaiarum collegia pharmacopolae',
 'sic quoque fallebat',
 'redeuntem colle lycaeo',
 'i pete diuersi',
 'lapidosas aesaris undas',
 'iam tempora titan',
 'asperum iter temptans',
 'auctores tibi dant',
 'uarronemque fuga magnum',
 'cum uincere posset',
 'mens natat et',
 'pars prior at',
 'solusque per omnis',
 'heliconidasque pallidamque pirenen',
 'fuit frugi pudicus',
 'non segnior illo',
 'quam carthago suos',
 'mox deinde uidenti',
 'numina pugnastis nobis',
 

In [14]:
# All of the above appear to be either attested 'incomplete' lines,
# corruptions, artefacts of the printed page, or digitization problems.
# (Also, I haven't stripped out Persius's *Prologue* yet.)
#
# So, the only other genuine three-word hexameter returned here is...
#
# Lucr. DRN 3.907: insatiabiliter defleuimus aeternumque

In [15]:
# All well and good. But now with our function, we can quickly
# refactor to get any number...

In [16]:
# Same search across every hexameter file, now for four-word lines
four_word_hexameters = []
for file_lines in hexameter_lines:
    four_word_hexameters.extend(return_x_word_hexameters(file_lines, 4))

pprint(four_word_hexameters)

['iphianassai turparunt sanguine foede',
 'seruitium contra paupertas diuitiaeque',
 'inflammasset equos nocturno graiiugenarum',
 'principio fundamenti natura carebit',
 'naturam clandestinam caecamque adhibere',
 'sed uanus stolidis haec',
 'amplexi quod habent peru',
 'possit ibi quicquam consistere',
 'certare ingenio contendere nobilitate',
 'exiguis interuallis conuecta resultant',
 'significant clandestinos caecosque subesse',
 'mobilibus digitis expergefacta figurant',
 'finita uariare figurarum ratione',
 'uersibus ostendam corpuscula materiai',
 'undique protelo plagarum continuato',
 'disiectare aestus diuersi materiai',
 'dissimili perfecta figura principiorum',
 'corpora discedunt conexaque conuenientis',
 'dissimili constare figura principiorum',
 'euanescere paulatim stinguique colorem',
 'dissoluuntur enim positurae principiorum',
 'consequitur grauitas membrorum praepediuntur',
 'ancipitique refutatu conuincere falsum',
 'corporibus nostris extrinsecus insinuatas',
 'e

In [17]:
# The same caveats apply to these results as to those above, but they
# raise no problem that we could not screen for with some kind of test,
# e.g. checking that each line has between 13 and 17 syllables.