In [1]:
# Imports
import html
import string
import re

import collections

import gensim
from gensim.models import Word2Vec

from nltk.tokenize import PunktSentenceTokenizer

from cltk.stem.latin.j_v import JVReplacer
from cltk.corpus.latin import latinlibrary

from matplotlib import pyplot

from pprint import pprint
import pickle

In [2]:
# Set up NLP tools
# replacer: CLTK JVReplacer; preprocess() calls replacer.replace() to normalize u/v and i/j.
replacer = JVReplacer()
# tokenizer: NLTK Punkt sentence tokenizer. In the visible cells it is referenced only by the
# commented-out corpus-building recipe that produced the pickled model.
# NOTE(review): constructed without training text, so presumably it runs on Punkt's
# default parameters rather than Latin-specific ones — confirm.
tokenizer = PunktSentenceTokenizer()

In [3]:
# Preprocess texts

# Characters that preprocess() blanks out. The backslash is spelled "\\" explicitly:
# the earlier "\]" relied on an invalid escape sequence, which Python reports as a
# DeprecationWarning/SyntaxWarning.
_PREPROCESS_PUNCTUATION = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»"

# One translation table for punctuation and digits, built once rather than on every call.
_PREPROCESS_TABLE = str.maketrans(
    {char: " " for char in _PREPROCESS_PUNCTUATION + string.digits}
)


def preprocess(text):
    """Normalize a piece of text before it is split into tokens.

    Steps, in order:
      1. decode HTML entities (``html.unescape``);
      2. lowercase;
      3. normalize u/v and i/j via the module-level ``replacer``;
      4. replace each punctuation character and each ASCII digit with a single
         space (replaced, not deleted, so neighbouring words stay separated).

    Whitespace is not collapsed; callers are expected to ``.split()`` the result.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The normalized text.
    """
    text = html.unescape(text)  # Handle html entities
    text = text.lower()
    text = replacer.replace(text)  # Normalize u/v & i/j
    return text.translate(_PREPROCESS_TABLE)

In [4]:
# Build word2vec model on Latin Library texts

## Results of following lines are pickled as ll_w2v.p
# NOTE(review): gensim >= 4.0 renamed Word2Vec's `size` argument to `vector_size`;
# adjust the last line before re-running this recipe on a newer gensim.
# ll_raw = latinlibrary.raw()
# ll_sentences = tokenizer.tokenize(ll_raw)
# ll_sentences = [preprocess(sent).split() for sent in ll_sentences]
# ll_model = gensim.models.Word2Vec(ll_sentences, min_count=2, size=300, workers=4)

# Load the cached model. The `with` block closes the file handle as soon as the
# model has been read, instead of leaving it open for the life of the kernel.
# SECURITY: unpickling can execute arbitrary code — only load ll_w2v.p from a trusted source.
with open("./data/ll_w2v.p", "rb") as model_file:
    ll_model = pickle.load(model_file)

In [5]:
# Latin version of "king - man + woman = queen" via 'most_similar':
# rex - uir + femina, keeping the three nearest neighbours.
ll_model.wv.most_similar(
    positive=['rex', 'femina'],
    negative=['uir'],
    topn=3,
)

[('regina', 0.6140977144241333),
 ('matre', 0.6033270955085754),
 ('coniuge', 0.5800632834434509)]

In [6]:
# Same king - man + woman = queen query, with 'mulier' standing in for 'femina'.
ll_model.wv.most_similar(
    positive=['rex', 'mulier'],
    negative=['uir'],
    topn=3,
)

[('regina', 0.5619041323661804),
 ('uxor', 0.5604838132858276),
 ('mater', 0.5291939377784729)]

In [7]:
# The reverse analogy: queen - woman + man = king (regina - femina + uir).
ll_model.wv.most_similar(
    positive=['regina', 'uir'],
    negative=['femina'],
    topn=3,
)

[('rex', 0.7597681283950806),
 ('comes', 0.7183645963668823),
 ('dux', 0.7138530015945435)]

In [8]:
# father - man + woman = mother (pater - uir + mulier).
ll_model.wv.most_similar(
    positive=['pater', 'mulier'],
    negative=['uir'],
    topn=3,
)

[('mater', 0.6681894659996033),
 ('uxor', 0.6311129927635193),
 ('puella', 0.6174641847610474)]

In [9]:
# Odd-one-out with 'doesnt_match': pick the non-color out of a list of colors.
color_terms = 'ruber flauus uiridis caerulus purpureus grauis'.split()
ll_model.wv.doesnt_match(color_terms)

'grauis'

In [10]:
# The same odd-one-out query on emperors: pick the 'bad' emperor out of the 'good' ones.
emperor_names = 'augustus nero nerua traianus hadrianus antoninus'.split()
ll_model.wv.doesnt_match(emperor_names)

'nero'

In [11]:
# Some similarity measures...
def print_wv_sim(word1, word2):
    """Print the similarity that ``ll_model.wv.similarity`` reports for two words.

    Reads the module-level ``ll_model``; returns nothing.
    """
    score = ll_model.wv.similarity(word1, word2)
    message = "Similarity score for {} and {}: {}".format(word1, word2, score)
    print(message)

# rex / regina: the king / queen pair used in the analogy queries
print_wv_sim('rex', 'regina')

Similarity score for rex and regina: 0.6395386929624856


In [12]:
# femina / mulier: the two words used for 'woman' in the analogy queries
print_wv_sim('femina', 'mulier')

Similarity score for femina and mulier: 0.717120040628728


In [13]:
# Word pairs taken from Axelson's 'Unpoetische Wörter'; print each pair's similarity in turn.
axelson_pairs = [
    ('uxor', 'coniunx'),
    ('puella', 'uirgo'),
    ('famulus', 'famula'),
    ('lassus', 'fessus'),
    ('gladius', 'ensis'),
    ('terra', 'tellus'),
    ('mors', 'letum'),
]
for first, second in axelson_pairs:
    print_wv_sim(first, second)


Similarity score for uxor and coniunx: 0.7239514246158751
Similarity score for puella and uirgo: 0.8496341449771854
Similarity score for famulus and famula: 0.7061932968048992
Similarity score for lassus and fessus: 0.820626450365237
Similarity score for gladius and ensis: 0.7989289913212123
Similarity score for terra and tellus: 0.5780294477506212
Similarity score for mors and letum: 0.3612532580370732


In [14]:
# Contrast case: a pair not chosen as near-synonyms (amor 'love', bellum 'war')
print_wv_sim('amor', 'bellum')

Similarity score for amor and bellum: -0.026459743634875108
