{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Imports\n", "import html\n", "import string\n", "import re\n", "\n", "import collections\n", "\n", "import gensim\n", "from gensim.models import Word2Vec\n", "\n", "from nltk.tokenize import PunktSentenceTokenizer\n", "\n", "from cltk.stem.latin.j_v import JVReplacer\n", "from cltk.corpus.latin import latinlibrary\n", "\n", "from matplotlib import pyplot\n", "\n", "from pprint import pprint\n", "import pickle" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Set up NLP tools\n", "replacer = JVReplacer()\n", "tokenizer = PunktSentenceTokenizer()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Preprocess texts\n", "def preprocess(text):\n", " \n", " text = html.unescape(text) # Handle html entities\n", " \n", " text = text.lower()\n", " text = replacer.replace(text) #Normalize u/v & i/j\n", " \n", " punctuation =\"\\\"#$%&\\'()*+,-/:;<=>@[\\]^_`{|}~.?!«»\"\n", " translator = str.maketrans({key: \" \" for key in punctuation})\n", " text = text.translate(translator)\n", " \n", " translator = str.maketrans({key: \" \" for key in '0123456789'})\n", " text = text.translate(translator)\n", " \n", " return text" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Build word2vec model on Latin Library texts\n", "\n", "## Results of following lines are pickled as ll_w2v.p\n", "# ll_raw = latinlibrary.raw()\n", "# ll_sentences = tokenizer.tokenize(ll_raw)\n", "# ll_sentences = [preprocess(sent).split() for sent in ll_sentences]\n", "# ll_model = gensim.models.Word2Vec(ll_sentences, min_count=2, size=300, workers=4)\n", "\n", "ll_model = pickle.load(open(\"./data/ll_w2v.p\", \"rb\" ))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('regina', 0.6140977144241333),\n", " ('matre', 0.6033270955085754),\n", " ('coniuge', 0.5800632834434509)]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use 'most-similar' to produce a Latin version of:\n", "# king - man + woman = queen\n", "ll_model.wv.most_similar(positive=['rex', 'femina'], \n", " negative=['uir'], topn=3)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('regina', 0.5619041323661804),\n", " ('uxor', 0.5604838132858276),\n", " ('mater', 0.5291939377784729)]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# king - man + woman = queen; mulier variation\n", "ll_model.wv.most_similar(positive=['rex', 'mulier'], \n", " negative=['uir'], topn=3)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('rex', 0.7597681283950806),\n", " ('comes', 0.7183645963668823),\n", " ('dux', 0.7138530015945435)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# queen - woman + man = king\n", "ll_model.wv.most_similar(positive=['regina', 'uir'], \n", " negative=['femina'], topn=3)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('mater', 0.6681894659996033),\n", " ('uxor', 0.6311129927635193),\n", " ('puella', 0.6174641847610474)]" ] }, "execution_count": 8, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "# father - man + woman = mother\n", "ll_model.wv.most_similar(positive=['pater', 'mulier'], \n", " negative=['uir'], topn=3)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'grauis'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use 'doesnt_match' to remove non-color from list of colors\n", "ll_model.wv.doesnt_match('ruber flauus uiridis caerulus purpureus grauis'.split())" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'nero'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# ...or 'bad' emperor from 'good'\n", "ll_model.wv.doesnt_match('augustus nero nerua traianus hadrianus antoninus'.split())" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Similarity score for rex and regina: 0.6395386929624856\n" ] } ], "source": [ "# Some similarity measures...\n", "def print_wv_sim(word1, word2):\n", " print(\"Similarity score for {} and {}: {}\".format(word1, word2, ll_model.wv.similarity(word1, word2)))\n", "\n", "print_wv_sim('rex', 'regina')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Similarity score for femina and mulier: 0.717120040628728\n" ] } ], "source": [ "# Some similarity measures...\n", "print_wv_sim('femina', 'mulier')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Similarity score for uxor and coniunx: 0.7239514246158751\n", "Similarity score for puella and uirgo: 0.8496341449771854\n", "Similarity score for famulus and famula: 0.7061932968048992\n", "Similarity score for lassus and fessus: 0.820626450365237\n", "Similarity score for gladius and ensis: 0.7989289913212123\n", "Similarity score for terra and tellus: 0.5780294477506212\n", "Similarity score for mors and letum: 0.3612532580370732\n" ] } ], "source": [ "# Some examples from Axelson's 'Unpoetische Wörter'\n", "print_wv_sim('uxor', 'coniunx')\n", "print_wv_sim('puella', 'uirgo')\n", "print_wv_sim('famulus', 'famula')\n", "print_wv_sim('lassus', 'fessus')\n", "print_wv_sim('gladius', 'ensis')\n", "print_wv_sim('terra', 'tellus')\n", "print_wv_sim('mors', 'letum')\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Similarity score for amor and bellum: -0.026459743634875108\n" ] } ], "source": [ "print_wv_sim('amor', 'bellum')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }