{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Imports\n", "import html\n", "import string\n", "import re\n", "\n", "import collections\n", "\n", "import gensim\n", "from gensim.models import Word2Vec\n", "\n", "from nltk.tokenize import PunktSentenceTokenizer\n", "\n", "from cltk.stem.latin.j_v import JVReplacer\n", "from cltk.corpus.latin import latinlibrary\n", "\n", "from matplotlib import pyplot\n", "\n", "from pprint import pprint\n", "import pickle" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Set up NLP tools\n", "replacer = JVReplacer()\n", "tokenizer = PunktSentenceTokenizer()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Preprocess texts\n", "def preprocess(text):\n", " \n", " text = html.unescape(text) # Handle html entities\n", " \n", " text = text.lower()\n", " text = replacer.replace(text) #Normalize u/v & i/j\n", " \n", " punctuation =\"\\\"#$%&\\'()*+,-/:;<=>@[\\]^_`{|}~.?!«»\"\n", " translator = str.maketrans({key: \" \" for key in punctuation})\n", " text = text.translate(translator)\n", " \n", " translator = str.maketrans({key: \" \" for key in '0123456789'})\n", " text = text.translate(translator)\n", " \n", " return text" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Build word2vec model on Latin Library texts\n", "\n", "## Results of following lines are pickled as ll_w2v.p\n", "# ll_raw = latinlibrary.raw()\n", "# ll_sentences = tokenizer.tokenize(ll_raw)\n", "# ll_sentences = [preprocess(sent).split() for sent in ll_sentences]\n", "# ll_model = gensim.models.Word2Vec(ll_sentences, min_count=2, size=300, workers=4)\n", "\n", "ll_model = pickle.load(open(\"./data/ll_w2v.p\", \"rb\" ))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('regina', 0.6140977144241333),\n", " ('matre', 0.6033270955085754),\n", " ('coniuge', 0.5800632834434509)]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use 'most-similar' to produce a Latin version of:\n", "# king - man + woman = queen\n", "ll_model.wv.most_similar(positive=['rex', 'femina'], \n", " negative=['uir'], topn=3)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('regina', 0.5619041323661804),\n", " ('uxor', 0.5604838132858276),\n", " ('mater', 0.5291939377784729)]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# king - man + woman = queen; mulier variation\n", "ll_model.wv.most_similar(positive=['rex', 'mulier'], \n", " negative=['uir'], topn=3)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('rex', 0.7597681283950806),\n", " ('comes', 0.7183645963668823),\n", " ('dux', 0.7138530015945435)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# queen - woman + man = king\n", "ll_model.wv.most_similar(positive=['regina', 'uir'], \n", " negative=['femina'], topn=3)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('mater', 0.6681894659996033),\n", " ('uxor', 0.6311129927635193),\n", " ('puella', 0.6174641847610474)]" ] }, "execution_count": 8, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "# father - man + woman = mother\n", "ll_model.wv.most_similar(positive=['pater', 'mulier'], \n", " negative=['uir'], topn=3)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'grauis'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use 'doesnt_match' to remove non-color from list of colors\n", "ll_model.wv.doesnt_match('ruber flauus uiridis caerulus purpureus grauis'.split())" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'nero'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# ...or 'bad' emperor from 'good'\n", "ll_model.wv.doesnt_match('augustus nero nerua traianus hadrianus antoninus'.split())" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Similarity score for rex and regina: 0.6395386929624856\n" ] } ], "source": [ "# Some similarity measures...\n", "def print_wv_sim(word1, word2):\n", " print(\"Similarity score for {} and {}: {}\".format(word1, word2, ll_model.wv.similarity(word1, word2)))\n", "\n", "print_wv_sim('rex', 'regina')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Similarity score for femina and mulier: 0.717120040628728\n" ] } ], "source": [ "# Some similarity measures...\n", "print_wv_sim('femina', 'mulier')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Similarity score for uxor and coniunx: 0.7239514246158751\n", "Similarity score for puella and uirgo: 0.8496341449771854\n", "Similarity score for famulus and famula: 0.7061932968048992\n", "Similarity score for lassus and fessus: 0.820626450365237\n", "Similarity score for gladius and ensis: 0.7989289913212123\n", "Similarity score for terra and tellus: 0.5780294477506212\n", "Similarity score for mors and letum: 0.3612532580370732\n" ] } ], "source": [ "# Some examples from Axelson's 'Unpoetische Wörter'\n", "print_wv_sim('uxor', 'coniunx')\n", "print_wv_sim('puella', 'uirgo')\n", "print_wv_sim('famulus', 'famula')\n", "print_wv_sim('lassus', 'fessus')\n", "print_wv_sim('gladius', 'ensis')\n", "print_wv_sim('terra', 'tellus')\n", "print_wv_sim('mors', 'letum')\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Similarity score for amor and bellum: -0.026459743634875108\n" ] } ], "source": [ "print_wv_sim('amor', 'bellum')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }