{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Imports\n", "\n", "import os\n", "import string\n", "import math\n", "import re\n", "from collections import Counter\n", "from pprint import pprint\n", "import html \n", "\n", "import numpy as np\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "from cltk.corpus.latin import latinlibrary\n", "from cltk.tokenize.word import WordTokenizer\n", "from cltk.stem.latin.j_v import JVReplacer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Setup CLTK tools\n", "\n", "word_tokenizer = WordTokenizer('latin')\n", "replacer = JVReplacer()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Setup files\n", "\n", "files = latinlibrary.fileids()\n", "\n", "# Typical setup\n", "#files = [file for file in files]\n", "\n", "# Filter for classical texts\n", "classical = []\n", "\n", "remove = [\"The Bible\",\"Ius Romanum\",\"Papal Bulls\",\"Medieval Latin\",\"Christian Latin\",\"Christina Latin\",\"Neo-Latin\",\"The Miscellany\",\"Contemporary Latin\"]\n", "\n", "for file in files:\n", " raw = latinlibrary.raw(file)\n", " if not any(x in raw for x in remove):\n", " classical.append(file)\n", "\n", "files = classical" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Preprocess texts\n", "\n", "def preprocess(text): \n", "\n", " text = html.unescape(text) # Handle html entities\n", " text = re.sub(r' ?', ' ',text) #  stripped incorrectly in corpus?\n", " text = re.sub('\\x00',' ',text) #Another space problem?\n", " \n", " text = text.lower()\n", " text = replacer.replace(text) #Normalize u/v & i/j\n", " \n", " punctuation =\"\\\"#$%&\\'()*+,-/:;<=>@[\\]^_`{|}~.?!«»\"\n", " translator = str.maketrans({key: \" \" for key in punctuation})\n", " text = text.translate(translator)\n", " \n", " translator = str.maketrans({key: \" \" for key in '0123456789'})\n", " text = text.translate(translator)\n", "\n", " remove_list = [r'\\bthe latin library\\b',\n", " r'\\bthe classics page\\b',\n", " r'\\bneo-latin\\b', \n", " r'\\bmedieval latin\\b',\n", " r'\\bchristian latin\\b',\n", " r'\\bchristina latin\\b',\n", " r'\\bpapal bulls\\b',\n", " r'\\bthe miscellany\\b',\n", " ]\n", "\n", " for pattern in remove_list:\n", " text = re.sub(pattern, '', text)\n", " \n", " text = re.sub('[ ]+',' ', text) # Remove double spaces\n", " text = re.sub('\\s+\\n+\\s+','\\n', text) # Remove double lines and trim spaces around new lines\n", " \n", " return text" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Make list of texts\n", "\n", "raw_files = []\n", "\n", "for file in files:\n", " raw = latinlibrary.raw(file)\n", " raw = preprocess(raw)\n", " if len(raw) < 1000:\n", " pass\n", " else:\n", " raw_tokens = raw.split()\n", " raw = \" \".join(raw_tokens[50:-50])\n", " raw_files.append(raw)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Following [Alajmi 2012]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Make document-term matrix and vocabulary\n", "\n", "vectorizer = CountVectorizer(input='content')\n", "dtm = vectorizer.fit_transform(raw_files)\n", "dtm = dtm.toarray()\n", "\n", "vocab = vectorizer.get_feature_names()\n", "vocab = np.array(vocab)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, 
"outputs": [], "source": [ "M = len(vocab)\n", "N= len(raw_files)\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Make array of probabilities per book\n", "\n", "raw_lengths = [len(tokens.split()) for tokens in raw_files]\n", "l = np.array(raw_lengths)\n", "ll = l.reshape(len(l),1)\n", "\n", "probs = dtm/ll\n", "\n", "P=probs" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Calculate mean probability\n", "# i.e. Sum of probabilities for each word / number of documents\n", "\n", "probsum = np.ravel(probs.sum(axis=0))\n", "MP = probsum/N" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Make array of bar probability\n", "\n", "length = sum(raw_lengths)\n", "barprobs = dtm/length\n", "bP=barprobs" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "variance = (P-bP) ** 2\n", "varsum = np.ravel(variance.sum(axis=0))\n", "VP = varsum/N" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "cutoff = 100" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['et', 'in', 'non', 'est', 'ut', 'cum', 'ad', 'quod', 'qui', 'si', 'sed', 'quam', 'quae', 'esse', 'de', 'ex', 'nec', 'aut', 'quid', 'se', 'atque', 'hoc', 'me', 'etiam', 'ab', 'te', 'enim', 'per', 'sunt', 'ac', 'ne', 'quo', 'iam', 'mihi', 'haec', 'tamen', 'id', 'sit', 'neque', 'tibi', 'quidem', 'ego', 'ita', 'nam', 'autem', 'eius', 'nihil', 'nunc', 'erat', 'quoque', 'eo', 'ea', 'tu', 'modo', 'tum', 'quem', 'quibus', 'inter', 'ille', 'esset', 'qua', 'fuit', 'sic', 'ipse', 'pro', 'hic', 'omnia', 'nisi', 'uel', 'illa', 'ubi', 'ante', 'tam', 'res', 'sibi', 'sine', 'an', 'eum', 'his', 'uero', 'causa', 'quia', 'quos', 'quis', 'at', 'omnes', 'apud', 'magis', 'nos', 'post', 'rem', 're', 'dum', 'omnibus', 'igitur', 'potest', 'tantum', 'inquit', 'deinde', 'itaque']\n" ] } ], "source": [ "# Return top counts\n", "\n", "freq = np.ravel(dtm.sum(axis=0))\n", "wordfreq = list(zip(vocab,freq))\n", "wordfreq.sort(key=lambda x: x[1], reverse=True)\n", "wf = [item[0] for item in wordfreq]\n", "wf = wf[:cutoff]\n", "print(wf)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['et', 'in', 'est', 'non', 'cum', 'ut', 'ad', 'quod', 'qui', 'sed', 'si', 'quae', 'quam', 'nec', 'ex', 'de', 'esse', 'se', 'aut', 'atque', 'per', 'quid', 'ab', 'sunt', 'me', 'hoc', 'te', 'etiam', 'quo', 'ne', 'mihi', 'iam', 'enim', 'ac', 'haec', 'tibi', 'tamen', 'eius', 'neque', 'nam', 'id', 'sit', 'ego', 'nunc', 'erat', 'quoque', 'ille', 'inter', 'ita', 'quidem', 'fuit', 'quem', 'autem', 'ipse', 'quibus', 'eo', 'tu', 'esset', 'qua', 'sic', 'nihil', 'hic', 'ea', 'tum', 'ante', 'modo', 'pro', 'sibi', 'eum', 'illa', 'his', 'nisi', 'omnia', 'uel', 'post', 'res', 'sine', 'tam', 'bellum', 'ubi', 'quos', 'at', 'apud', 'omnes', 'quis', 'dum', 'uero', 'causa', 'an', 'tantum', 'omnibus', 'cui', 'cuius', 'rem', 'deinde', 'sub', 'nos', 'primum', 'sua', 'igitur']\n" ] } ], "source": [ "# Return top mean prob\n", "\n", "test = list(zip(vocab,MP))\n", "test.sort(key=lambda x: x[1], reverse=True)\n", "mp = [item[0] for item in test]\n", "mp = mp[:cutoff]\n", "print(mp)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", 
"text": [ "['et', 'in', 'est', 'non', 'cum', 'ut', 'ad', 'quod', 'qui', 'si', 'sed', 'quae', 'quam', 'de', 'nec', 'ex', 'me', 'esse', 'te', 'se', 'aut', 'per', 'atque', 'quid', 'sunt', 'ab', 'mihi', 'tibi', 'hoc', 'ac', 'enim', 'etiam', 'ne', 'iam', 'quo', 'eius', 'ego', 'neque', 'bellum', 'id', 'autem', 'haec', 'esset', 'erat', 'tamen', 'cos', 'sit', 'inter', 'quoque', 'tu', 'fuit', 'nam', 'quidem', 'nunc', 'eo', 'quem', 'ita', 'ille', 'senatu', 'qua', 'quibus', 'eum', 'ipse', 'ante', 'hic', 'aduersus', 'senatus', 'nihil', 'ea', 'res', 'sic', 'antiocho', 'sibi', 'post', 'tum', 'uel', 'deinde', 'his', 'rem', 'illa', 'causa', 'milia', 'apud', 'pro', 'cn', 'inquit', 'contra', 'modo', 'exercitu', 'omnia', 'nisi', 'gel', 'eorum', 'marius', 'bello', 'sine', 'philippo', 'regem', 'mea', 'fr']\n" ] } ], "source": [ "# Return top variance prob\n", "\n", "test = list(zip(vocab,VP))\n", "test.sort(key=lambda x: x[1], reverse=True)\n", "vp = [item[0] for item in test]\n", "vp = vp[:cutoff]\n", "print(vp)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with np.errstate(divide='ignore', invalid='ignore'):\n", " logprobs = np.where(probs != 0, np.log10(1/probs), 0)\n", "ent = probs * logprobs" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['et', 'in', 'est', 'non', 'cum', 'ut', 'ad', 'quod', 'qui', 'sed', 'si', 'quae', 'quam', 'nec', 'ex', 'de', 'esse', 'se', 'aut', 'per', 'atque', 'quid', 'ab', 'sunt', 'hoc', 'quo', 'etiam', 'ne', 'me', 'te', 'iam', 'enim', 'ac', 'mihi', 'haec', 'tamen', 'tibi', 'nam', 'eius', 'neque', 'sit', 'nunc', 'id', 'erat', 'ille', 'quem', 'quoque', 'inter', 'ita', 'fuit', 'quidem', 'ego', 'ipse', 'quibus', 'autem', 'qua', 'eo', 'sic', 'esset', 'tu', 'nihil', 'hic', 'ea', 'tum', 'modo', 'ante', 'pro', 'sibi', 'omnia', 'his', 'nisi', 'illa', 'uel', 'post', 'res', 'eum', 'sine', 'tam', 'quos', 'ubi', 'omnes', 'at', 'dum', 'quis', 'apud', 'uero', 'causa', 'bellum', 'tantum', 'an', 'cui', 'omnibus', 'cuius', 'primum', 'nos', 'sua', 'sub', 'rem', 'magis', 'deinde']\n" ] } ], "source": [ "ents = np.ravel(ent.sum(axis=0))\n", "entrank = list(zip(vocab,ents))\n", "entrank.sort(key=lambda x: x[1], reverse=True)\n", "e = [item[0] for item in entrank]\n", "e = e[:cutoff]\n", "print(e)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def borda_sort(lists):\n", " ### From http://stackoverflow.com/a/30259368/1816347 ###\n", " scores = {}\n", " for l in lists:\n", " for idx, elem in enumerate(reversed(l)):\n", " if not elem in scores:\n", " scores[elem] = 0\n", " scores[elem] += idx\n", " return sorted(scores.keys(), key=lambda elem: scores[elem], reverse=True)\n", "\n", "lists = [wf, mp, vp, e]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['et', 'in', 'est', 'non', 'cum', 'ut', 'ad', 'quod', 'qui', 'sed', 'si', 'quae', 'quam', 'nec', 'de', 'ex', 'esse', 'se', 'aut', 'atque', 'quid', 'per', 'me', 'ab', 'sunt', 'te', 'hoc', 'etiam', 'ne', 'quo', 'enim', 'mihi', 'ac', 'iam', 'tibi', 'haec', 'tamen', 'neque', 'eius', 'id', 'sit', 'ego', 'nam', 'erat', 'nunc', 'quoque', 'autem', 'quidem', 'ita', 'inter', 'ille', 'quem', 'fuit', 'eo', 'esset', 'tu', 'quibus', 'ipse', 'qua', 'nihil', 'ea', 'sic', 'hic', 'tum', 'ante', 'modo', 'pro', 'sibi', 'eum', 'illa', 'uel', 'res', 'his', 'omnia', 
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['et', 'in', 'est', 'non', 'cum', 'ut', 'ad', 'quod', 'qui', 'sed', 'si', 'quae', 'quam', 'nec', 'de', 'ex', 'esse', 'se', 'aut', 'atque', 'quid', 'per', 'me', 'ab', 'sunt', 'te', 'hoc', 'etiam', 'ne', 'quo', 'enim', 'mihi', 'ac', 'iam', 'tibi', 'haec', 'tamen', 'neque', 'eius', 'id', 'sit', 'ego', 'nam', 'erat', 'nunc', 'quoque', 'autem', 'quidem', 'ita', 'inter', 'ille', 'quem', 'fuit', 'eo', 'esset', 'tu', 'quibus', 'ipse', 'qua', 'nihil', 'ea', 'sic', 'hic', 'tum', 'ante', 'modo', 'pro', 'sibi', 'eum', 'illa', 'uel', 'res', 'his', 'omnia', 'nisi', 'bellum', 'post', 'sine', 'tam', 'ubi', 'causa', 'apud', 'quos', 'cos', 'at', 'omnes', 'quis', 'uero', 'an', 'senatu', 'rem', 'dum', 'aduersus', 'senatus', 'deinde', 'antiocho', 'tantum', 'omnibus', 'nos', 'quia', 'milia', 'cui', 'inquit', 'cn', 'cuius', 'contra', 'magis', 'exercitu', 're', 'primum', 'gel', 'sub', 'eorum', 'marius', 'sua', 'igitur', 'bello', 'potest', 'philippo', 'regem', 'mea', 'fr', 'itaque']\n" ] } ], "source": [ "print(borda_sort(lists))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Other Latin stopword lists" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Base stopword list from the Tesserae project\n", "tesserae_base = ['qui', 'quis', 'et', 'sum', 'in', 'is', 'non', 'hic', 'ego', 'ut']" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Cf. http://www.perseus.tufts.edu/hopper/stopwords\n", "# Same as the list importable with:\n", "# from cltk.stop.latin.stops import STOPS_LIST\n", "perseus = ['ab', 'ac', 'ad', 'adhic', 'aliqui', 'aliquis', 'an', 'ante', 'apud', 'at', 'atque', 'aut', 'autem', 'cum', 'cur', 'de', 'deinde', 'dum', 'ego', 'enim', 'ergo', 'es', 'est', 'et', 'etiam', 'etsi', 'ex', 'fio', 'haud', 'hic', 'iam', 'idem', 'igitur', 'ille', 'in', 'infra', 'inter', 'interim', 'ipse', 'is', 'ita', 'magis', 'modo', 'mox', 'nam', 'ne', 'nec', 'necque', 'neque', 'nisi', 'non', 'nos', 'o', 'ob', 'per', 'possum', 'post', 'pro', 'quae', 'quam', 'quare', 'qui', 'quia', 'quicumque', 'quidem', 'quilibet', 'quis', 'quisnam', 'quisquam', 'quisque', 'quisquis', 'quo', 'quoniam', 'sed', 'si', 'sic', 'sive', 'sub', 'sui', 'sum', 'super', 'suus', 'tam', 'tamen', 'trans', 'tu', 'tum', 'ubi', 'uel', 'uero', 'unus', 'ut']" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### References" ] },
{ "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "- Alajmi, A., Saad, E.M., and Darwish, R.R. 2012. \"Toward an Arabic Stop-Words List Generation.\" *International Journal of Computer Applications* 48(8): 8-13." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 }