{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Illustrating common terms usage using Wikinews in english" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## getting data\n", "\n", "We get the cirrussearch dump of wikinews (a dump meant for elastic-search indexation)." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "LANG=\"english\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "--2019-05-12 19:33:27-- https://dumps.wikimedia.org/other/cirrussearch/20170327/enwikinews-20170327-cirrussearch-content.json.gz\n", "Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:4:208:80:155:106, 208.80.155.106\n", "Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:4:208:80:155:106|:443... connected.\n", "HTTP request sent, awaiting response... 404 Not Found\n", "2019-05-12 19:33:28 ERROR 404: Not Found.\n", "\n" ] }, { "ename": "CalledProcessError", "evalue": "Command 'b'\\nfdate=20170327\\nfname=enwikinews-$fdate-cirrussearch-content.json.gz\\nif [ ! -e $fname ]\\nthen\\n wget \"https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname\"\\nfi\\n'' returned non-zero exit status 8.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_cell_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'bash'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'\\nfdate=20170327\\nfname=enwikinews-$fdate-cirrussearch-content.json.gz\\nif [ ! -e $fname ]\\nthen\\n wget \"https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname\"\\nfi\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mrun_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m 2321\u001b[0m \u001b[0mmagic_arg_s\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvar_expand\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstack_depth\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2322\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2323\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmagic_arg_s\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2324\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2325\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mnamed_script_magic\u001b[0;34m(line, cell)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscript\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshebang\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[0;31m# write a basic docstring:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n", "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/IPython/core/magic.py\u001b[0m in \u001b[0;36m\u001b[0;34m(f, *a, **k)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_error\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m!=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mCalledProcessError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_script\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mto_close\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mCalledProcessError\u001b[0m: Command 'b'\\nfdate=20170327\\nfname=enwikinews-$fdate-cirrussearch-content.json.gz\\nif [ ! -e $fname ]\\nthen\\n wget \"https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname\"\\nfi\\n'' returned non-zero exit status 8." ] } ], "source": [ "%%bash\n", "\n", "fdate=20170327\n", "fname=enwikinews-$fdate-cirrussearch-content.json.gz\n", "if [ ! -e $fname ]\n", "then\n", " wget \"https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname\"\n", "fi\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# iterator\n", "import gzip\n", "import json\n", "\n", "FDATE = 20170327\n", "FNAME = \"enwikinews-%s-cirrussearch-content.json.gz\" % FDATE\n", "\n", "def iter_texts(fpath=FNAME):\n", " with gzip.open(fpath, \"rt\") as f:\n", " for l in f:\n", " data = json.loads(l)\n", " if \"title\" in data:\n", " yield data[\"title\"]\n", " yield data[\"text\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# also prepare nltk\n", "import nltk\n", "nltk.download(\"punkt\")\n", "nltk.download(\"stopwords\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparing data\n", "\n", "we arrange the corpus as required by gensim" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# make a custom tokenizer\n", "import re\n", "from nltk.tokenize import sent_tokenize\n", "from nltk.tokenize import RegexpTokenizer\n", "tokenizer = RegexpTokenizer('\\w[\\w-]*|\\d[\\d,]*')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# prepare a text\n", "def prepare(txt):\n", " # lower case\n", " txt = txt.lower()\n", " return [tokenizer.tokenize(sent) \n", " for sent in sent_tokenize(txt, language=LANG)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# we put all data in ram, it's not so much\n", "corpus = []\n", "for txt in iter_texts():\n", " corpus.extend(prepare(txt))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# how many sentences and words ?\n", "words_count = sum(len(s) for s in corpus)\n", "print(\"Corpus has %d words in %d sentences\" % (words_count, len(corpus)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Testing bigram with and without common terms\n", "\n", "The `Phrases` model gives us the possiblity of handling common terms, that is words that appears much time in a text and are there only to link objects between them.\n", "While you could remove them, you may information, for *\"the president is in america\"* is not the same as *\"the president of america\"*\n", "\n", "The common_terms parameter Phrases can help you deal with them in a smarter way, keeping them around but avoiding them to crush frequency statistics." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from gensim.models.phrases import Phrases" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# which are the stop words we will use\n", "from nltk.corpus import stopwords\n", "\" \".join(stopwords.words(LANG))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# a version of corups without stop words\n", "stop_words = frozenset(stopwords.words(LANG))\n", "def stopwords_filter(txt):\n", " return [w for w in txt if w not in stop_words]\n", "st_corpus = [stopwords_filter(txt) for txt in corpus]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# bigram std\n", "%time bigram = Phrases(st_corpus)\n", "# bigram with common terms\n", "%time bigram_ct = Phrases(corpus, common_terms=stopwords.words(LANG))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### bigram with common terms inside\n", "\n", "What are (some of) the bigram founds thanks to common terms" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# grams that have more than 2 terms, are those with common terms\n", "ct_ngrams = set((g[1], g[0].decode(\"utf-8\"))\n", " for g in bigram_ct.export_phrases(corpus) \n", " if len(g[0].split()) > 2)\n", "ct_ngrams = sorted(list(ct_ngrams))\n", "print(len(ct_ngrams), \"grams with common terms found\")\n", "# highest scores\n", "ct_ngrams[-20:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# did we found any bigram with same words but different stopwords\n", "import collections\n", "by_terms = collections.defaultdict(set)\n", "for ngram, score in bigram_ct.export_phrases(corpus):\n", " grams = ngram.split()\n", " by_terms[(grams[0], grams[-1])].add(ngram)\n", "for k, v in by_terms.items():\n", " if len(v) > 1:\n", " print(b\"-\".join(k).decode(\"utf-8\"),\" : \", [w.decode(\"utf-8\") for w in v])\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }