{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Illustrating common terms usage using Wikinews in english"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## getting data\n",
    "\n",
    "We get the cirrussearch dump of wikinews (a dump meant for elastic-search indexation)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "LANG=\"english\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "--2019-05-12 19:33:27--  https://dumps.wikimedia.org/other/cirrussearch/20170327/enwikinews-20170327-cirrussearch-content.json.gz\n",
      "Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:4:208:80:155:106, 208.80.155.106\n",
      "Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:4:208:80:155:106|:443... connected.\n",
      "HTTP request sent, awaiting response... 404 Not Found\n",
      "2019-05-12 19:33:28 ERROR 404: Not Found.\n",
      "\n"
     ]
    },
    {
     "ename": "CalledProcessError",
     "evalue": "Command 'b'\\nfdate=20170327\\nfname=enwikinews-$fdate-cirrussearch-content.json.gz\\nif [ ! -e  $fname ]\\nthen\\n    wget \"https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname\"\\nfi\\n'' returned non-zero exit status 8.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mCalledProcessError\u001b[0m                        Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-2-fd54bac10b20>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_cell_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'bash'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'\\nfdate=20170327\\nfname=enwikinews-$fdate-cirrussearch-content.json.gz\\nif [ ! -e  $fname ]\\nthen\\n    wget \"https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname\"\\nfi\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mrun_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m   2321\u001b[0m             \u001b[0mmagic_arg_s\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvar_expand\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstack_depth\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2322\u001b[0m             \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2323\u001b[0;31m                 \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmagic_arg_s\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2324\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2325\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mnamed_script_magic\u001b[0;34m(line, cell)\u001b[0m\n\u001b[1;32m    140\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    141\u001b[0m                 \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscript\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshebang\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    144\u001b[0m         \u001b[0;31m# write a basic docstring:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<decorator-gen-109>\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n",
      "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/IPython/core/magic.py\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(f, *a, **k)\u001b[0m\n\u001b[1;32m    185\u001b[0m     \u001b[0;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    186\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m         \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    189\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mcallable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n\u001b[1;32m    243\u001b[0m             \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    244\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_error\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m!=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mCalledProcessError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    247\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_run_script\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mto_close\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mCalledProcessError\u001b[0m: Command 'b'\\nfdate=20170327\\nfname=enwikinews-$fdate-cirrussearch-content.json.gz\\nif [ ! -e  $fname ]\\nthen\\n    wget \"https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname\"\\nfi\\n'' returned non-zero exit status 8."
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "fdate=20170327\n",
    "fname=enwikinews-$fdate-cirrussearch-content.json.gz\n",
    "if [ ! -e  $fname ]\n",
    "then\n",
    "    wget \"https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname\"\n",
    "fi\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# iterator\n",
    "import gzip\n",
    "import json\n",
    "\n",
    "FDATE = 20170327\n",
    "FNAME = \"enwikinews-%s-cirrussearch-content.json.gz\" % FDATE\n",
    "\n",
    "def iter_texts(fpath=FNAME):\n",
    "    with gzip.open(fpath, \"rt\") as f:\n",
    "        for l in f:\n",
    "            data = json.loads(l)\n",
    "            if \"title\" in data:\n",
    "                yield data[\"title\"]\n",
    "                yield data[\"text\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# also prepare nltk\n",
    "import nltk\n",
    "nltk.download(\"punkt\")\n",
    "nltk.download(\"stopwords\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing data\n",
    "\n",
    "we arrange the corpus as required by gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make a custom tokenizer\n",
    "import re\n",
    "from nltk.tokenize import sent_tokenize\n",
    "from nltk.tokenize import RegexpTokenizer\n",
    "tokenizer = RegexpTokenizer('\\w[\\w-]*|\\d[\\d,]*')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# prepare a text\n",
    "def prepare(txt):\n",
    "    # lower case\n",
    "    txt = txt.lower()\n",
    "    return [tokenizer.tokenize(sent) \n",
    "            for sent in sent_tokenize(txt, language=LANG)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we put all data in ram, it's not so much\n",
    "corpus = []\n",
    "for txt in iter_texts():\n",
    "    corpus.extend(prepare(txt))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# how many sentences and words ?\n",
    "words_count = sum(len(s) for s in corpus)\n",
    "print(\"Corpus has %d words in %d sentences\" % (words_count, len(corpus)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Testing bigram with and without common terms\n",
    "\n",
    "The `Phrases` model gives us the possiblity of handling common terms, that is words that appears much time in a text and are there only to link objects between them.\n",
    "While you could remove them, you may information, for *\"the president is in america\"* is not the same as *\"the president of america\"*\n",
    "\n",
    "The common_terms parameter Phrases can help you deal with them in a smarter way, keeping them around but avoiding them to crush frequency statistics."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.models.phrases import Phrases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# which are the stop words we will use\n",
    "from nltk.corpus import stopwords\n",
    "\" \".join(stopwords.words(LANG))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a version of corups without stop words\n",
    "stop_words = frozenset(stopwords.words(LANG))\n",
    "def stopwords_filter(txt):\n",
    "    return [w for w in txt if w not in stop_words]\n",
    "st_corpus = [stopwords_filter(txt) for txt in corpus]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# bigram std\n",
    "%time bigram = Phrases(st_corpus)\n",
    "# bigram with common terms\n",
    "%time bigram_ct = Phrases(corpus, common_terms=stopwords.words(LANG))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### bigram with common terms inside\n",
    "\n",
    "What are (some of) the bigram founds thanks to common terms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# grams that have more than 2 terms, are those with common terms\n",
    "ct_ngrams = set((g[1], g[0].decode(\"utf-8\"))\n",
    "                     for g in bigram_ct.export_phrases(corpus) \n",
    "                     if len(g[0].split()) > 2)\n",
    "ct_ngrams = sorted(list(ct_ngrams))\n",
    "print(len(ct_ngrams), \"grams with common terms found\")\n",
    "# highest scores\n",
    "ct_ngrams[-20:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# did we found any bigram with same words but different stopwords\n",
    "import collections\n",
    "by_terms = collections.defaultdict(set)\n",
    "for ngram, score in bigram_ct.export_phrases(corpus):\n",
    "    grams = ngram.split()\n",
    "    by_terms[(grams[0], grams[-1])].add(ngram)\n",
    "for k, v  in by_terms.items():\n",
    "    if len(v) > 1:\n",
    "        print(b\"-\".join(k).decode(\"utf-8\"),\" : \", [w.decode(\"utf-8\") for w in v])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}