{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup Visdom\n", "\n", "Install it with:\n", "\n", "`pip install visdom`\n", "\n", "Start the server:\n", "\n", "`python -m visdom.server`\n", "\n", "Visdom now can be accessed at http://localhost:8097 in the browser.\n", "\n", "\n", "# LDA Training Visualization\n", "\n", "Knowing about the progress and performance of a model, as we train them, could be very helpful in understanding it’s learning process and makes it easier to debug and optimize them. In this notebook, we will learn how to visualize training statistics for LDA topic model in gensim. To monitor the training, a list of Metrics is passed to the LDA function call for plotting their values live as the training progresses. \n", "\n", "\n", "\n", "\n", "\n", "Let's plot the training stats for an LDA model being trained on kaggle's [fake news dataset](https://www.kaggle.com/mrisdal/fake-news). We will use the four evaluation metrics available for topic models in gensim: Coherence, Perplexity, Topic diff and Convergence. (using separate hold_out and test corpus for evaluating the perplexity)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "FileNotFoundError", "evalue": "[Errno 2] File b'fake.csv' does not exist: b'fake.csv'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mdf_fake\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'fake.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mdf_fake\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'title'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'text'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'language'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mdf_fake\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_fake\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnotnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_fake\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf_fake\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlanguage\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'english'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, 
"source": [ "from gensim.models import ldamodel\n", "from gensim.corpora import Dictionary\n", "import pandas as pd\n", "import re\n", "from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation\n", "\n", "import numpy as np\n", "\n", "df_fake = pd.read_csv('fake.csv')\n", "df_fake[['title', 'text', 'language']].head()\n", "df_fake = df_fake.loc[(pd.notnull(df_fake.text)) & (df_fake.language == 'english')]\n", "\n", "# remove stopwords and punctuation\n", "def preprocess(row):\n", "    return strip_punctuation(remove_stopwords(row.lower()))\n", "\n", "df_fake['text'] = df_fake['text'].apply(preprocess)\n", "\n", "# convert the data to the bag-of-words format required by LDA\n", "texts = []\n", "for line in df_fake.text:\n", "    lowered = line.lower()\n", "    # note: re.LOCALE cannot be combined with str patterns in Python 3, so only re.UNICODE is passed\n", "    words = re.findall(r'\\w+', lowered, flags=re.UNICODE)\n", "    texts.append(words)\n", "\n", "dictionary = Dictionary(texts)\n", "\n", "training_texts = texts[:5000]\n", "holdout_texts = texts[5000:7500]\n", "test_texts = texts[7500:10000]\n", "\n", "training_corpus = [dictionary.doc2bow(text) for text in training_texts]\n", "holdout_corpus = [dictionary.doc2bow(text) for text in holdout_texts]\n", "test_corpus = [dictionary.doc2bow(text) for text in test_texts]" ] },
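{ "cell_type": "markdown", "metadata": {}, "source": [ "Each document is now a bag-of-words: a list of `(token_id, token_count)` tuples built from the dictionary above. An optional sanity check on the prepared data (a minimal sketch that only uses the objects created in the previous cell):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# vocabulary size and the bag-of-words representation of the first training document\n", "print(len(dictionary))\n", "print(training_corpus[0][:10])" ] },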
{ "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric\n", "\n", "# define perplexity callbacks for the hold_out and test corpora\n", "pl_holdout = PerplexityMetric(corpus=holdout_corpus, logger=\"visdom\", title=\"Perplexity (hold_out)\")\n", "pl_test = PerplexityMetric(corpus=test_corpus, logger=\"visdom\", title=\"Perplexity (test)\")\n", "\n", "# define the other available metrics\n", "ch_umass = CoherenceMetric(corpus=training_corpus, coherence=\"u_mass\", logger=\"visdom\", title=\"Coherence (u_mass)\")\n", "ch_cv = CoherenceMetric(corpus=training_corpus, texts=training_texts, coherence=\"c_v\", logger=\"visdom\", title=\"Coherence (c_v)\")\n", "diff_kl = DiffMetric(distance=\"kullback_leibler\", logger=\"visdom\", title=\"Diff (kullback_leibler)\")\n", "convergence_jc = ConvergenceMetric(distance=\"jaccard\", logger=\"visdom\", title=\"Convergence (jaccard)\")\n", "\n", "callbacks = [pl_holdout, pl_test, ch_umass, ch_cv, diff_kl, convergence_jc]\n", "\n", "# train the LDA model\n", "model = ldamodel.LdaModel(corpus=training_corpus, id2word=dictionary, num_topics=35, passes=50, chunksize=1500, iterations=200, alpha='auto', callbacks=callbacks)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Once the model starts training, you can open http://localhost:8097 to see the training progress." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# to get a metric value on a trained model\n", "print(CoherenceMetric(corpus=training_corpus, coherence=\"u_mass\").get_value(model=model))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The four types of graphs plotted for LDA:\n", "\n", "**Coherence**\n", "\n", "Coherence measures are generally based on computing the sum of pairwise scores of the top *n* words w1, ..., wn used to describe a topic. There are four coherence measures available in gensim: `u_mass, c_v, c_uci, c_npmi`. A good model will generate coherent topics, i.e., topics with high topic coherence scores. Good topics can be described by a short label based on the topic terms they contain.\n", "\n", "This graph, along with the others explained below, can be used to decide when to stop training: once the value stops changing over several epochs, the model has reached the highest coherence it is likely to achieve.\n", "\n", "**Perplexity**\n", "\n", "Perplexity measures how well a probability model predicts a sample. In LDA, topics are described by a probability distribution over vocabulary words, so perplexity can be used to evaluate the topic-term distributions output by LDA. For a good model, perplexity should be low (see the short example after this section for computing it on a trained model).\n", "\n", "**Topic Difference**\n", "\n", "Topic Diff calculates the distance between two LDA models. The distance is computed per topic, either from the topics' probability distributions over vocabulary words (kullback_leibler, hellinger) or simply from the vocabulary words the topics of both models have in common (jaccard).\n", "\n", "In the heatmap, the X-axis is the epoch number and the Y-axis is the topic index; each cell holds the distance between that topic in two consecutive epochs. For example, a cell with (x=3, y=5, z=0.4) represents a distance of 0.4 between topic 5 from the 3rd epoch and topic 5 from the 2nd epoch. With increasing epochs, the distance between identical topics should decrease.\n", "\n", "**Convergence**\n", "\n", "Convergence is the sum of the differences between all identical topics from two consecutive epochs, i.e., the sum of a column's values in the heatmap above.\n", "\n", "The model is said to have converged when the convergence value stops decreasing with increasing epochs." ] },
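{ "cell_type": "markdown", "metadata": {}, "source": [ "As with coherence above, the other metric values can also be computed on an already trained model. A minimal sketch, assuming the `model`, `holdout_corpus` and `test_corpus` objects from the cells above (the `get_value` call mirrors the `CoherenceMetric` example):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# perplexity of the trained model on the hold_out and test corpora (lower is better)\n", "print(PerplexityMetric(corpus=holdout_corpus).get_value(model=model))\n", "print(PerplexityMetric(corpus=test_corpus).get_value(model=model))\n", "\n", "# topic-by-topic distances between two models can also be inspected directly with\n", "# LdaModel.diff, e.g. mdiff, annotation = model.diff(other_model, distance='jaccard')" ] },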
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Training Logs\n", "\n", "Besides visualizing the metric values in Visdom, we can also log them to the shell after every epoch. The only change is to pass `logger=\"shell\"` instead of `\"visdom\"` in the callbacks." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "import logging\n", "from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric\n", "\n", "logging.basicConfig(level=logging.INFO)\n", "logger = logging.getLogger(__name__)\n", "logger.setLevel(logging.DEBUG)\n", "\n", "# define perplexity callbacks for the hold_out and test corpora\n", "pl_holdout = PerplexityMetric(corpus=holdout_corpus, logger=\"shell\", title=\"Perplexity (hold_out)\")\n", "pl_test = PerplexityMetric(corpus=test_corpus, logger=\"shell\", title=\"Perplexity (test)\")\n", "\n", "# define the other available metrics\n", "ch_umass = CoherenceMetric(corpus=training_corpus, coherence=\"u_mass\", logger=\"shell\", title=\"Coherence (u_mass)\")\n", "diff_kl = DiffMetric(distance=\"kullback_leibler\", logger=\"shell\", title=\"Diff (kullback_leibler)\")\n", "convergence_jc = ConvergenceMetric(distance=\"jaccard\", logger=\"shell\", title=\"Convergence (jaccard)\")\n", "\n", "callbacks = [pl_holdout, pl_test, ch_umass, diff_kl, convergence_jc]\n", "\n", "# train the LDA model\n", "model = ldamodel.LdaModel(corpus=training_corpus, id2word=dictionary, num_topics=35, passes=2, eval_every=None, callbacks=callbacks)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The metric values can also be accessed from the model instance for custom uses." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.metrics" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }