{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## NMF topic modeling on 20 newsgroups\n",
    "\n",
    "This notebook is basically expanded version of [this example](http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py) from scikit-learn documentation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from sklearn.decomposition import NMF\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "\n",
    "n_samples = 8000\n",
    "n_features = 1000\n",
    "n_components = 10\n",
    "n_top_words = 20\n",
    "\n",
    "\n",
    "def kl_loss(x, y, eps=1e-10):\n",
    "    return -(x.toarray() * np.log(y+eps)).sum() / x.shape[0]\n",
    "\n",
    "\n",
    "def frobenius_loss(x, y):\n",
    "    return np.square(x - y).sum() / x.shape[0]\n",
    "\n",
    "\n",
    "def print_top_words(model, feature_names, n_top_words):\n",
    "    for topic_idx, topic in enumerate(model.components_):\n",
    "        print(\"Topic #%d: \" % topic_idx)\n",
    "        topic_words = \" \".join([feature_names[i]\n",
    "                             for i in topic.argsort()[:-n_top_words - 1:-1]])\n",
    "        print(topic_words)\n",
    "    print()\n",
    "    \n",
    "    \n",
    "def score_model(model, data):\n",
    "    if model.beta_loss == 'kullback-leibler':\n",
    "        loss_function =  kl_loss\n",
    "    elif model.beta_loss == 'frobenius':\n",
    "        loss_function = frobenius_loss\n",
    "    \n",
    "    reduced_data = model.transform(data)\n",
    "    reconstructed_data = model.inverse_transform(reduced_data)\n",
    "    \n",
    "    return loss_function(data, reconstructed_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading dataset...\n",
      "CPU times: user 1.86 s, sys: 69.9 ms, total: 1.93 s\n",
      "Wall time: 1.97 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"Loading dataset...\")\n",
    "dataset = fetch_20newsgroups(shuffle=True, random_state=1,\n",
    "                             remove=('headers', 'footers', 'quotes'))\n",
    "\n",
    "data_train = dataset.data[:n_samples]\n",
    "data_test = dataset.data[n_samples:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use tf-idf features for NMF.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting tf-idf features for NMF...\n",
      "CPU times: user 2.42 s, sys: 8.18 ms, total: 2.43 s\n",
      "Wall time: 2.43 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"Extracting tf-idf features for NMF...\")\n",
    "tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,\n",
    "                                   max_features=n_features,\n",
    "                                   stop_words='english')\n",
    "\n",
    "tfidf_train = tfidf_vectorizer.fit_transform(data_train)\n",
    "tfidf_test = tfidf_vectorizer.transform(data_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### NMF model with Frobenius loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=8000 and n_features=1000...\n",
      "CPU times: user 1.39 s, sys: 40.4 ms, total: 1.43 s\n",
      "Wall time: 828 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"Fitting the NMF model (Frobenius norm) with tf-idf features, \"\n",
    "      \"n_samples=%d and n_features=%d...\"\n",
    "      % (n_samples, n_features))\n",
    "frobenius_nmf = NMF(n_components=n_components, random_state=1,\n",
    "          alpha=.1, l1_ratio=.5).fit(tfidf_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train reconstruction error: 0.890941957403\n",
      "test reconstruction error: 0.892431321223\n"
     ]
    }
   ],
   "source": [
    "print('train reconstruction error:', score_model(frobenius_nmf, tfidf_train))\n",
    "print('test reconstruction error:', score_model(frobenius_nmf, tfidf_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic #0: \n",
      "just don people think like know good time right ve make say did way really want going said ll thing\n",
      "Topic #1: \n",
      "card video monitor drivers cards vga bus driver color ram graphics mode bit board memory pc 16 speed performance controller\n",
      "Topic #2: \n",
      "god jesus bible christ faith believe christians christian church sin lord does life man hell truth belief say love father\n",
      "Topic #3: \n",
      "key chip clipper encryption keys government escrow use algorithm public nsa security phone secure law chips des data bit enforcement\n",
      "Topic #4: \n",
      "new 00 car sale 10 price shipping offer 50 20 15 condition 12 interested 11 used 30 25 sell old\n",
      "Topic #5: \n",
      "thanks does know mail advance hi info looking help anybody address appreciated email information post interested reply send like need\n",
      "Topic #6: \n",
      "windows file use dos files program using window problem running run version pc server application screen software ms ftp help\n",
      "Topic #7: \n",
      "edu soon cs university com internet ftp article pub send email mit david mail address ibm apr reply available export\n",
      "Topic #8: \n",
      "game team games year season play players hockey win league player nhl teams best played runs better hit think good\n",
      "Topic #9: \n",
      "drive scsi hard drives disk ide floppy controller mac cd power rom internal mb cable problem tape bus computer format\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tfidf_feature_names = tfidf_vectorizer.get_feature_names()\n",
    "print_top_words(frobenius_nmf, tfidf_feature_names, n_top_words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### NMF model with KL-divergence loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=8000 and n_features=1000...\n",
      "CPU times: user 12.1 s, sys: 380 ms, total: 12.5 s\n",
      "Wall time: 6.25 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"Fitting the NMF model (generalized Kullback-Leibler divergence) with \"\n",
    "      \"tf-idf features, n_samples=%d and n_features=%d...\"\n",
    "      % (n_samples, n_features))\n",
    "kl_nmf = NMF(n_components=n_components, random_state=1,\n",
    "          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,\n",
    "          l1_ratio=0.9).fit(tfidf_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train reconstruction error: 18.355714861\n",
      "test reconstruction error: 18.2931233004\n"
     ]
    }
   ],
   "source": [
    "print('train reconstruction error:', score_model(kl_nmf, tfidf_train))\n",
    "print('test reconstruction error:', score_model(kl_nmf, tfidf_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic #0: \n",
      "time like way right really did years good said make just think don long thing going new say want know\n",
      "Topic #1: \n",
      "use thanks need used using software work help does card hi drive video pc mac computer problem new like speed\n",
      "Topic #2: \n",
      "god question does say people believe true read word jesus says point religion bible life christian claim christians mean faith\n",
      "Topic #3: \n",
      "use government people public make state law used key number fact chip using rights note case legal war keys large\n",
      "Topic #4: \n",
      "new sale 10 year 20 15 shipping offer 12 50 following 16 1993 11 price years 30 00 condition 25\n",
      "Topic #5: \n",
      "thanks know mail post does information looking like com send interested email list address info reply net group advance help\n",
      "Topic #6: \n",
      "windows program file problem using run use version running files like window sun ftp try look available code image server\n",
      "Topic #7: \n",
      "just edu like don want try ve soon thing think things stuff sure oh case car deleted tell people bike\n",
      "Topic #8: \n",
      "good just does team ve game ll doesn better sure heard probably really thought got season mean isn play way\n",
      "Topic #9: \n",
      "think don know people year make win world let second won wouldn did actually mr come drive local hard said\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tfidf_feature_names = tfidf_vectorizer.get_feature_names()\n",
    "print_top_words(kl_nmf, tfidf_feature_names, n_top_words)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nnets",
   "language": "python",
   "name": "nnets"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}