{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## NMF topic modeling on 20 newsgroups\n",
    "\n",
    "This notebook is basically an expanded version of [this example](http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py) from the scikit-learn documentation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from sklearn.decomposition import NMF\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "\n",
    "n_samples = 8000\n",
    "n_features = 1000\n",
    "n_components = 10\n",
    "n_top_words = 20\n",
    "\n",
    "\n",
    "def kl_loss(x, y, eps=1e-10):\n",
    "    \"\"\"Return -sum(x * log(y + eps)) divided by the number of rows of `x`.\n",
    "\n",
    "    This is only the cross term -x*log(y); the x*log(x) - x + y terms of the\n",
    "    generalized KL divergence are not included, so the value is meant for\n",
    "    comparing reconstructions of the same kind of data, not as an absolute\n",
    "    divergence.\n",
    "\n",
    "    `x` must provide `.toarray()` and `.shape` (e.g. a scipy sparse matrix),\n",
    "    `y` is the dense reconstruction, and `eps` keeps the logarithm finite\n",
    "    where `y` is zero.\n",
    "    \"\"\"\n",
    "    return -(x.toarray() * np.log(y + eps)).sum() / x.shape[0]\n",
    "\n",
    "\n",
    "def frobenius_loss(x, y):\n",
    "    \"\"\"Return the sum of squared differences between `x` and `y`,\n",
    "    divided by the number of rows of `x`.\n",
    "    \"\"\"\n",
    "    return np.square(x - y).sum() / x.shape[0]\n",
    "\n",
    "\n",
    "def print_top_words(model, feature_names, n_top_words):\n",
    "    \"\"\"Print the `n_top_words` highest-weighted features of every topic.\n",
    "\n",
    "    Topics are the rows of `model.components_`; `feature_names[i]` is the\n",
    "    label of column `i`. A blank line is printed after the last topic.\n",
    "    \"\"\"\n",
    "    for topic_idx, topic in enumerate(model.components_):\n",
    "        print(\"Topic #%d: \" % topic_idx)\n",
    "        # argsort is ascending, so take the last n_top_words indices in reverse.\n",
    "        top_indices = topic.argsort()[:-n_top_words - 1:-1]\n",
    "        print(\" \".join(feature_names[i] for i in top_indices))\n",
    "    print()\n",
    "\n",
    "\n",
    "def score_model(model, data):\n",
    "    \"\"\"Return the reconstruction loss of `model` on `data`.\n",
    "\n",
    "    `data` is projected with `model.transform`, mapped back with\n",
    "    `model.inverse_transform`, and compared with the original using the loss\n",
    "    selected by `model.beta_loss` ('kullback-leibler' -> `kl_loss`,\n",
    "    'frobenius' -> `frobenius_loss`).\n",
    "\n",
    "    Raises ValueError for any other `model.beta_loss` value.\n",
    "    \"\"\"\n",
    "    if model.beta_loss == 'kullback-leibler':\n",
    "        loss_function = kl_loss\n",
    "    elif model.beta_loss == 'frobenius':\n",
    "        loss_function = frobenius_loss\n",
    "    else:\n",
    "        # Fail early with a clear message instead of an UnboundLocalError below.\n",
    "        raise ValueError('Unsupported beta_loss for scoring: %r' % (model.beta_loss,))\n",
    "\n",
    "    reduced_data = model.transform(data)\n",
    "    reconstructed_data = model.inverse_transform(reduced_data)\n",
    "\n",
    "    return loss_function(data, reconstructed_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading dataset...\n",
      "CPU times: user 1.86 s, sys: 69.9 ms, total: 1.93 s\n",
      "Wall time: 1.97 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"Loading dataset...\")\n",
    "dataset = fetch_20newsgroups(shuffle=True, random_state=1,\n",
    "                             remove=('headers', 'footers', 'quotes'))\n",
    "\n",
    "data_train = dataset.data[:n_samples]\n",
    "data_test = dataset.data[n_samples:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use tf-idf features for NMF.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting tf-idf features for NMF...\n",
      "CPU times: user 2.42 s, sys: 8.18 ms, total: 2.43 s\n",
      "Wall time: 2.43 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"Extracting tf-idf features for NMF...\")\n",
    "tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,\n",
    "                                   max_features=n_features,\n",
    "                                   stop_words='english')\n",
    "\n",
    "tfidf_train = tfidf_vectorizer.fit_transform(data_train)\n",
    "tfidf_test = tfidf_vectorizer.transform(data_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### NMF model with Frobenius loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=8000 and n_features=1000...\n",
      "CPU times: user 1.39 s, sys: 40.4 ms, total: 1.43 s\n",
      "Wall time: 828 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"Fitting the NMF model (Frobenius norm) with tf-idf features, \"\n",
    "      \"n_samples=%d and n_features=%d...\"\n",
    "      % (n_samples, n_features))\n",
    "frobenius_nmf = NMF(n_components=n_components, random_state=1,\n",
    "                    alpha=.1, l1_ratio=.5).fit(tfidf_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train reconstruction error: 0.890941957403\n",
      "test reconstruction error: 0.892431321223\n"
     ]
    }
   ],
   "source": [
    "print('train reconstruction error:', score_model(frobenius_nmf, tfidf_train))\n",
    "print('test reconstruction error:', 
score_model(frobenius_nmf, tfidf_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic #0: \n",
      "just don people think like know good time right ve make say did way really want going said ll thing\n",
      "Topic #1: \n",
      "card video monitor drivers cards vga bus driver color ram graphics mode bit board memory pc 16 speed performance controller\n",
      "Topic #2: \n",
      "god jesus bible christ faith believe christians christian church sin lord does life man hell truth belief say love father\n",
      "Topic #3: \n",
      "key chip clipper encryption keys government escrow use algorithm public nsa security phone secure law chips des data bit enforcement\n",
      "Topic #4: \n",
      "new 00 car sale 10 price shipping offer 50 20 15 condition 12 interested 11 used 30 25 sell old\n",
      "Topic #5: \n",
      "thanks does know mail advance hi info looking help anybody address appreciated email information post interested reply send like need\n",
      "Topic #6: \n",
      "windows file use dos files program using window problem running run version pc server application screen software ms ftp help\n",
      "Topic #7: \n",
      "edu soon cs university com internet ftp article pub send email mit david mail address ibm apr reply available export\n",
      "Topic #8: \n",
      "game team games year season play players hockey win league player nhl teams best played runs better hit think good\n",
      "Topic #9: \n",
      "drive scsi hard drives disk ide floppy controller mac cd power rom internal mb cable problem tape bus computer format\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;\n",
    "# use get_feature_names_out() when the installed version provides it.\n",
    "if hasattr(tfidf_vectorizer, 'get_feature_names_out'):\n",
    "    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\n",
    "else:\n",
    "    tfidf_feature_names = tfidf_vectorizer.get_feature_names()\n",
    "print_top_words(frobenius_nmf, tfidf_feature_names, n_top_words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### NMF model with KL-divergence loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=8000 and n_features=1000...\n",
      "CPU times: user 12.1 s, sys: 380 ms, total: 12.5 s\n",
      "Wall time: 6.25 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"Fitting the NMF model (generalized Kullback-Leibler divergence) with \"\n",
    "      \"tf-idf features, n_samples=%d and n_features=%d...\"\n",
    "      % (n_samples, n_features))\n",
    "kl_nmf = NMF(n_components=n_components, random_state=1,\n",
    "             beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,\n",
    "             l1_ratio=0.9).fit(tfidf_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train reconstruction error: 18.355714861\n",
      "test reconstruction error: 18.2931233004\n"
     ]
    }
   ],
   "source": [
    "print('train reconstruction error:', score_model(kl_nmf, tfidf_train))\n",
    "print('test reconstruction error:', score_model(kl_nmf, tfidf_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic #0: \n",
      "time like way right really did years good said make just think don long thing going new say want know\n",
      "Topic #1: \n",
      "use thanks need used using software work help does card hi drive video pc mac computer problem new like speed\n",
      "Topic #2: \n",
      "god question does say people believe true read word jesus says point religion bible life christian claim christians mean faith\n",
      "Topic #3: \n",
      "use government people public make state law used key number fact chip using rights note case legal war keys large\n",
      "Topic #4: \n",
      "new sale 10 year 20 15 shipping offer 12 50 following 16 1993 11 price years 30 00 condition 25\n",
      "Topic #5: \n",
      "thanks know mail post does information looking like com send interested email list address info reply net 
group advance help\n",
      "Topic #6: \n",
      "windows program file problem using run use version running files like window sun ftp try look available code image server\n",
      "Topic #7: \n",
      "just edu like don want try ve soon thing think things stuff sure oh case car deleted tell people bike\n",
      "Topic #8: \n",
      "good just does team ve game ll doesn better sure heard probably really thought got season mean isn play way\n",
      "Topic #9: \n",
      "think don know people year make win world let second won wouldn did actually mr come drive local hard said\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;\n",
    "# use get_feature_names_out() when the installed version provides it.\n",
    "if hasattr(tfidf_vectorizer, 'get_feature_names_out'):\n",
    "    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\n",
    "else:\n",
    "    tfidf_feature_names = tfidf_vectorizer.get_feature_names()\n",
    "print_top_words(kl_nmf, tfidf_feature_names, n_top_words)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nnets",
   "language": "python",
   "name": "nnets"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}