{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "l5VG6wN2VtMg" }, "source": [ "# Topic Models on WikiHow\n", "\n", "[WikiHow dataset page](https://github.com/mahnazkoupaee/WikiHow-Dataset)\n", "\n", "[Automatic Evaluation of Topic Coherence](https://www.aclweb.org/anthology/N10-1012)\n", "\n", "[Evaluation of Topic Modeling: Topic Coherence](https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/)\n", "\n", "[Topic Coherence in gensim](https://radimrehurek.com/gensim/models/coherencemodel.html)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "id": "HjhKY8usU1-c", "outputId": "689c5997-8d83-4d93-d895-ae35b99b20ec" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File ‘wikihowAll.csv’ already there; not retrieving.\r\n" ] } ], "source": [ "!wget -nc -O wikihowAll.csv https://query.data.world/s/lult233wfonljfadtexn2t5x5rb7is" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 527 }, "colab_type": "code", "id": "ziECGnpBVsd_", "outputId": "a8d6235b-7220-407d-e147-887b14eaef6d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting git+https://github.com/lambdaofgod/mlutil\n", " Cloning https://github.com/lambdaofgod/mlutil to /tmp/pip-0itfu2mr-build\n", " Requirement already satisfied (use --upgrade to upgrade): mlutil==0.11 from git+https://github.com/lambdaofgod/mlutil in /home/kuba/Projects/mlutil\n", "Requirement already satisfied: gensim in /home/kuba/anaconda3/lib/python3.6/site-packages (from mlutil==0.11)\n", "Requirement already satisfied: nltk in /home/kuba/anaconda3/lib/python3.6/site-packages (from mlutil==0.11)\n", "Requirement already satisfied: pandas in /home/kuba/anaconda3/lib/python3.6/site-packages (from mlutil==0.11)\n", "Requirement already satisfied: numpy in 
/home/kuba/anaconda3/lib/python3.6/site-packages (from mlutil==0.11)\n", "Requirement already satisfied: tqdm in /home/kuba/anaconda3/lib/python3.6/site-packages (from mlutil==0.11)\n", "Requirement already satisfied: scipy>=0.18.1 in /home/kuba/anaconda3/lib/python3.6/site-packages (from gensim->mlutil==0.11)\n", "Requirement already satisfied: smart-open>=1.7.0 in /home/kuba/anaconda3/lib/python3.6/site-packages (from gensim->mlutil==0.11)\n", "Requirement already satisfied: six>=1.5.0 in /home/kuba/anaconda3/lib/python3.6/site-packages (from gensim->mlutil==0.11)\n", "Requirement already satisfied: singledispatch in /home/kuba/anaconda3/lib/python3.6/site-packages (from nltk->mlutil==0.11)\n", "Requirement already satisfied: python-dateutil>=2.5.0 in /home/kuba/anaconda3/lib/python3.6/site-packages (from pandas->mlutil==0.11)\n", "Requirement already satisfied: pytz>=2011k in /home/kuba/anaconda3/lib/python3.6/site-packages (from pandas->mlutil==0.11)\n", "Requirement already satisfied: boto>=2.32 in /home/kuba/anaconda3/lib/python3.6/site-packages (from smart-open>=1.7.0->gensim->mlutil==0.11)\n", "Requirement already satisfied: boto3 in /home/kuba/anaconda3/lib/python3.6/site-packages (from smart-open>=1.7.0->gensim->mlutil==0.11)\n", "Requirement already satisfied: requests in /home/kuba/anaconda3/lib/python3.6/site-packages (from smart-open>=1.7.0->gensim->mlutil==0.11)\n", "Requirement already satisfied: bz2file in /home/kuba/anaconda3/lib/python3.6/site-packages (from smart-open>=1.7.0->gensim->mlutil==0.11)\n", "Requirement already satisfied: botocore<1.13.0,>=1.12.130 in /home/kuba/anaconda3/lib/python3.6/site-packages (from boto3->smart-open>=1.7.0->gensim->mlutil==0.11)\n", "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/kuba/anaconda3/lib/python3.6/site-packages (from boto3->smart-open>=1.7.0->gensim->mlutil==0.11)\n", "Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in /home/kuba/anaconda3/lib/python3.6/site-packages (from 
boto3->smart-open>=1.7.0->gensim->mlutil==0.11)\n", "Requirement already satisfied: idna<2.9,>=2.5 in /home/kuba/anaconda3/lib/python3.6/site-packages (from requests->smart-open>=1.7.0->gensim->mlutil==0.11)\n", "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/kuba/anaconda3/lib/python3.6/site-packages (from requests->smart-open>=1.7.0->gensim->mlutil==0.11)\n", "Requirement already satisfied: certifi>=2017.4.17 in /home/kuba/anaconda3/lib/python3.6/site-packages (from requests->smart-open>=1.7.0->gensim->mlutil==0.11)\n", "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /home/kuba/anaconda3/lib/python3.6/site-packages (from requests->smart-open>=1.7.0->gensim->mlutil==0.11)\n", "Requirement already satisfied: docutils>=0.10 in /home/kuba/anaconda3/lib/python3.6/site-packages (from botocore<1.13.0,>=1.12.130->boto3->smart-open>=1.7.0->gensim->mlutil==0.11)\n", "\u001b[33mYou are using pip version 9.0.1, however version 19.1.1 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", "Requirement already satisfied: tqdm in /home/kuba/anaconda3/lib/python3.6/site-packages\n", "\u001b[33mYou are using pip version 9.0.1, however version 19.1.1 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip install git+https://github.com/lambdaofgod/mlutil\n", "!pip install tqdm" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 85 }, "colab_type": "code", "id": "JOsF_0DNFeoa", "outputId": "52b48eda-15a0-4eda-b141-0c040b36244e" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "paramiko missing, opening SSH/SCP/SFTP paths will be disabled. 
`pip install paramiko` to suppress\n", "[nltk_data] Downloading package wordnet to /home/kuba/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package wordnet_ic to /home/kuba/nltk_data...\n", "[nltk_data] Package wordnet_ic is already up-to-date!\n" ] } ], "source": [ "from __future__ import print_function\n", "from time import time\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import seaborn as sns\n", "\n", "import tqdm\n", "\n", "\n", "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", "from sklearn.decomposition import NMF, LatentDirichletAllocation\n", "from sklearn.datasets import fetch_20newsgroups\n", "from gensim.models.coherencemodel import CoherenceModel\n", "from gensim.corpora import Dictionary\n", "\n", "from IPython.display import display, Image\n", "\n", "import nltk\n", "nltk.download('wordnet')\n", "nltk.download('wordnet_ic')\n", "\n", "import mlutil\n", "from mlutil.textmining import get_wordnet_similarity\n", "\n", "\n", "import pyLDAvis\n", "import pyLDAvis.sklearn" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "pyLDAvis.enable_notebook()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": {}, "colab_type": "code", "id": "BHG0igq8LsU8" }, "outputs": [], "source": [ "def plot_correlations(m):\n", "    \"\"\"Plot a heatmap of the pairwise cosine similarity between the rows of `m`.\n", "\n", "    m: dense 2-D array-like whose rows are the vectors to compare.\n", "    Draws on the current matplotlib axes via seaborn; returns None.\n", "    \"\"\"\n", "    m = np.asarray(m, dtype=float)\n", "    row_norms = np.linalg.norm(m, axis=1)\n", "    # Guard all-zero rows against division by zero; their similarities stay 0.\n", "    row_norms[row_norms == 0] = 1.0\n", "    # Divide by both row norms so the matrix is symmetric (the previous\n", "    # version divided by a single squared norm, giving an asymmetric result).\n", "    m_corr = (m @ m.T) / np.outer(row_norms, row_norms)\n", "    # Plot the similarity matrix itself; the raw input used to be plotted here.\n", "    sns.heatmap(m_corr)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": {}, "colab_type": "code", "id": "65yTXA4380wu" }, "outputs": [], "source": [ "n_features = 5000\n", "n_components = 10\n", "n_top_words = 10" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "bENQvODw8tUx" }, "source": [ "## Loading WikiHow" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 51 }, "colab_type": "code", "id": "elY43Y65J3MZ", "outputId":
"d09640a1-3d01-4ccd-c2da-be90b253a5bb" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "wikihow size (215365, 3)\n", "valid wikihow size (removed empty text) (214294, 3)\n" ] } ], "source": [ "wikihow_df = pd.read_csv('wikihowAll.csv')\n", "print('wikihow size', wikihow_df.shape)\n", "wikihow_df = wikihow_df[~wikihow_df['text'].isna()]\n", "print('valid wikihow size (removed empty text)', wikihow_df.shape)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": {}, "colab_type": "code", "id": "5CegvpLPJ54Y" }, "outputs": [], "source": [ "data_samples = wikihow_df['text']\n", "n_samples = len(data_samples)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "id": "ml8rOQt275bU", "outputId": "8fe701d3-bfa7-45b0-9db9-ac6e55deb09d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Extracting tf-idf features for NMF...\n", "done in 92.840s.\n", "Extracting tf features for LDA...\n", "done in 90.236s.\n", "\n" ] } ], "source": [ "# Use tf-idf features for NMF.\n", "print(\"Extracting tf-idf features for NMF...\")\n", "tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=5,\n", " max_features=n_features,\n", " stop_words='english')\n", "t0 = time()\n", "tfidf = tfidf_vectorizer.fit_transform(data_samples)\n", "print(\"done in %0.3fs.\" % (time() - t0))\n", "\n", "# Use tf (raw term count) features for LDA.\n", "print(\"Extracting tf features for LDA...\")\n", "tf_vectorizer = CountVectorizer(max_df=0.95, min_df=5,\n", " max_features=n_features,\n", " stop_words='english')\n", "t0 = time()\n", "tf = tf_vectorizer.fit_transform(data_samples)\n", "print(\"done in %0.3fs.\" % (time() - t0))\n", "print()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": {}, "colab_type": "code", "id": "ASxsYy4K761m" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting the NMF 
model (Frobenius norm) with tf-idf features n_samples=214294 and n_features=5000...\n", "done in 193.698s.\n", "\n", "Topics in NMF model (Frobenius norm):\n" ] } ], "source": [ "# Fit the NMF model\n", "print(\"Fitting the NMF model (Frobenius norm) with tf-idf features \"\n", " \"n_samples=%d and n_features=%d...\"\n", " % (n_samples, n_features))\n", "t0 = time()\n", "nmf = NMF(n_components=n_components, random_state=1,\n", " alpha=.1, l1_ratio=.5).fit(tfidf)\n", "print(\"done in %0.3fs.\" % (time() - t0))\n", "\n", "print(\"\\nTopics in NMF model (Frobenius norm):\")\n", "tfidf_feature_names = tfidf_vectorizer.get_feature_names()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": {}, "colab_type": "code", "id": "FsAF4cSQTwi9" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
topic_0peopledonpersonlikefeeltimemakethingssayknow
topic_1addwatermixtureminutesoilheatstirbowlpanmix
topic_2clickscreenbuttonselectmenutapappiconfilepage
topic_3hairshampoocombdryconditionerlookskinscalpbrushoil
topic_4dogdogsvetpetpuppyfoodtrainingleashtreatbreed
topic_5skindoctorbodyhelpbloodfoodspainsymptomsdayexercise
topic_6usemakewaterpaperpaintcutsurecolorplaceglue
topic_7businessinformationneedstatecompanycardnumbercreditmoneyonline
topic_8catcatsvetfoodpetlitterveterinariantoyskittenbox
topic_9childchildrenkidsparentsparentbabyschoolhelpbehaviortoddler
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 \\\n", "topic_0 people don person like feel time \n", "topic_1 add water mixture minutes oil heat \n", "topic_2 click screen button select menu tap \n", "topic_3 hair shampoo comb dry conditioner look \n", "topic_4 dog dogs vet pet puppy food \n", "topic_5 skin doctor body help blood foods \n", "topic_6 use make water paper paint cut \n", "topic_7 business information need state company card \n", "topic_8 cat cats vet food pet litter \n", "topic_9 child children kids parents parent baby \n", "\n", " 6 7 8 9 \n", "topic_0 make things say know \n", "topic_1 stir bowl pan mix \n", "topic_2 app icon file page \n", "topic_3 skin scalp brush oil \n", "topic_4 training leash treat breed \n", "topic_5 pain symptoms day exercise \n", "topic_6 sure color place glue \n", "topic_7 number credit money online \n", "topic_8 veterinarian toys kitten box \n", "topic_9 school help behavior toddler " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "nmf_keywords_per_topic = mlutil.topic_modeling.top_topic_words(nmf, tfidf_feature_names, 100)\n", "display(nmf_keywords_per_topic.iloc[:,:10])" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "aNZCvvimNZ7V" }, "source": [ "## Topic coherence\n", "\n", "In the following we use average Resnik similarity of words from top topic keywords.\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": {}, "colab_type": "code", "id": "I1Hmx0WromYW" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 10/10 [03:01<00:00, 18.13s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "NMF-based topic model mean coherence: 0 1.037505\n", "1 1.003931\n", "2 0.958536\n", "3 1.273350\n", "4 1.448881\n", "5 0.864943\n", "6 1.378219\n", "7 0.715856\n", "8 1.833986\n", "9 1.831830\n", "dtype: float64\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "nmf_mean_coherence = 
mlutil.topic_modeling.get_topic_coherences(nmf_keywords_per_topic)\n", "print('NMF-based topic model mean coherence:', nmf_mean_coherence)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": {}, "colab_type": "code", "id": "b3NgQZx278MS" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=214294 and n_features=5000...\n", "done in 1760.113s.\n" ] } ], "source": [ "# Fit the KL divergence NMF model\n", "print(\"Fitting the NMF model (generalized Kullback-Leibler divergence) with \"\n", " \"tf-idf features, n_samples=%d and n_features=%d...\"\n", " % (n_samples, n_features))\n", "t0 = time()\n", "kl_nmf = NMF(n_components=n_components, random_state=1,\n", " beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,\n", " l1_ratio=.5).fit(tfidf)\n", "print(\"done in %0.3fs.\" % (time() - t0))\n", "\n", "tfidf_feature_names = tfidf_vectorizer.get_feature_names()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "J7hhVdwSwbo8" }, "source": [ "### Topics in NMF model (generalized Kullback-Leibler divergence)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": {}, "colab_type": "code", "id": "m9_bJBnPuVfE" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
topic_0timetrymakelikewaywantdonpeoplehelpjust
topic_1wateruseusingremovesuremakewarmdrysmallminutes
topic_2clickselectscreenrightopenusewanttypemenuwindow
topic_3lookwearhairliketrywantdonjuststylemake
topic_4petneeddogsurepossiblepreventsafeprovidelikelyvet
topic_5helpweightincludereducedoctorbodyhealthtreatmentneedincrease
topic_6useneedworkwaysuremakewantrightstartusing
topic_7useinformationonlinenumberwebsiteneedyearexampleprovidework
topic_8stirminutesmixaddmixtureservesugarplacesalttime
topic_9usemakesureplacesmallwantusingpapercutshape
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "topic_0 time try make like way want don \n", "topic_1 water use using remove sure make warm \n", "topic_2 click select screen right open use want \n", "topic_3 look wear hair like try want don \n", "topic_4 pet need dog sure possible prevent safe \n", "topic_5 help weight include reduce doctor body health \n", "topic_6 use need work way sure make want \n", "topic_7 use information online number website need year \n", "topic_8 stir minutes mix add mixture serve sugar \n", "topic_9 use make sure place small want using \n", "\n", " 7 8 9 \n", "topic_0 people help just \n", "topic_1 dry small minutes \n", "topic_2 type menu window \n", "topic_3 just style make \n", "topic_4 provide likely vet \n", "topic_5 treatment need increase \n", "topic_6 right start using \n", "topic_7 example provide work \n", "topic_8 place salt time \n", "topic_9 paper cut shape " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "kl_nmf_keywords_per_topic = mlutil.topic_modeling.top_topic_words(kl_nmf, tfidf_feature_names, 100)\n", "display(kl_nmf_keywords_per_topic.iloc[:,:10])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": {}, "colab_type": "code", "id": "fddvz8GJufYF" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 10/10 [02:57<00:00, 19.10s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "KL-NMF-based topic model mean coherence: 0 1.218143\n", "1 0.635964\n", "2 0.715969\n", "3 0.829468\n", "4 0.707291\n", "5 0.734174\n", "6 1.303825\n", "7 0.778983\n", "8 1.028046\n", "9 0.870015\n", "dtype: float64\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "kl_nmf_mean_coherence = mlutil.topic_modeling.get_topic_coherences(kl_nmf_keywords_per_topic)\n", "print('KL-NMF-based topic model mean coherence:', kl_nmf_mean_coherence)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": {}, 
"colab_type": "code", "id": "dFd0ANqVVA82" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting LDA models with tf features, n_samples=214294 and n_features=5000...\n", "done in 840.556s.\n", "\n", "Topics in LDA model:\n" ] } ], "source": [ "print(\"Fitting LDA models with tf features, \"\n", " \"n_samples=%d and n_features=%d...\"\n", " % (n_samples, n_features))\n", "lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,\n", " learning_method='online',\n", " learning_offset=50.,\n", " random_state=0,\n", " n_jobs=-1)\n", "t0 = time()\n", "lda.fit(tf)\n", "print(\"done in %0.3fs.\" % (time() - t0))\n", "\n", "print(\"\\nTopics in LDA model:\")\n", "tf_feature_names = tf_vectorizer.get_feature_names()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": {}, "colab_type": "code", "id": "wpImYZjYuZzN" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
topic_0foodfoodsbloodskineathelplikedoctordaymeat
topic_1usemakecutplaceendneedrighthandsurepaper
topic_2timemakeworkneedgoodlikewanthelpllpeople
topic_3helpchildtimefeeltrybodychildrenexercisedaysleep
topic_4clickbuttonscreenselectrightopencomputerusetapwindow
topic_5donmakepeoplelikepersonwanttimejustknowtry
topic_6wateruseadddryremovemakeoilplaceminutesclean
topic_7informationneedbusinessstatenumberfileexampleusecourtcredit
topic_8paintlookcolorhairmakeuselikewantcolorswear
topic_9dogcatwatermakeneedsuresoilplantplantshome
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "topic_0 food foods blood skin eat help like \n", "topic_1 use make cut place end need right \n", "topic_2 time make work need good like want \n", "topic_3 help child time feel try body children \n", "topic_4 click button screen select right open computer \n", "topic_5 don make people like person want time \n", "topic_6 water use add dry remove make oil \n", "topic_7 information need business state number file example \n", "topic_8 paint look color hair make use like \n", "topic_9 dog cat water make need sure soil \n", "\n", " 7 8 9 \n", "topic_0 doctor day meat \n", "topic_1 hand sure paper \n", "topic_2 help ll people \n", "topic_3 exercise day sleep \n", "topic_4 use tap window \n", "topic_5 just know try \n", "topic_6 place minutes clean \n", "topic_7 use court credit \n", "topic_8 want colors wear \n", "topic_9 plant plants home " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "lda_keywords_per_topic = mlutil.topic_modeling.top_topic_words(lda, tf_feature_names, 100)\n", "display(lda_keywords_per_topic.iloc[:,:10])" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "0cveKWLGHtWE" }, "source": [ "Warning: the results of LDA may be a bit misleading - I don't know whether getting topic keywords from LDA uses the same mechanism as in NMF (which will correspond to tf-idf features, instead of tf ones)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "colab": {}, "colab_type": "code", "id": "rI57DEEtulMs" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 10/10 [02:57<00:00, 18.68s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "LDA-based topic model mean coherence: 0 0.963005\n", "1 0.679626\n", "2 1.135926\n", "3 0.927911\n", "4 1.046612\n", "5 1.159201\n", "6 0.965963\n", "7 0.917089\n", "8 1.009249\n", "9 0.886062\n", "dtype: float64\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } 
], "source": [ "lda_mean_coherence = mlutil.topic_modeling.get_topic_coherences(lda_keywords_per_topic)\n", "print('LDA-based topic model mean coherence:', lda_mean_coherence)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/kuba/anaconda3/lib/python3.6/site-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", "of pandas will change to not sort by default.\n", "\n", "To accept the future behavior, pass 'sort=False'.\n", "\n", "To retain the current behavior and silence the warning, pass 'sort=True'.\n", "\n", " return pd.concat([default_term_info] + list(topic_dfs))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1min 45s, sys: 1.17 s, total: 1min 46s\n", "Wall time: 7min 53s\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "PreparedData(topic_coordinates= x y topics cluster Freq\n", "topic \n", "5 0.137085 -0.060702 1 1 16.656680\n", "7 0.133621 -0.115327 2 1 13.179283\n", "2 0.119489 -0.061512 3 1 13.048088\n", "6 -0.212601 0.148062 4 1 10.177278\n", "1 -0.179571 -0.019475 5 1 9.939044\n", "3 0.194816 0.068203 6 1 9.018379\n", "0 0.064451 0.203676 7 1 8.087163\n", "9 -0.012667 0.085813 8 1 7.772587\n", "8 -0.109913 0.009175 9 1 7.020371\n", "4 -0.134710 -0.257913 10 1 5.101128, topic_info= Category Freq Term Total loglift logprob\n", "4864 Default 136587.000000 water 136587.000000 30.0000 30.0000\n", "1379 Default 69325.000000 dog 69325.000000 29.0000 29.0000\n", "780 Default 69285.000000 child 69285.000000 28.0000 28.0000\n", "837 Default 54795.000000 click 54795.000000 27.0000 27.0000\n", "3084 Default 48059.000000 paint 48059.000000 26.0000 26.0000\n", "887 Default 49456.000000 color 49456.000000 25.0000 25.0000\n", "4742 Default 282876.000000 use 282876.000000 24.0000 24.0000\n", "2043 Default 40923.000000 hair 40923.000000 23.0000 23.0000\n", "2114 Default 188787.000000 help 188787.000000 22.0000 22.0000\n", "703 Default 39161.000000 cat 39161.000000 21.0000 21.0000\n", "3190 Default 91680.000000 person 91680.000000 20.0000 20.0000\n", "2626 Default 114449.000000 look 114449.000000 19.0000 19.0000\n", "3167 Default 146715.000000 people 146715.000000 18.0000 18.0000\n", "625 Default 31945.000000 button 31945.000000 17.0000 17.0000\n", "3738 Default 91550.000000 right 91550.000000 16.0000 16.0000\n", "3857 Default 29769.000000 screen 29769.000000 15.0000 15.0000\n", "1826 Default 60212.000000 food 60212.000000 14.0000 14.0000\n", "1716 Default 97040.000000 feel 97040.000000 13.0000 13.0000\n", "105 Default 87088.000000 add 87088.000000 12.0000 12.0000\n", "3893 Default 30752.000000 select 30752.000000 11.0000 11.0000\n", "1440 Default 52365.000000 dry 52365.000000 10.0000 10.0000\n", "3006 Default 56410.000000 open 56410.000000 9.0000 9.0000\n", "3827 
Default 53531.000000 say 53531.000000 8.0000 8.0000\n", "2284 Default 60400.000000 information 60400.000000 7.0000 7.0000\n", "2522 Default 44743.000000 left 44743.000000 6.0000 6.0000\n", "949 Default 26321.000000 computer 26321.000000 5.0000 5.0000\n", "620 Default 47556.000000 business 47556.000000 4.0000 4.0000\n", "2993 Default 34422.000000 oil 34422.000000 3.0000 3.0000\n", "2565 Default 234490.000000 like 234490.000000 2.0000 2.0000\n", "511 Default 62515.000000 body 62515.000000 1.0000 1.0000\n", "... ... ... ... ... ... ...\n", "949 Topic10 21904.895492 computer 26321.146308 2.7920 -4.4461\n", "2819 Topic10 4640.664203 mode 5019.647394 2.8972 -5.9980\n", "4919 Topic10 14793.092736 windows 17741.756536 2.7939 -4.8387\n", "634 Topic10 5837.950654 cable 6721.626086 2.8348 -5.7685\n", "1439 Topic10 4329.907458 drum 4853.260443 2.8616 -6.0673\n", "2308 Topic10 8403.101385 install 10428.312859 2.7598 -5.4042\n", "1544 Topic10 15867.820236 enter 22118.256803 2.6436 -4.7686\n", "4800 Topic10 12237.155888 video 16880.582024 2.6540 -5.0284\n", "1429 Topic10 12525.577259 drive 18308.984339 2.5961 -5.0051\n", "545 Topic10 16939.093627 box 29693.154469 2.4144 -4.7032\n", "4500 Topic10 10906.641037 text 16527.879035 2.5600 -5.1435\n", "1053 Topic10 12251.782211 corner 19430.418228 2.5145 -5.0272\n", "656 Topic10 8776.785922 camera 12443.842069 2.6266 -5.3607\n", "3006 Topic10 24708.299273 open 56410.972239 2.1502 -4.3257\n", "3371 Topic10 12343.841864 press 21721.069980 2.4106 -5.0197\n", "3738 Topic10 26417.684683 right 91550.261958 1.7329 -4.2588\n", "2522 Topic10 17062.242355 left 44743.730699 2.0116 -4.6960\n", "720 Topic10 6961.576190 cell 9677.244748 2.6463 -5.5924\n", "235 Topic10 10545.066259 appear 20840.762076 2.2945 -5.1772\n", "3341 Topic10 10458.814523 power 21120.381708 2.2729 -5.1854\n", "387 Topic10 9106.326979 bar 16904.332871 2.3571 -5.3239\n", "3022 Topic10 11883.300916 option 29720.981163 2.0590 -5.0577\n", "4687 Topic10 12979.255808 type 
48434.656701 1.6588 -4.9695\n", "2605 Topic10 16571.237107 ll 121520.830557 0.9833 -4.7252\n", "2920 Topic10 15039.093272 new 91734.008356 1.1675 -4.8222\n", "4742 Topic10 19940.886703 use 282876.990389 0.3235 -4.5401\n", "4844 Topic10 15987.392525 want 218934.177193 0.3587 -4.7610\n", "4749 Topic10 13316.107809 using 104694.435603 0.9136 -4.9439\n", "105 Topic10 11996.430159 add 87088.277880 0.9934 -5.0482\n", "4214 Topic10 11525.248189 start 85722.193839 0.9691 -5.0883\n", "\n", "[804 rows x 6 columns], token_table= Topic Freq Term\n", "term \n", "57 1 0.091501 abuse\n", "57 2 0.035637 abuse\n", "57 6 0.872820 abuse\n", "59 1 0.012399 academic\n", "59 2 0.001078 academic\n", "59 3 0.985954 academic\n", "76 1 0.009751 account\n", "76 2 0.810076 account\n", "76 3 0.005293 account\n", "76 5 0.002598 account\n", "76 6 0.000481 account\n", "76 7 0.003272 account\n", "76 8 0.002085 account\n", "76 9 0.002310 account\n", "76 10 0.164107 account\n", "89 4 0.999611 acrylic\n", "98 1 0.104354 activity\n", "98 2 0.046703 activity\n", "98 3 0.092412 activity\n", "98 6 0.650294 activity\n", "98 7 0.081464 activity\n", "98 8 0.022037 activity\n", "98 10 0.002701 activity\n", "104 10 0.999364 adapter\n", "105 1 0.016845 add\n", "105 2 0.063384 add\n", "105 3 0.029189 add\n", "105 4 0.465918 add\n", "105 5 0.059664 add\n", "105 7 0.036044 add\n", "... ... ... 
...\n", "4943 2 0.069672 words\n", "4943 3 0.181047 words\n", "4943 5 0.019368 words\n", "4943 6 0.000381 words\n", "4943 7 0.005679 words\n", "4943 9 0.006526 words\n", "4943 10 0.018266 words\n", "4944 1 0.154814 work\n", "4944 2 0.065450 work\n", "4944 3 0.362134 work\n", "4944 4 0.075625 work\n", "4944 5 0.104387 work\n", "4944 6 0.139726 work\n", "4944 7 0.011165 work\n", "4944 8 0.011279 work\n", "4944 9 0.074691 work\n", "4944 10 0.000723 work\n", "4949 3 0.007568 workout\n", "4949 5 0.012298 workout\n", "4949 6 0.889240 workout\n", "4949 7 0.090816 workout\n", "4977 2 0.999886 www\n", "4982 5 0.995305 yarn\n", "4982 9 0.004550 yarn\n", "4989 3 0.005188 yoga\n", "4989 5 0.011791 yoga\n", "4989 6 0.982903 yoga\n", "4994 3 0.985873 youtube\n", "4994 9 0.014014 youtube\n", "4999 10 0.999346 zoom\n", "\n", "[2393 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[6, 8, 3, 7, 2, 4, 1, 10, 9, 5])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "WikiHow Topic Modeling.ipynb", "provenance": [], "version": "0.3.2" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 1 }