# Wikipedia training

In this tutorial we will:
 - Learn how to train the NMF topic model on the English Wikipedia corpus
 - Compare it with the LDA model
 - Evaluate results

In [1]:
%load_ext autoreload
%autoreload 2

import itertools
import json
import logging
import numpy as np
import pandas as pd
import scipy.sparse
import smart_open
import time
from tqdm import tqdm, tqdm_notebook

import gensim.downloader as api
from gensim import matutils
from gensim.corpora import MmCorpus, Dictionary
from gensim.models import LdaModel, CoherenceModel
from gensim.models.nmf import Nmf
from gensim.parsing.preprocessing import preprocess_string

# Send INFO-level (and above) log records to the notebook output with a timestamp prefix.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# Preprocessing

### Load wikipedia dump
Let's use `gensim.downloader` (imported above as `api`) for that.

In [2]:
# Load the "wiki-english-20171001" dataset through gensim's downloader and
# take the first article it yields.
data = api.load("wiki-english-20171001")
article = next(iter(data))

# Preview the article: each section title with the first 100 characters of its text.
for section_title, section_text in zip(article['section_titles'], article['section_texts']):
    print(
        "Section title: %s" % section_title,
        "Section text: %s" % section_text[:100],
        sep="\n",
    )

Section title: Introduction
Section text: 




'''Anarchism''' is a political philosophy that advocates self-governed societies based on volun
Section title: Etymology and terminology
Section text: 

The word ''anarchism'' is composed from the word ''anarchy'' and the suffix ''-ism'', themselves d
Section title: History
Section text: 

===Origins===
Woodcut from a Diggers document by William Everard

The earliest anarchist themes ca
Section title: Anarchist schools of thought
Section text: 
Portrait of philosopher Pierre-Joseph Proudhon (1809–1865) by Gustave Courbet. Proudhon was the pri
Section title: Internal issues and debates
Section text: 
consistent with anarchist values is a controversial subject among anarchists.

Anarchism is a philo
Section title: Topics of interest
Section text: Intersecting and overlapping between various schools of thought, certain topics of interest and inte
Section title: Criticisms
Section text: 
Criticisms of anarchism include moral criticisms and pra

Preprocess and save articles

In [3]:
def save_preprocessed_articles(filename, articles):
    """Preprocess every article and write it to `filename` as JSON lines.

    For each article, the section titles and section texts are joined into one
    string, tokenized with `preprocess_string`, and the resulting token list
    is written as a single JSON document on its own line.

    Parameters
    ----------
    filename : str
        Path of the output file. An existing file is truncated.
    articles : iterable of dict
        Articles providing `'section_titles'` and `'section_texts'`
        sequences of strings.
    """
    # `smart_open` is imported as a module, so the file must be opened through
    # `smart_open.open`; calling the module object itself raises TypeError.
    # The file is only written to, hence plain 'w' rather than 'w+'.
    with smart_open.open(filename, 'w', encoding="utf8") as writer:
        for article in tqdm_notebook(articles):
            # Interleave "title text" pairs so titles contribute tokens too.
            article_text = " ".join(
                " ".join(section)
                for section in zip(
                    article['section_titles'],
                    article['section_texts']
                )
            )
            tokens = preprocess_string(article_text)

            writer.write(json.dumps(tokens) + '\n')


def get_preprocessed_articles(filename):
    """Lazily yield the articles stored in a JSON-lines file.

    Parameters
    ----------
    filename : str
        Path of a file holding one JSON document per line.

    Yields
    ------
    object
        The decoded JSON value of each line, in file order.
    """
    # `smart_open` is imported as a module, so the file must be opened through
    # `smart_open.open`; calling the module object itself raises TypeError.
    with smart_open.open(filename, 'r', encoding="utf8") as reader:
        for line in tqdm_notebook(reader):
            yield json.loads(line)

In [4]:
# Set to True to (re)build 'wiki_articles.jsonlines' from the loaded dump.
# While it is False this cell does nothing.
SAVE_ARTICLES = False

if SAVE_ARTICLES:
    save_preprocessed_articles('wiki_articles.jsonlines', data)

### Create and save dictionary

In [5]:
# Set to True to (re)build the dictionary from the preprocessed articles and
# store it as 'wiki.dict'. While it is False this cell does nothing.
SAVE_DICTIONARY = False

if SAVE_DICTIONARY:
    dictionary = Dictionary(get_preprocessed_articles('wiki_articles.jsonlines'))
    dictionary.save('wiki.dict')

### Load and filter dictionary

In [6]:
# Load the saved dictionary and prune it with filter_extremes() defaults
# (the log below reports which tokens are discarded and which are kept).
dictionary = Dictionary.load('wiki.dict')
dictionary.filter_extremes()
dictionary.compactify()

2019-01-15 19:31:03,151 : INFO : loading Dictionary object from wiki.dict
2019-01-15 19:31:04,024 : INFO : loaded wiki.dict
2019-01-15 19:31:06,292 : INFO : discarding 1910258 tokens: [('abdelrahim', 49), ('abstention', 120), ('anarcha', 101), ('anarchica', 40), ('anarchosyndicalist', 20), ('antimilitar', 68), ('arbet', 194), ('archo', 100), ('arkhē', 5), ('autonomedia', 118)]...
2019-01-15 19:31:06,293 : INFO : keeping 100000 tokens which were in no less than 5 and no more than 2462447 (=50.0%) documents
2019-01-15 19:31:06,645 : INFO : resulting dictionary: Dictionary(100000 unique tokens: ['abandon', 'abil', 'abl', 'abolit', 'abstent']...)


### MmCorpus wrapper
In this way we'll:

- Make sure that documents are shuffled
- Be able to train-test split corpus without rewriting it

In [7]:
class RandomCorpus(MmCorpus):
    """`MmCorpus` that iterates over a shuffled train or test subset of its documents.

    Document ids are permuted with a seeded `RandomState`. The first
    `testsize` ids of the permutation form the test split and the remaining
    ids form the train split.
    """

    def __init__(self, random_seed=42, testset=False, testsize=1000, *args,
                 **kwargs):
        """Open the corpus and select the ids of the requested split.

        Parameters
        ----------
        random_seed : int
            Seed of the `RandomState` that shuffles the document ids.
        testset : bool
            If True keep the test split, otherwise keep the train split.
        testsize : int
            Number of shuffled ids that make up the test split.
        *args, **kwargs
            Forwarded unchanged to `MmCorpus.__init__`.
        """
        super().__init__(*args, **kwargs)

        shuffled_ids = np.random.RandomState(random_seed).permutation(
            range(self.num_docs)
        )
        self.indices = shuffled_ids[:testsize] if testset else shuffled_ids[testsize:]

    def __iter__(self):
        """Yield the documents of the selected split in shuffled order."""
        yield from (self[doc_id] for doc_id in self.indices)

    def __len__(self):
        """Return the number of documents in the selected split."""
        return len(self.indices)

### Create and save corpus

In [8]:
# Set to True to (re)serialize the bag-of-words corpus to 'wiki.mm'.
SAVE_CORPUS = False

if SAVE_CORPUS:
    # Lazily convert every preprocessed article to its bag-of-words vector.
    corpus = map(
        dictionary.doc2bow,
        get_preprocessed_articles('wiki_articles.jsonlines'),
    )

    RandomCorpus.serialize('wiki.mm', corpus)

### Load train and test corpus
Using `RandomCorpus` wrapper

In [9]:
# Both views use the same seed and testsize, so they share one permutation:
# the test view keeps its first 2000 ids and the train view keeps the rest.
train_corpus = RandomCorpus(
    random_seed=42, testset=False, testsize=2000, fname='wiki.mm'
)
test_corpus = RandomCorpus(
    random_seed=42, testset=True, testsize=2000, fname='wiki.mm'
)

2019-01-15 19:31:07,323 : INFO : loaded corpus index from wiki.mm.index
2019-01-15 19:31:07,324 : INFO : initializing cython corpus reader from wiki.mm
2019-01-15 19:31:07,325 : INFO : accepted corpus with 4924894 documents, 100000 features, 683375728 non-zero entries
2019-01-15 19:31:08,544 : INFO : loaded corpus index from wiki.mm.index
2019-01-15 19:31:08,544 : INFO : initializing cython corpus reader from wiki.mm
2019-01-15 19:31:08,545 : INFO : accepted corpus with 4924894 documents, 100000 features, 683375728 non-zero entries


## Metrics

In [10]:
def get_execution_time(func):
    """Call `func` with no arguments and measure how long the call takes.

    Parameters
    ----------
    func : callable
        Zero-argument callable to run.

    Returns
    -------
    tuple
        `(elapsed_seconds, result)`: the `time.time()` difference around the
        call, and the value `func` returned.
    """
    started_at = time.time()
    outcome = func()
    elapsed = time.time() - started_at

    return elapsed, outcome


def get_tm_metrics(model, test_corpus):
    """Compute evaluation metrics of a trained topic model on a held-out corpus.

    The corpus is reconstructed as `W.dot(H)`, where `W` is the transposed
    topic matrix of the model and `H` holds, per document, the topic weights
    returned by `model.get_document_topics`. Each document column of the
    reconstruction is rescaled to sum to one before it is compared with the
    dense test corpus.

    Parameters
    ----------
    model
        Trained topic model exposing `get_topics`, `num_topics`,
        `get_document_topics` and `show_topics`.
    test_corpus
        Sized, re-iterable bag-of-words corpus.

    Returns
    -------
    dict
        `perplexity`, `coherence` (u_mass, computed on `test_corpus`),
        `topics` (output of `model.show_topics()`) and `l2_norm`.
    """
    W = model.get_topics().T
    H = np.zeros((model.num_topics, len(test_corpus)))
    for bow_id, bow in enumerate(test_corpus):
        for topic_id, topic_weight in model.get_document_topics(bow):
            H[topic_id, bow_id] = topic_weight

    pred_factors = W.dot(H)
    # Rescale every document column to sum to one. Columns whose sum is zero
    # are left as they are instead of being turned into NaN by a 0/0 division.
    column_sums = pred_factors.sum(axis=0)
    np.divide(pred_factors, column_sums, out=pred_factors, where=column_sums != 0)

    dense_corpus = matutils.corpus2dense(test_corpus, pred_factors.shape[0])

    perplexity = get_tm_perplexity(pred_factors, dense_corpus)

    l2_norm = get_tm_l2_norm(pred_factors, dense_corpus)

    # Coherence and the reported topics are computed with normalization on.
    # Remember the previous setting and restore it even if something fails, so
    # the caller's model is not left in a different state. Models without the
    # attribute end up with `normalize = False`, as before.
    previous_normalize = getattr(model, 'normalize', False)
    model.normalize = True
    try:
        coherence = CoherenceModel(
            model=model,
            corpus=test_corpus,
            coherence='u_mass'
        ).get_coherence()

        topics = model.show_topics()
    finally:
        model.normalize = previous_normalize

    return dict(
        perplexity=perplexity,
        coherence=coherence,
        topics=topics,
        l2_norm=l2_norm,
    )


def get_tm_perplexity(pred_factors, dense_corpus):
    """Return the perplexity of the predicted distributions on the observed counts.

    Computed as `exp(-sum(log(pred) * counts) / sum(counts))`. Entries whose
    predicted value is not positive contribute zero to the sum.

    Parameters
    ----------
    pred_factors : numpy.ndarray
        Predicted values, same shape as `dense_corpus`.
    dense_corpus : numpy.ndarray
        Observed counts.

    Returns
    -------
    float
        The perplexity.
    """
    # With `where=` and no `out=`, numpy leaves the masked-out entries
    # uninitialized, so the result could contain arbitrary values (even
    # NaN/inf). Providing a zero-filled `out` makes those entries exactly 0.
    log_pred = np.log(
        pred_factors,
        out=np.zeros_like(pred_factors, dtype=float),
        where=pred_factors > 0,
    )
    return np.exp(-(log_pred * dense_corpus).sum() / dense_corpus.sum())


def get_tm_l2_norm(pred_factors, dense_corpus):
    """Return the norm of the difference between the normalized corpus and the prediction.

    Every column of `dense_corpus` is divided by its sum before
    `pred_factors` is subtracted; `np.linalg.norm` of the difference is
    returned.

    Parameters
    ----------
    pred_factors : numpy.ndarray
        Predicted values, same shape as `dense_corpus`.
    dense_corpus : numpy.ndarray
        Observed counts.

    Returns
    -------
    float
        Norm of the residual matrix.
    """
    column_sums = dense_corpus.sum(axis=0)
    # Columns that sum to zero (empty documents) stay all-zero; dividing them
    # would give 0/0 = NaN and make the whole metric NaN.
    normalized_corpus = np.divide(
        dense_corpus,
        column_sums,
        out=np.zeros_like(dense_corpus, dtype=float),
        where=column_sums != 0,
    )
    return np.linalg.norm(normalized_corpus - pred_factors)

Define dataframe in which we'll store metrics

In [11]:
# Starts empty; the cells below add one row of metrics per trained model.
tm_metrics = pd.DataFrame()

### Define common params for models

In [12]:
# Keyword arguments passed to every model constructor below.
params = {
    'corpus': train_corpus,
    'chunksize': 2000,
    'num_topics': 50,
    'id2word': dictionary,
    'passes': 1,
    'eval_every': 10,
    'minimum_probability': 0,
    'random_state': 42,
}

## Training

### Train NMF and save it
Normalization is turned off to compute metrics correctly

In [13]:
# Train NMF without residuals, timing the constructor call, then save the model.
row = {'model': 'nmf'}
row['train_time'], nmf = get_execution_time(
    lambda: Nmf(use_r=False, normalize=False, **params)
)
nmf.save('nmf.model')

2019-01-15 19:33:21,875 : INFO : Loss (no outliers): 2186.768444126956	Loss (with outliers): 2186.768444126956
2019-01-15 19:34:49,514 : INFO : Loss (no outliers): 2298.434152045061	Loss (with outliers): 2298.434152045061
==Truncated==
2019-01-15 20:44:23,913 : INFO : Loss (no outliers): 1322.9664709183141	Loss (with outliers): 1322.9664709183141
2019-01-15 20:44:23,928 : INFO : saving Nmf object under nmf.model, separately None
2019-01-15 20:44:24,625 : INFO : saved nmf.model


### Load NMF and store metrics

In [14]:
nmf = Nmf.load('nmf.model')
row.update(get_tm_metrics(nmf, test_corpus))
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with a one-row
# frame adds the same record on both old and new pandas versions.
tm_metrics = pd.concat([tm_metrics, pd.DataFrame([row])], ignore_index=True)

nmf.show_topics(50)

2019-01-15 20:44:24,872 : INFO : loading Nmf object from nmf.model
2019-01-15 20:44:25,150 : INFO : loading id2word recursively from nmf.model.id2word.* with mmap=None
2019-01-15 20:44:25,151 : INFO : loaded nmf.model
2019-01-15 20:44:54,148 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2019-01-15 20:44:54,336 : INFO : CorpusAccumulator accumulated stats from 2000 documents


[(0,
  '0.075*"parti" + 0.071*"elect" + 0.042*"democrat" + 0.029*"republican" + 0.022*"vote" + 0.018*"conserv" + 0.017*"liber" + 0.014*"candid" + 0.013*"seat" + 0.013*"labour"'),
 (1,
  '0.039*"book" + 0.038*"centuri" + 0.032*"histori" + 0.032*"languag" + 0.032*"publish" + 0.024*"english" + 0.023*"world" + 0.022*"law" + 0.022*"govern" + 0.021*"nation"'),
 (2,
  '0.050*"war" + 0.036*"forc" + 0.026*"armi" + 0.023*"battl" + 0.021*"attack" + 0.019*"militari" + 0.018*"german" + 0.016*"british" + 0.015*"command" + 0.014*"kill"'),
 (3,
  '0.119*"race" + 0.106*"car" + 0.073*"engin" + 0.035*"model" + 0.030*"driver" + 0.029*"vehicl" + 0.029*"ford" + 0.028*"lap" + 0.023*"electr" + 0.020*"power"'),
 (4,
  '0.102*"leagu" + 0.092*"club" + 0.049*"footbal" + 0.047*"cup" + 0.029*"plai" + 0.028*"season" + 0.028*"divis" + 0.028*"goal" + 0.022*"team" + 0.021*"unit"'),
 (5,
  '0.055*"award" + 0.041*"best" + 0.008*"nomin" + 0.008*"year" + 0.006*"actress" + 0.006*"actor" + 0.005*"perform" + 0.005*"artist" + 

### Train NMF with residuals and save it
Residuals add regularization to the model, which increases quality but slows down training.

In [15]:
# Train NMF with residuals, timing the constructor call, then save the model.
row = {'model': 'nmf_with_r'}
row['train_time'], nmf_with_r = get_execution_time(
    lambda: Nmf(use_r=True, lambda_=200, normalize=False, **params)
)
nmf_with_r.save('nmf_with_r.model')

2019-01-15 20:54:05,363 : INFO : Loss (no outliers): 2179.9524465227146	Loss (with outliers): 2102.354108449905
2019-01-15 20:57:12,821 : INFO : Loss (no outliers): 2268.3200929871823	Loss (with outliers): 2110.928651253909
==Truncated==
2019-01-16 04:05:46,589 : INFO : Loss (no outliers): 1321.521323758918	Loss (with outliers): 1282.9364495345592
2019-01-16 04:05:46,599 : INFO : saving Nmf object under nmf_with_r.model, separately None
2019-01-16 04:05:46,601 : INFO : storing scipy.sparse array '_r' under nmf_with_r.model._r.npy
2019-01-16 04:05:47,781 : INFO : saved nmf_with_r.model


### Load NMF with residuals and store metrics

In [16]:
nmf_with_r = Nmf.load('nmf_with_r.model')
row.update(get_tm_metrics(nmf_with_r, test_corpus))
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with a one-row
# frame adds the same record on both old and new pandas versions.
tm_metrics = pd.concat([tm_metrics, pd.DataFrame([row])], ignore_index=True)

nmf_with_r.show_topics(50)

2019-01-16 04:05:48,017 : INFO : loading Nmf object from nmf_with_r.model
2019-01-16 04:05:48,272 : INFO : loading id2word recursively from nmf_with_r.model.id2word.* with mmap=None
2019-01-16 04:05:48,273 : INFO : loading _r from nmf_with_r.model._r.npy with mmap=None
2019-01-16 04:05:48,304 : INFO : loaded nmf_with_r.model
2019-01-16 04:06:27,119 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2019-01-16 04:06:27,253 : INFO : CorpusAccumulator accumulated stats from 2000 documents


[(0,
  '0.062*"parti" + 0.061*"elect" + 0.031*"democrat" + 0.020*"republican" + 0.020*"vote" + 0.013*"liber" + 0.012*"candid" + 0.012*"conserv" + 0.011*"seat" + 0.010*"member"'),
 (1,
  '0.052*"book" + 0.040*"centuri" + 0.039*"publish" + 0.031*"languag" + 0.027*"histori" + 0.025*"work" + 0.023*"english" + 0.022*"king" + 0.019*"polit" + 0.019*"author"'),
 (2,
  '0.031*"armi" + 0.028*"divis" + 0.025*"regiment" + 0.022*"forc" + 0.020*"battalion" + 0.019*"infantri" + 0.019*"command" + 0.017*"brigad" + 0.016*"gener" + 0.012*"corp"'),
 (3,
  '0.110*"race" + 0.059*"car" + 0.033*"engin" + 0.025*"lap" + 0.023*"driver" + 0.021*"ret" + 0.020*"ford" + 0.015*"finish" + 0.015*"motorsport" + 0.015*"chevrolet"'),
 (4,
  '0.130*"club" + 0.068*"cup" + 0.046*"footbal" + 0.044*"goal" + 0.032*"leagu" + 0.031*"unit" + 0.031*"plai" + 0.030*"match" + 0.026*"score" + 0.021*"player"'),
 (5,
  '0.041*"award" + 0.030*"best" + 0.006*"nomin" + 0.005*"actress" + 0.005*"year" + 0.004*"actor" + 0.004*"won" + 0.004*"pe

### Train LDA and save it
LDA is a commonly used model for topic modeling.

In [17]:
# Train LDA with the shared parameters, timing the constructor call, then save the model.
row = {'model': 'lda'}
row['train_time'], lda = get_execution_time(lambda: LdaModel(**params))
lda.save('lda.model')

2019-01-16 04:06:27,576 : INFO : using symmetric alpha at 0.02
2019-01-16 04:06:27,576 : INFO : using symmetric eta at 0.02
2019-01-16 04:06:27,589 : INFO : using serial LDA version on this node
2019-01-16 04:06:28,185 : INFO : running online (single-pass) LDA training, 50 topics, 1 passes over the supplied corpus of 4922894 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2019-01-16 04:06:28,910 : INFO : PROGRESS: pass 0, at document #2000/4922894
==Truncated==
2019-01-16 06:24:26,456 : INFO : topic diff=0.003897, rho=0.020154
2019-01-16 06:24:26,465 : INFO : saving LdaState object under lda.model.state, separately None
2019-01-16 06:24:26,680 : INFO : saved lda.model.state
2019-01-16 06:24:26,732 : INFO : saving LdaModel object under lda.model, separately ['expElogbeta', 'sstats']
2019-01-16 06:24:26,732 : INFO : storing np array 'expElogbeta' to lda.model.expElogbeta.npy
2019-01-1

### Load LDA and store metrics

In [18]:
lda = LdaModel.load('lda.model')
row.update(get_tm_metrics(lda, test_corpus))
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with a one-row
# frame adds the same record on both old and new pandas versions.
tm_metrics = pd.concat([tm_metrics, pd.DataFrame([row])], ignore_index=True)

lda.show_topics(50)

2019-01-16 06:24:27,064 : INFO : loading LdaModel object from lda.model
2019-01-16 06:24:27,070 : INFO : loading expElogbeta from lda.model.expElogbeta.npy with mmap=None
2019-01-16 06:24:27,077 : INFO : setting ignored attribute dispatcher to None
2019-01-16 06:24:27,078 : INFO : setting ignored attribute id2word to None
2019-01-16 06:24:27,078 : INFO : setting ignored attribute state to None
2019-01-16 06:24:27,079 : INFO : loaded lda.model
2019-01-16 06:24:27,079 : INFO : loading LdaState object from lda.model.state
2019-01-16 06:24:27,173 : INFO : loaded lda.model.state
2019-01-16 06:24:41,257 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2019-01-16 06:24:41,452 : INFO : CorpusAccumulator accumulated stats from 2000 documents


[(0,
  '0.033*"war" + 0.028*"armi" + 0.021*"forc" + 0.020*"command" + 0.015*"militari" + 0.015*"battl" + 0.013*"gener" + 0.012*"offic" + 0.011*"divis" + 0.011*"regiment"'),
 (1,
  '0.038*"album" + 0.028*"song" + 0.026*"releas" + 0.026*"record" + 0.021*"band" + 0.016*"singl" + 0.015*"music" + 0.014*"chart" + 0.013*"track" + 0.010*"guitar"'),
 (2,
  '0.062*"german" + 0.039*"germani" + 0.025*"van" + 0.023*"von" + 0.020*"der" + 0.019*"dutch" + 0.019*"berlin" + 0.015*"swedish" + 0.014*"netherland" + 0.014*"sweden"'),
 (3,
  '0.032*"john" + 0.027*"william" + 0.019*"british" + 0.015*"georg" + 0.015*"london" + 0.014*"thoma" + 0.014*"sir" + 0.014*"jame" + 0.013*"royal" + 0.013*"henri"'),
 (4,
  '0.137*"school" + 0.040*"colleg" + 0.039*"student" + 0.033*"univers" + 0.030*"high" + 0.028*"educ" + 0.016*"year" + 0.011*"graduat" + 0.010*"state" + 0.009*"campu"'),
 (5,
  '0.030*"game" + 0.009*"develop" + 0.009*"player" + 0.008*"releas" + 0.008*"us" + 0.008*"softwar" + 0.008*"version" + 0.008*"user" +

## Results

In [19]:
# Display the collected metrics, one row per model.
tm_metrics

Unnamed: 0,coherence,l2_norm,model,perplexity,topics,train_time
0,-2.814135,7.265412,nmf,975.740399,"[(24, 0.131*""mount"" + 0.129*""lemmon"" + 0.129*""...",4394.560518
1,-2.43665,7.268837,nmf_with_r,985.570926,"[(49, 0.112*""peak"" + 0.111*""kitt"" + 0.111*""mou...",26451.927848
2,-2.514469,7.371544,lda,4727.075546,"[(35, 0.034*""kong"" + 0.034*""japanes"" + 0.033*""...",8278.89106


#### RAM Usage:
- nmf: 100-150 MB
- nmf_with_r: 3-9 GB
- lda: 100 MB

In [20]:
# Print, for every evaluated model, a banner with its name followed by its stored topics.
for row_idx, row in tm_metrics.iterrows():
    # Banner line, model name, banner line, blank line.
    print('='*20, row['model'], '='*20, '', sep='\n')
    for topic_idx, tokens in row['topics']:
        # Topic id, its top words, then a blank separator line.
        print('Topic: {}'.format(topic_idx), tokens, '', sep='\n')
    print()

nmf

Topic: 24
0.131*"mount" + 0.129*"lemmon" + 0.129*"peak" + 0.127*"kitt" + 0.127*"spacewatch" + 0.065*"survei" + 0.037*"octob" + 0.031*"septemb" + 0.023*"css" + 0.023*"catalina"

Topic: 32
0.196*"linear" + 0.195*"socorro" + 0.045*"septemb" + 0.039*"neat" + 0.035*"palomar" + 0.032*"octob" + 0.024*"kitt" + 0.024*"peak" + 0.024*"spacewatch" + 0.023*"anderson"

Topic: 8
0.331*"align" + 0.270*"left" + 0.071*"right" + 0.040*"text" + 0.035*"style" + 0.022*"center" + 0.013*"bar" + 0.009*"till" + 0.008*"bgcolor" + 0.008*"color"

Topic: 27
0.186*"district" + 0.027*"pennsylvania" + 0.022*"grade" + 0.017*"fund" + 0.017*"educ" + 0.017*"basic" + 0.016*"level" + 0.014*"oblast" + 0.014*"rural" + 0.013*"tax"

Topic: 48
0.103*"art" + 0.066*"museum" + 0.040*"paint" + 0.035*"work" + 0.026*"artist" + 0.024*"galleri" + 0.022*"exhibit" + 0.019*"collect" + 0.015*"histori" + 0.013*"jpg"

Topic: 11
0.122*"new" + 0.043*"york" + 0.009*"zealand" + 0.007*"jersei" + 0.006*"american" + 0.006*"time" + 0.006*"austra

As we can see, NMF can be significantly faster than LDA while sacrificing little or none of the topic quality.

Moreover, NMF can be very flexible in its RAM usage thanks to the sparsity option, which keeps only a small number of elements in the inner matrices.