In [1]:
from gensim.corpora import WikiCorpus
from gensim.models import Nmf, LdaModel
import gensim.downloader as api
from gensim.parsing.preprocessing import preprocess_documents

import logging

# Surface gensim's INFO-level progress messages (timestamp : level : message)
# in the notebook output while models are being trained.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [2]:
data = api.load("wiki-english-20171001")

# Preview the first article only: print each section's title followed by its text.
for article in data:
    titles = article['section_titles']
    texts = article['section_texts']
    for section_title, section_text in zip(titles, texts):
        print("Section title: %s" % section_title)
        print("Section text: %s" % section_text)
    break  # one article is enough for a preview

Section title: Introduction
Section text: 




'''Anarchism''' is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although several authors have defined them more specifically as institutions based on non-hierarchical free associations. Anarchism holds the state to be undesirable, unnecessary and harmful.

While anti-statism is central, anarchism specifically entails opposing authority or hierarchical organisation in the conduct of all human relations, including—but not limited to—the state system. Anarchism is usually considered a far-left ideology and much of anarchist economics and anarchist legal philosophy reflects anti-authoritarian interpretations of communism, collectivism, syndicalism, mutualism or participatory economics.

Anarchism does not offer a fixed body of doctrine from a single particular world view, instead fluxing and flowing as a philosophy. Many types and traditions of 

In [3]:
import itertools

ARTICLES_COUNT = 10000


def _article_text(article):
    """Flatten one article into a single string: "title text title text ..."."""
    pieces = []
    for title, text in zip(article['section_titles'], article['section_texts']):
        pieces.append(" ".join((title, text)))
    return " ".join(pieces)


# Only the first ARTICLES_COUNT articles are read from the dataset; each one is
# flattened to plain text and handed to gensim's preprocess_documents.
wiki_articles = preprocess_documents(
    _article_text(article)
    for article in itertools.islice(data, ARTICLES_COUNT)
)

In [4]:
from gensim.corpora import Dictionary

# Build the token <-> id mapping from the preprocessed articles.
dictionary = Dictionary(wiki_articles)

# Prune the vocabulary in place: keep tokens that occur in at least 5 documents
# and in no more than 50% of them (the thresholds reported in the log output).
dictionary.filter_extremes(no_below=5, no_above=0.5)

2018-06-02 20:45:10,808 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-06-02 20:45:28,612 : INFO : built Dictionary(399748 unique tokens: ['abandon', 'abdelrahim', 'abil', 'abl', 'abolit']...) from 10000 documents (total 17815678 corpus positions)
2018-06-02 20:45:29,275 : INFO : discarding 336501 tokens: [('abdelrahim', 2), ('abstention', 3), ('amoureus', 3), ('amoureux', 1), ('amparo', 3), ('anarchica', 3), ('anarchosyndicalist', 2), ('arbet', 2), ('archo', 1), ('arditi', 4)]...
2018-06-02 20:45:29,276 : INFO : keeping 63247 tokens which were in no less than 5 and no more than 5000 (=50.0%) documents
2018-06-02 20:45:29,474 : INFO : resulting dictionary: Dictionary(63247 unique tokens: ['abandon', 'abil', 'abl', 'abolit', 'abstent']...)


In [5]:
# Convert every preprocessed article into its bag-of-words representation
# using the filtered dictionary.
corpus = list(map(dictionary.doc2bow, wiki_articles))

In [6]:
%%time

PASSES=1

gensim_nmf = Nmf(
    corpus,
    chunksize=2000,
    passes=PASSES,
    num_topics=20,
    id2word=dictionary,
    lambda_=1000,
    kappa=1.,
    normalize=False
)

2018-06-02 20:46:12,291 : INFO : Loss (no outliers): 7337.937955083741	Loss (with outliers): 7337.937955083741
2018-06-02 20:46:49,539 : INFO : Loss (no outliers): 7465.065080292521	Loss (with outliers): 7465.065080292521
2018-06-02 20:47:32,027 : INFO : Loss (no outliers): 7242.137297523558	Loss (with outliers): 7242.137297523558
2018-06-02 20:48:15,605 : INFO : Loss (no outliers): 7773.646682256459	Loss (with outliers): 7773.646682256459
2018-06-02 20:48:52,218 : INFO : Loss (no outliers): 7251.947542415921	Loss (with outliers): 7251.947542415921


CPU times: user 5min 14s, sys: 2min 19s, total: 7min 33s
Wall time: 3min 10s


In [7]:
# Inspect the topics learned by the NMF model (up to 20 requested; the model
# was trained with num_topics=20). The cell's last expression is displayed.
gensim_nmf.show_topics(20)

[(0,
  '0.057*"life" + 0.047*"book" + 0.046*"centuri" + 0.033*"god" + 0.033*"publish" + 0.030*"women" + 0.030*"human" + 0.029*"studi" + 0.027*"write" + 0.027*"london"'),
 (1,
  '0.105*"war" + 0.100*"govern" + 0.091*"countri" + 0.068*"parti" + 0.066*"forc" + 0.057*"polit" + 0.056*"union" + 0.055*"soviet" + 0.051*"mexico" + 0.049*"econom"'),
 (2,
  '0.169*"univers" + 0.152*"island" + 0.136*"school" + 0.090*"colleg" + 0.078*"student" + 0.077*"educ" + 0.073*"open" + 0.071*"servic" + 0.069*"librari" + 0.067*"public"'),
 (3,
  '0.713*"film" + 0.135*"award" + 0.114*"best" + 0.104*"brando" + 0.103*"star" + 0.103*"scorses" + 0.099*"director" + 0.084*"actor" + 0.083*"movi" + 0.080*"role"'),
 (4,
  '0.647*"music" + 0.214*"band" + 0.206*"plai" + 0.203*"instrument" + 0.156*"perform" + 0.145*"jazz" + 0.142*"open" + 0.140*"record" + 0.135*"mandolin" + 0.125*"sound"'),
 (5,
  '0.770*"citi" + 0.171*"area" + 0.155*"mexico" + 0.152*"kansa" + 0.122*"popul" + 0.121*"london" + 0.109*"river" + 0.108*"distric

In [8]:
%%time
# %%prun

lda = LdaModel(
    corpus,
    chunksize=2000,
    passes=5,
    num_topics=20,
    id2word=dictionary,
)

2018-06-02 20:48:52,359 : INFO : using symmetric alpha at 0.05
2018-06-02 20:48:52,361 : INFO : using symmetric eta at 0.05
2018-06-02 20:48:52,391 : INFO : using serial LDA version on this node
2018-06-02 20:48:52,634 : INFO : running online (multi-pass) LDA training, 20 topics, 5 passes over the supplied corpus of 10000 documents, updating model once every 2000 documents, evaluating perplexity every 10000 documents, iterating 50x with a convergence threshold of 0.001000
2018-06-02 20:48:52,636 : INFO : PROGRESS: pass 0, at document #2000/10000
2018-06-02 20:48:59,748 : INFO : merging changes from 2000 documents into a model of 10000 documents
2018-06-02 20:48:59,962 : INFO : topic #17 (0.050): 0.003*"american" + 0.003*"citi" + 0.002*"book" + 0.002*"centuri" + 0.002*"space" + 0.002*"apollo" + 0.002*"countri" + 0.002*"land" + 0.002*"game" + 0.002*"oper"
2018-06-02 20:48:59,965 : INFO : topic #3 (0.050): 0.003*"citi" + 0.002*"centuri" + 0.002*"war" + 0.002*"american" + 0.002*"book" + 0.

2018-06-02 20:49:55,911 : INFO : topic #7 (0.050): 0.007*"art" + 0.007*"jpg" + 0.006*"paint" + 0.006*"file" + 0.005*"food" + 0.004*"blue" + 0.004*"centuri" + 0.003*"product" + 0.003*"water" + 0.003*"produc"
2018-06-02 20:49:55,915 : INFO : topic diff=1.105198, rho=0.377964
2018-06-02 20:49:55,917 : INFO : PROGRESS: pass 1, at document #4000/10000
2018-06-02 20:50:02,697 : INFO : merging changes from 2000 documents into a model of 10000 documents
2018-06-02 20:50:02,920 : INFO : topic #5 (0.050): 0.009*"centuri" + 0.005*"empir" + 0.004*"king" + 0.004*"citi" + 0.004*"china" + 0.003*"period" + 0.003*"dynasti" + 0.003*"chines" + 0.003*"muslim" + 0.003*"greek"
2018-06-02 20:50:02,922 : INFO : topic #18 (0.050): 0.007*"plai" + 0.007*"film" + 0.006*"record" + 0.005*"game" + 0.005*"season" + 0.005*"album" + 0.005*"team" + 0.005*"music" + 0.005*"award" + 0.004*"seri"
2018-06-02 20:50:02,924 : INFO : topic #2 (0.050): 0.011*"citi" + 0.006*"countri" + 0.005*"govern" + 0.005*"servic" + 0.005*"comp

2018-06-02 20:50:57,096 : INFO : topic diff=0.577383, rho=0.353553
2018-06-02 20:50:57,097 : INFO : PROGRESS: pass 2, at document #6000/10000
2018-06-02 20:51:03,678 : INFO : merging changes from 2000 documents into a model of 10000 documents
2018-06-02 20:51:03,897 : INFO : topic #19 (0.050): 0.013*"island" + 0.007*"river" + 0.006*"area" + 0.006*"water" + 0.006*"sea" + 0.005*"north" + 0.005*"lake" + 0.005*"star" + 0.005*"south" + 0.005*"speci"
2018-06-02 20:51:03,899 : INFO : topic #17 (0.050): 0.039*"game" + 0.012*"dna" + 0.011*"player" + 0.010*"space" + 0.009*"moon" + 0.007*"apollo" + 0.006*"christma" + 0.006*"gene" + 0.006*"tree" + 0.006*"mission"
2018-06-02 20:51:03,901 : INFO : topic #2 (0.050): 0.013*"citi" + 0.007*"countri" + 0.006*"govern" + 0.006*"servic" + 0.006*"compani" + 0.005*"area" + 0.005*"econom" + 0.005*"intern" + 0.005*"industri" + 0.005*"bank"
2018-06-02 20:51:03,904 : INFO : topic #9 (0.050): 0.010*"engin" + 0.009*"air" + 0.008*"design" + 0.008*"oper" + 0.007*"air

2018-06-02 20:52:04,387 : INFO : topic #15 (0.050): 0.007*"exampl" + 0.006*"inform" + 0.006*"data" + 0.006*"program" + 0.005*"code" + 0.005*"languag" + 0.005*"function" + 0.005*"theori" + 0.005*"comput" + 0.005*"valu"
2018-06-02 20:52:04,389 : INFO : topic #16 (0.050): 0.014*"countri" + 0.012*"relat" + 0.009*"soviet" + 0.008*"embassi" + 0.007*"war" + 0.007*"republ" + 0.007*"foreign" + 0.006*"german" + 0.006*"germani" + 0.006*"intern"
2018-06-02 20:52:04,391 : INFO : topic #8 (0.050): 0.018*"film" + 0.007*"stori" + 0.007*"book" + 0.007*"publish" + 0.005*"charact" + 0.005*"novel" + 0.005*"seri" + 0.005*"life" + 0.005*"fiction" + 0.003*"award"
2018-06-02 20:52:04,394 : INFO : topic #7 (0.050): 0.012*"jpg" + 0.012*"art" + 0.010*"paint" + 0.010*"file" + 0.008*"color" + 0.007*"food" + 0.005*"museum" + 0.005*"centuri" + 0.004*"artist" + 0.004*"design"
2018-06-02 20:52:04,397 : INFO : topic #9 (0.050): 0.010*"engin" + 0.009*"air" + 0.009*"oper" + 0.008*"design" + 0.006*"aircraft" + 0.005*"powe

2018-06-02 20:53:22,258 : INFO : topic #17 (0.050): 0.040*"game" + 0.016*"player" + 0.012*"moon" + 0.009*"space" + 0.009*"calendar" + 0.009*"dna" + 0.008*"mar" + 0.008*"earth" + 0.008*"card" + 0.007*"month"
2018-06-02 20:53:22,260 : INFO : topic #12 (0.050): 0.046*"american" + 0.016*"english" + 0.012*"politician" + 0.012*"player" + 0.010*"author" + 0.009*"footbal" + 0.009*"french" + 0.008*"singer" + 0.007*"actor" + 0.007*"canadian"
2018-06-02 20:53:22,262 : INFO : topic #11 (0.050): 0.015*"parti" + 0.014*"govern" + 0.012*"elect" + 0.011*"polit" + 0.009*"presid" + 0.008*"member" + 0.006*"minist" + 0.006*"right" + 0.005*"power" + 0.005*"support"
2018-06-02 20:53:22,265 : INFO : topic #14 (0.050): 0.016*"church" + 0.014*"univers" + 0.012*"school" + 0.011*"law" + 0.009*"colleg" + 0.008*"court" + 0.007*"educ" + 0.006*"student" + 0.005*"council" + 0.005*"institut"
2018-06-02 20:53:22,268 : INFO : topic diff=0.155145, rho=0.316228


CPU times: user 7min 31s, sys: 9min 48s, total: 17min 19s
Wall time: 4min 29s


In [9]:
# Inspect the 20 LDA topics, for comparison with the NMF topics.
lda.show_topics(num_topics=20)

[(0,
  '0.008*"god" + 0.006*"book" + 0.005*"christian" + 0.005*"centuri" + 0.005*"life" + 0.004*"univers" + 0.004*"philosophi" + 0.004*"human" + 0.004*"jewish" + 0.003*"theori"'),
 (1,
  '0.009*"cell" + 0.009*"acid" + 0.008*"metal" + 0.008*"chemic" + 0.007*"water" + 0.007*"atom" + 0.007*"carbon" + 0.006*"process" + 0.006*"produc" + 0.006*"protein"'),
 (2,
  '0.016*"citi" + 0.008*"countri" + 0.006*"govern" + 0.006*"compani" + 0.006*"servic" + 0.006*"industri" + 0.006*"area" + 0.006*"econom" + 0.005*"intern" + 0.005*"trade"'),
 (3,
  '0.031*"music" + 0.009*"instrument" + 0.009*"plai" + 0.007*"perform" + 0.007*"compos" + 0.006*"guitar" + 0.006*"sound" + 0.006*"song" + 0.006*"style" + 0.006*"danc"'),
 (4,
  '0.008*"field" + 0.008*"function" + 0.007*"theori" + 0.007*"space" + 0.007*"measur" + 0.007*"point" + 0.006*"light" + 0.006*"equat" + 0.005*"exampl" + 0.005*"energi"'),
 (5,
  '0.010*"centuri" + 0.008*"empir" + 0.008*"india" + 0.007*"islam" + 0.006*"muslim" + 0.005*"citi" + 0.005*"chine