In [1]:
import pyLDAvis
import pyLDAvis.sklearn
from pyLDAvis._prepare import (js_PCoA, js_MMDS, js_TSNE)

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
# Switch on pyLDAvis' notebook integration before any visualisation is built.
# NOTE(review): per the pyLDAvis docs this makes prepared visualisations render
# inline as cell output -- confirm against the installed pyLDAvis version.
pyLDAvis.enable_notebook()

## load data

In [4]:
# Fetch the 20 Newsgroups corpus, asking scikit-learn to strip headers,
# footers and quoted replies from each post.
newsgroup = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
docs_raw = newsgroup.data

# Sanity check: number of documents loaded.
# Single-argument print(...) behaves identically on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(len(docs_raw))

11314


## document-term matrix

In [5]:
# TF-IDF vectorizer configuration (nothing is fitted in this cell).
tfidf_vectorizer = TfidfVectorizer(
    strip_accents='unicode',            # accent stripping, 'unicode' mode
    stop_words='english',               # use the 'english' stop-word list
    token_pattern=r'\b[a-zA-Z]{2,}\b',  # tokens: runs of 2+ ASCII letters
    max_df=0.5,                         # document-frequency ceiling (proportion)
    min_df=10,                          # document-frequency floor (absolute count)
)

In [6]:
# Fit the vectorizer on the raw documents and transform them into the
# document-term matrix in a single step.
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)

# Show the matrix dimensions; the first entry equals the number of documents.
# Single-argument print(...) works on both Python 2 and Python 3, unlike the
# bare `print x` statement, which is a SyntaxError on Python 3.
print(dtm_tfidf.shape)

(11314, 9597)


## Latent Dirichlet allocation

In [7]:
# Configure a 20-topic LDA model using the batch learning method, with a fixed
# random_state so repeated runs use the same seed.
# NOTE(review): later scikit-learn releases rename `n_topics` to
# `n_components`; the original keyword is kept here to match the environment
# this notebook was recorded with -- confirm before upgrading scikit-learn.
lda = LatentDirichletAllocation(
    n_topics=20,
    learning_method='batch',
    random_state=0,
)

# Fit on the TF-IDF document-term matrix. The call is left as the last
# expression so the fitted estimator's repr is shown as the cell output.
lda.fit(dtm_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=20, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

## pyLDAvis

### PCoA / CMDS

In [8]:
# Build the visualisation with js_PCoA as the `mds` layout function.
# sort_topics=False is passed through unchanged; presumably it keeps the topic
# numbering aligned with the fitted model -- verify against the pyLDAvis docs.
pyLDAvis.sklearn.prepare(
    lda,
    dtm_tfidf,
    tfidf_vectorizer,
    sort_topics=False,
    mds=js_PCoA,
)

### MMDS

In [9]:
# Same model, matrix and vectorizer as the other visualisation cells; only the
# `mds` layout function differs (js_MMDS here).
pyLDAvis.sklearn.prepare(
    lda,
    dtm_tfidf,
    tfidf_vectorizer,
    sort_topics=False,
    mds=js_MMDS,
)

### TSNE

In [10]:
# Same model, matrix and vectorizer as the other visualisation cells; only the
# `mds` layout function differs (js_TSNE here).
pyLDAvis.sklearn.prepare(
    lda,
    dtm_tfidf,
    tfidf_vectorizer,
    sort_topics=False,
    mds=js_TSNE,
)