<a href="https://colab.research.google.com/github/cyrus723/my-first-binder/blob/main/LDA_Kochmar2022_NLP_book_CH10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook has two parts: the first is taken from the scikit-learn documentation and the second from the book by Kochmar (2022).

# The first code: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

In [None]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from time import time

import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Number of posts kept from the head of the shuffled dataset.
n_samples = 2000
# Vocabulary size cap, passed as `max_features` to both vectorizers.
n_features = 1000
# Number of topics every model is asked to extract.
n_components = 10
# Number of highest-weighted terms drawn per topic by `plot_top_words`.
n_top_words = 20
# Mini-batch size handed to the MiniBatchNMF models.
batch_size = 128
# Value passed as `init` to the NMF and MiniBatchNMF models.
init = "nndsvda"


def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart of the highest-weighted terms per topic.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``components_``; each row is read as the term weights
        of one topic.
    feature_names : array
        Term for every column of ``components_``. It is indexed with an
        integer index array, so it has to support that kind of indexing.
    n_top_words : int
        How many of the highest-weighted terms to draw for each topic.
    title : str
        Title placed above the whole figure.

    The grid is five panels wide and grows by one row per five topics, so
    the layout for ten topics is the familiar 2 x 5 figure of size (30, 15).
    """
    n_cols = 5
    n_topics = len(model.components_)
    # Ceiling division without importing math; always keep at least one row
    # so that plt.subplots receives a valid grid.
    n_rows = max(1, -(-n_topics // n_cols))
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the tail holds the largest weights.
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # Panels beyond the last topic would otherwise show up as empty axes.
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    # The figure title only needs to be set once, not once per topic.
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")


Loading dataset...


  and should_run_async(code)


In [None]:
# Fetch the posts (headers, footers and quoted replies removed) and keep
# the first `n_samples` of the shuffled result.
t0 = time()
data, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))
print(len(data_samples))
# Peek at the last sampled post. Indexing from the end keeps this line valid
# when `n_samples` is changed, unlike a hard-coded position such as 1999.
data_samples[-1]

  and should_run_async(code)


done in 3.208s.
2000


"\n\n\nNeither did he!\n\n\nOverall?  How do you figure?\n\n\nSo far my radio hasn't exploded from not being tuned to 660...\n"

In [None]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
# With IPython's default settings only the last expression of a cell is
# displayed, so the type and shape are printed explicitly instead of being
# left as bare expressions in the middle of the cell.
print(type(tfidf))
print(tfidf.shape)
print(tfidf)


  and should_run_async(code)


Extracting tf-idf features for NMF...
done in 1.101s.


(2000, 1000)

  (0, 708)	0.12621877625178227
  (0, 410)	0.11650651629173196
  (0, 493)	0.1631127602376565
  (0, 548)	0.11873384536901997
  (0, 130)	0.13595955391213657
  (0, 567)	0.13595955391213657
  (0, 412)	0.12831668397369733
  (0, 750)	0.15376128408643466
  (0, 841)	0.18564440175793037
  (0, 206)	0.15810189392327795
  (0, 764)	0.1640284908630232
  (0, 748)	0.13595955391213657
  (0, 904)	0.08983671288492111
  (0, 923)	0.11966934266418663
  (0, 527)	0.1690393571774018
  (0, 432)	0.13369075280946802
  (0, 988)	0.12740095334833063
  (0, 488)	0.3750048191807266
  (0, 717)	0.17767638066823058
  (0, 587)	0.6454209423982519
  (0, 862)	0.1551447391479567
  (0, 286)	0.11115911128919416
  (0, 867)	0.15810189392327795
  (0, 881)	0.11227372176926384
  (1, 381)	0.20157910011124136
  :	:
  (1998, 504)	0.04875543232365812
  (1998, 991)	0.053978162418983656
  (1998, 566)	0.03637572081429063
  (1998, 611)	0.05504978412016225
  (1998, 171)	0.047384737904817335
  (1998, 414)	0.08876861152823663
  (1998, 268)	0.235

In [None]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()


def _fit_and_plot(estimator, features, feature_names, message, title,
                  blank_lines_first=True):
    """Announce, fit, time and plot one topic model.

    Parameters
    ----------
    estimator : unfitted model exposing ``fit`` and, once fitted, ``components_``
    features : document-term matrix handed to ``estimator.fit``
    feature_names : terms matching the columns of ``features``
    message : str
        Progress line printed before fitting.
    title : str
        Figure title forwarded to ``plot_top_words``.
    blank_lines_first : bool
        When true the message is printed as ``print("\\n" * 2, message)``,
        which separates it from the previous model's output.

    Returns
    -------
    The same ``estimator`` object, after ``fit`` has been called on it.
    """
    if blank_lines_first:
        print("\n" * 2, message)
    else:
        print(message)
    start = time()
    estimator.fit(features)
    print("done in %0.3fs." % (time() - start))
    plot_top_words(estimator, feature_names, n_top_words, title)
    return estimator


# Neither vectorizer is refitted below, so each vocabulary is looked up once
# instead of before every plot.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tf_feature_names = tf_vectorizer.get_feature_names_out()

# Fit the NMF model
nmf = _fit_and_plot(
    NMF(
        n_components=n_components,
        random_state=1,
        init=init,
        beta_loss="frobenius",
        alpha_W=0.00005,
        alpha_H=0.00005,
        l1_ratio=1,
    ),
    tfidf,
    tfidf_feature_names,
    "Fitting the NMF model (Frobenius norm) with tf-idf features, "
    "n_samples=%d and n_features=%d..." % (n_samples, n_features),
    "Topics in NMF model (Frobenius norm)",
    blank_lines_first=False,
)

# Fit the NMF model
nmf = _fit_and_plot(
    NMF(
        n_components=n_components,
        random_state=1,
        init=init,
        beta_loss="kullback-leibler",
        solver="mu",
        max_iter=1000,
        alpha_W=0.00005,
        alpha_H=0.00005,
        l1_ratio=0.5,
    ),
    tfidf,
    tfidf_feature_names,
    "Fitting the NMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
    "Topics in NMF model (generalized Kullback-Leibler divergence)",
)

# Fit the MiniBatchNMF model
mbnmf = _fit_and_plot(
    MiniBatchNMF(
        n_components=n_components,
        random_state=1,
        batch_size=batch_size,
        init=init,
        beta_loss="frobenius",
        alpha_W=0.00005,
        alpha_H=0.00005,
        l1_ratio=0.5,
    ),
    tfidf,
    tfidf_feature_names,
    "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
    "features, n_samples=%d and n_features=%d, batch_size=%d..."
    % (n_samples, n_features, batch_size),
    "Topics in MiniBatchNMF model (Frobenius norm)",
)

# Fit the MiniBatchNMF model
mbnmf = _fit_and_plot(
    MiniBatchNMF(
        n_components=n_components,
        random_state=1,
        batch_size=batch_size,
        init=init,
        beta_loss="kullback-leibler",
        alpha_W=0.00005,
        alpha_H=0.00005,
        l1_ratio=0.5,
    ),
    tfidf,
    tfidf_feature_names,
    "Fitting the MiniBatchNMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d, "
    "batch_size=%d..." % (n_samples, n_features, batch_size),
    "Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)",
)

# Fit the LDA model on the raw term counts
lda = _fit_and_plot(
    LatentDirichletAllocation(
        n_components=n_components,
        max_iter=5,
        learning_method="online",
        learning_offset=50.0,
        random_state=0,
    ),
    tf,
    tf_feature_names,
    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
    "Topics in LDA model",
)

# Source: https://github.com/ekochmar/Getting-Started-with-NLP/blob/master/Chapter10.ipynb Chapter 10: LDA for Topic Modeling

## Load Newsgroups data

As before, let's consider a specific set of categories:
https://scikit-learn.org/stable/datasets/real_world.html#the-20-newsgroups-text-dataset

The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split in two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test set is based upon messages posted before and after a specific date.

This module contains two loaders. The first one, sklearn.datasets.fetch_20newsgroups, returns a list of the raw texts that can be fed to text feature extractors such as CountVectorizer with custom parameters so as to extract feature vectors. The second one, sklearn.datasets.fetch_20newsgroups_vectorized, returns ready-to-use features, i.e., it is not necessary to use a feature extractor.


In [None]:
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one. The setting applies to all cells run after this one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

  and should_run_async(code)


In [None]:
from sklearn.datasets import fetch_20newsgroups

def load_dataset(sset, cats):
    """Fetch part of the 20 newsgroups corpus with boilerplate removed.

    Parameters
    ----------
    sset : str
        Forwarded to ``fetch_20newsgroups`` as ``subset``.
    cats : sequence of str or None
        Category names to keep. ``None`` or any empty collection means
        "no filter", i.e. every category is loaded.

    Returns
    -------
    Whatever ``fetch_20newsgroups`` returns for these arguments, with
    headers, footers and quoted replies stripped and the posts shuffled.
    """
    # fetch_20newsgroups treats categories=None as "all categories", so one
    # call covers both the filtered and the unfiltered case.
    if cats is not None and len(cats) == 0:
        cats = None
    return fetch_20newsgroups(
        subset=sset,
        categories=cats,
        remove=('headers', 'footers', 'quotes'),
        shuffle=True,
    )

  and should_run_async(code)


In [None]:
# The ten newsgroups this part of the notebook works with.
categories = [
    "comp.windows.x",
    "misc.forsale",
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.crypt",
    "sci.med",
    "sci.space",
    "talk.politics.mideast",
]

# 'all' is handed to load_dataset as the subset to fetch.
newsgroups_all = load_dataset('all', categories)
print(len(newsgroups_all.data))

  and should_run_async(code)


9850


In [None]:
newsgroups_all.keys()

  and should_run_async(code)


dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

## Preprocess

Convert word forms to stems to get concise representations for the documents:

In [None]:
import nltk
from nltk.stem import SnowballStemmer

# One English Snowball stemmer, created once and shared by every call.
stemmer = SnowballStemmer("english")


def stem(text):
    """Return ``text`` reduced to its stem by the shared ``stemmer``."""
    stemmed = stemmer.stem(text)
    return stemmed

  and should_run_async(code)


In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as stopwords

#print(stopwords)

def preprocess(text):
    """Tokenise ``text`` and return the stems of its non-stopword tokens.

    Tokens come from ``gensim.utils.simple_preprocess`` called with
    ``min_len=4``. Tokens found in ``stopwords`` are dropped; the rest are
    passed through ``stem`` and returned in their original order.
    """
    tokens = gensim.utils.simple_preprocess(text, min_len=4)
    return [stem(tok) for tok in tokens if tok not in stopwords]

  and should_run_async(code)


Check how each document is represented. For example, look into the very first one:

In [None]:
# Show the first post three ways: raw text, plain tokens, and the stemmed
# tokens that survive preprocessing.
doc_sample = newsgroups_all.data[0]
print('Original document: ', doc_sample, sep='\n')

words = list(gensim.utils.tokenize(doc_sample))
print('\n\nTokenized document: ', words, sep='\n')

print('\n\nPreprocessed document: ', preprocess(doc_sample), sep='\n')

Original document: 
Hi Xperts!

How can I move the cursor with the keyboard (i.e. cursor keys), 
if no mouse is available?

Any hints welcome.

Thanks.


Tokenized document: 
['Hi', 'Xperts', 'How', 'can', 'I', 'move', 'the', 'cursor', 'with', 'the', 'keyboard', 'i', 'e', 'cursor', 'keys', 'if', 'no', 'mouse', 'is', 'available', 'Any', 'hints', 'welcome', 'Thanks']


Preprocessed document: 
['xpert', 'cursor', 'keyboard', 'cursor', 'key', 'mous', 'avail', 'hint', 'welcom', 'thank']


  and should_run_async(code)


What do the first 10 look like?

In [None]:
# One line per document: its position, a tab, then up to ten of its stems.
for i in range(10):
    first_stems = preprocess(newsgroups_all.data[i])[:10]
    print(f"{i}\t{', '.join(first_stems)}")

0	xpert, cursor, keyboard, cursor, key, mous, avail, hint, welcom, thank
1	obtain, copi, open, look, widget, obtain, need, order, copi, thank
2	right, signal, strong, live, west, philadelphia, perfect, sport, fan, dream
3	canadian, thing, coach, boston, bruin, colorado, rocki, summari, post, gather
4	heck, feel, like, time, includ, cafeteria, work, half, time, headach
5	damn, right, late, climb, meet, morn, bother, right, foot, asleep
6	olympus, stylus, pocket, camera, smallest, class, includ, time, date, stamp
7	includ, follow, chmos, clock, generat, driver, processor, chmos, eras, prom
8	chang, intel, discov, xclient, xload, longer, work, bomb, messag, error
9	termin, like, power, server, run, window, manag, special, client, program


  and should_run_async(code)


Now let's represent each document as a dictionary of relevant words. Each word (*value* in the dictionary) has a unique identifier (*key*):

In [None]:
# Preprocess every post, then let gensim assign an integer id to each stem.
processed_docs = [preprocess(raw_doc) for raw_doc in newsgroups_all.data]
print(len(processed_docs))

dictionary = gensim.corpora.Dictionary(processed_docs)
print(len(dictionary))

# Preview the first ten (id, stem) pairs only.
for index, (key, value) in enumerate(dictionary.iteritems(), start=1):
    print(key, value)
    if index == 10:
        break

  and should_run_async(code)


9850
39350
0 avail
1 cursor
2 hint
3 key
4 keyboard
5 mous
6 thank
7 welcom
8 xpert
9 copi


Put some constraints on the dictionary of terms: for instance, keep up to $100,000$ words that occur in at least $10$ documents (`no_below`) and in no more than $50\%$ of the documents (`no_above`). This should help you extract the most useful terms, while still keeping a reasonable number of them.

In [None]:
# Prune the vocabulary in place: drop stems found in fewer than 10 documents
# or in more than 50% of them, then keep at most the 100,000 most frequent.
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)
print(len(dictionary))

5868


  and should_run_async(code)


Let's see how a particular document is represented in this dictionary: for example, look into the very first post, or into the 100th:

In [None]:
# Turn each preprocessed document into its bag-of-words form via doc2bow.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus[0]
#bow_corpus[99]

  and should_run_async(code)


[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]

Let's decode what each index (key) in this dictionary points to:

In [None]:
# Use bow_corpus[99] here to inspect the 100th post instead.
bow_doc = bow_corpus[0]

# Each entry pairs a dictionary key with its number of occurrences.
for token_id, count in bow_doc:
    print(f"Key {token_id} =\"{dictionary[token_id]}\":    occurrences={count}")

Key 0 ="avail":    occurrences=1
Key 1 ="cursor":    occurrences=2
Key 2 ="hint":    occurrences=1
Key 3 ="key":    occurrences=1
Key 4 ="keyboard":    occurrences=1
Key 5 ="mous":    occurrences=1
Key 6 ="thank":    occurrences=1
Key 7 ="welcom":    occurrences=1
Key 8 ="xpert":    occurrences=1


  and should_run_async(code)


## Train an LDA model

In [None]:
# The trainer takes the id-to-word mapping and the bag-of-words corpus.
id2word = dictionary
corpus = bow_corpus

# Training settings gathered in one place so they are easy to scan and tune.
lda_params = dict(
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=1000,
    passes=10,
    alpha='symmetric',
    iterations=100,
    per_word_topics=True,
)

# Build the LDA model
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus, id2word=id2word, **lda_params
)

# Print every topic (-1 is the argument passed to print_topics).
for topic_id, description in lda_model.print_topics(-1):
    print(f"Topic: {topic_id} \nWords: {description}")

  and should_run_async(code)


Topic: 0 
Words: 0.021*"encrypt" + 0.018*"secur" + 0.018*"chip" + 0.016*"govern" + 0.013*"clipper" + 0.012*"public" + 0.010*"privaci" + 0.010*"key" + 0.010*"phone" + 0.009*"algorithm"
Topic: 1 
Words: 0.017*"appear" + 0.014*"copi" + 0.013*"cover" + 0.013*"star" + 0.013*"book" + 0.011*"penalti" + 0.010*"black" + 0.009*"comic" + 0.008*"blue" + 0.008*"green"
Topic: 2 
Words: 0.031*"window" + 0.015*"server" + 0.012*"program" + 0.012*"file" + 0.012*"applic" + 0.012*"display" + 0.011*"widget" + 0.010*"version" + 0.010*"motif" + 0.010*"support"
Topic: 3 
Words: 0.015*"space" + 0.007*"launch" + 0.007*"year" + 0.007*"medic" + 0.006*"patient" + 0.006*"orbit" + 0.006*"research" + 0.006*"diseas" + 0.005*"develop" + 0.005*"nasa"
Topic: 4 
Words: 0.018*"armenian" + 0.011*"peopl" + 0.008*"kill" + 0.008*"said" + 0.007*"turkish" + 0.006*"muslim" + 0.006*"jew" + 0.006*"govern" + 0.005*"state" + 0.005*"greek"
Topic: 5 
Words: 0.024*"price" + 0.021*"sale" + 0.020*"offer" + 0.017*"drive" + 0.017*"sell" + 0

## Interpret the results

What is the most representative topic in each document?

In [None]:
def analyse_topics(ldamodel, corpus, texts):
    """Find the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : topic model
        Must support ``ldamodel[corpus]``, ``ldamodel.per_word_topics`` and
        ``ldamodel.show_topic(topic_id)``.
    corpus : iterable of documents in the form the model accepts
    texts : sequence
        ``texts[i]`` is the tokenised form of document ``i``; its first
        eight items are kept as a snippet.

    Returns
    -------
    main_topic, percentage, keywords, text_snippets : dict
        Four dictionaries keyed by document position. They hold the
        dominant topic id, its proportion rounded to four decimals, the
        topic's first five words joined by ``", "``, and the snippet.
        A document for which the model reports no topics gets no entry.
    """
    main_topic = {}
    percentage = {}
    keywords = {}
    text_snippets = {}
    # Keywords depend only on the topic, so compute them once per topic
    # rather than once per document.
    keyword_cache = {}
    for i, topic_list in enumerate(ldamodel[corpus]):
        # With per_word_topics the model yields a tuple whose first item is
        # the (topic, proportion) list; otherwise it yields the list itself.
        topics = topic_list[0] if ldamodel.per_word_topics else topic_list
        if len(topics) == 0:
            continue
        # max() returns the first maximal pair, which is the same pair a
        # stable descending sort would have placed first.
        topic_num, prop_topic = max(topics, key=lambda pair: pair[1])
        if topic_num not in keyword_cache:
            wp = ldamodel.show_topic(topic_num)
            keyword_cache[topic_num] = ", ".join(word for word, _ in wp[:5])
        main_topic[i] = int(topic_num)
        percentage[i] = round(prop_topic, 4)
        keywords[i] = keyword_cache[topic_num]
        text_snippets[i] = texts[i][:8]
    return main_topic, percentage, keywords, text_snippets


main_topic, percentage, keywords, text_snippets = analyse_topics(
    lda_model, bow_corpus, processed_docs)

# Tabulate the first ten documents under a header row.
indexes = list(range(10))
header = ['ID', 'Main Topic', 'Contribution (%)', 'Keywords', 'Snippet']
rows = [header] + [
    [
        str(idx),
        str(main_topic.get(idx)),
        f"{percentage.get(idx):.4f}",
        # The trailing newline pushes the snippet onto the following line.
        f"{keywords.get(idx)}\n",
        str(text_snippets.get(idx)),
    ]
    for idx in indexes
]

# Pad every cell to the width of the longest entry in its column.
column_widths = [max(len(cell) for cell in column) for column in zip(*rows)]
for row in rows:
    print(''.join(f" {cell:{width}} " for cell, width in zip(row, column_widths)))


  and should_run_async(code)


 ID  Main Topic  Contribution (%)  Keywords                                Snippet                                                                           
 0   2           0.8268            window, server, program, file, applic
  ['xpert', 'cursor', 'keyboard', 'cursor', 'key', 'mous', 'avail', 'hint']         
 1   6           0.4741            mail, list, file, inform, send
         ['obtain', 'copi', 'open', 'look', 'widget', 'obtain', 'need', 'order']           
 2   7           0.4230            like, know, time, look, think
          ['right', 'signal', 'strong', 'live', 'west', 'philadelphia', 'perfect', 'sport'] 
 3   8           0.4159            game, team, play, year, player
         ['canadian', 'thing', 'coach', 'boston', 'bruin', 'colorado', 'rocki', 'summari'] 
 4   9           0.9039            peopl, think, like, time, right
        ['heck', 'feel', 'like', 'time', 'includ', 'cafeteria', 'work', 'half']           
 5   7           0.6291            like, know, time,

## Explore words and topics with pyLDAvis

In [None]:
!pip install pyLDAvis

  and should_run_async(code)




In [None]:
import pyLDAvis

# Newer pyLDAvis releases ship the gensim adapter as `pyLDAvis.gensim_models`;
# older ones call it `pyLDAvis.gensim`. Try the new name first so the cell
# works with whichever version the unpinned install picked up.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

  and should_run_async(code)


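Note: in newer versions of `pyLDAvis` the `pyLDAvis.gensim` module has been renamed to `pyLDAvis.gensim_models`; with those versions, use the following code instead: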

In [None]:
#import pyLDAvis.gensim_models
#pyLDAvis.enable_notebook()
#vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
#vis