# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction
import numpy as np
import json
import glob

# Gensim: dictionary/corpus handling and the LDA model itself
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spaCy for lemmatization; NLTK is used only for its English stop-word list.
# The corpus module is imported under an alias so the list assignment below
# does not shadow it (the original rebound the name `stopwords` from module
# to list, making `stopwords.words(...)` fail if run a second time).
import spacy
from nltk.corpus import stopwords as nltk_stopwords

# vis
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


def load_data(file):
    """Read a UTF-8 JSON file and return the parsed object."""
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data


def write_data(file, data):
    """Serialize `data` to `file` as indented UTF-8 JSON."""
    with open(file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)


# Keep the name `stopwords` for the list itself, as later cells expect;
# the module remains reachable as `nltk_stopwords`.
# NOTE(review): this list is loaded and printed but never applied as a
# filter anywhere in the notebook — confirm whether filtering was intended.
stopwords = nltk_stopwords.words("english")

print(stopwords)
# Load the testimony texts; the JSON file has a top-level "texts" list.
data = load_data("data/ushmm_dn.json")["texts"]
print(data[0][0:90])


def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Reduce each document to a space-joined string of lemmas whose
    part-of-speech tag is in `allowed_postags`.

    The default is a tuple rather than a list to avoid a mutable default
    argument. parser/ner are disabled since only tagging/lemmas are needed.
    """
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    # nlp.pipe streams the documents through the pipeline in batches,
    # which is much faster than calling nlp() once per text.
    for doc in nlp.pipe(texts):
        texts_out.append(
            " ".join(token.lemma_ for token in doc
                     if token.pos_ in allowed_postags)
        )
    return texts_out


lemmatized_texts = lemmatization(data)
print(lemmatized_texts[0][0:90])


def gen_words(texts):
    """Tokenize each lemmatized string into a list of lowercase tokens;
    deacc=True strips accents/punctuation via gensim's simple_preprocess."""
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]


data_words = gen_words(lemmatized_texts)
print(data_words[0][0:20])

# Map each unique token to an integer id, then convert every document to a
# bag-of-words representation: a list of (token_id, count) pairs.
id2word = corpora.Dictionary(data_words)
corpus = [id2word.doc2bow(text) for text in data_words]

print(corpus[0][0:20])

# FIX: the original `id2word[[0][:1][0]]` is an obfuscated constant —
# `[0][:1][0]` always evaluates to 0 — so it simply looks up token id 0.
word = id2word[0]
print(word)

# Fit LDA: 30 topics, 10 passes over the corpus, fixed random_state for
# reproducibility; alpha="auto" learns an asymmetric document-topic prior.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=30,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha="auto")
# Render an interactive pyLDAvis panel for the fitted model inside the
# notebook (topic bubbles on the left, term bars on the right).
# NOTE(review): on pyLDAvis >= 3.x this submodule was renamed to
# pyLDAvis.gensim_models — confirm against the installed version.
# mds="mmds" selects metric MDS for the 2-D topic layout; R=30 shows the
# top 30 terms per topic.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis