{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculate coherence for a pretrained model\n",
    "\n",
    "The following workflow describes how to use gensim to calculate coherence measures for an LDA model that has already identified topics. The topics are read from a tidy topic/term/count table, and the corpus is rebuilt from the raw text files.\n",
    "\n",
    "Pick the dataset by setting `DATASET` in the first code cell."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract the top terms of each topic from the term frequency table\n",
    "\n",
    "For each topic, the `num_top_terms` terms with the highest `count` are kept and stripped of non-word characters. Their union is the vocabulary used to filter the raw text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "from gensim.corpora.dictionary import Dictionary\n",
    "from gensim.models import CoherenceModel\n",
    "\n",
    "# --- Configuration ---\n",
    "# Each dataset pairs the tidy topic/term/count CSV with the raw text to scan.\n",
    "# 'text_source' is either a single text file or a directory of .txt files.\n",
    "DATASETS = {\n",
    "    'twitter': {\n",
    "        'topic_terms_csv': '/Users/dankoban/Documents/EM6575/twitter/twitter_tidy_topics.csv',\n",
    "        'text_source': '/Users/dankoban/Documents/EM6575/twitter/hashtag model/hashtags',\n",
    "    },\n",
    "    # NOTE(review): this CSV/text pairing is assumed from the file names - confirm.\n",
    "    'crowdtangle': {\n",
    "        'topic_terms_csv': '/Users/dankoban/Documents/EM6575/mallet_command_line/ct_tidy_topics.csv',\n",
    "        'text_source': '/Users/dankoban/Documents/CT_LDA/CT_data/mallet_input_data_crowdtangle.txt',\n",
    "    },\n",
    "}\n",
    "DATASET = 'twitter'\n",
    "\n",
    "num_topics = 50\n",
    "num_top_terms = 10\n",
    "\n",
    "\n",
    "def clean_term(term):\n",
    "    \"\"\"Remove every run of non-word characters from `term`.\"\"\"\n",
    "    return re.sub(r'\\W+', '', term)\n",
    "\n",
    "\n",
    "TopicTermFreq = pd.read_csv(DATASETS[DATASET]['topic_terms_csv'])\n",
    "\n",
    "# One list of cleaned top terms per topic; topics are looked up by the ids\n",
    "# 0 .. num_topics-1 in the 'topic' column.\n",
    "topic_term_list = []\n",
    "for k in range(num_topics):\n",
    "    topic_rows = TopicTermFreq[TopicTermFreq['topic'] == k]\n",
    "    ranked_terms = topic_rows.sort_values('count', ascending=False)['term'].tolist()\n",
    "    topic_term_list.append([clean_term(term) for term in ranked_terms[:num_top_terms]])\n",
    "\n",
    "# Vocabulary = union of all topics' top terms. The set gives constant-time\n",
    "# membership tests while filtering the corpus; `top_terms` stays a list.\n",
    "top_term_set = {term for topic in topic_term_list for term in topic}\n",
    "top_terms = list(top_term_set)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the raw text files and parse to retain top term vocab only\n",
    "\n",
    "Each input line is treated as one document. Tokens are split on spaces, stripped of non-word characters and lower-cased; only the distinct top terms found in the line are kept."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "500000\n",
      "1000000\n",
      "1500000\n",
      "2000000\n",
      "2500000\n",
      "3000000\n",
      "3500000\n",
      "4000000\n",
      "4500000\n",
      "5000000\n",
      "5500000\n",
      "6000000\n",
      "6500000\n",
      "7000000\n",
      "7500000\n",
      "8000000\n",
      "8500000\n",
      "9000000\n",
      "9500000\n",
      "10000000\n",
      "10500000\n"
     ]
    }
   ],
   "source": [
    "def list_text_files(source):\n",
    "    \"\"\"Return the text files to read for `source`.\n",
    "\n",
    "    A directory yields its .txt files in sorted order (os.listdir itself\n",
    "    returns entries in arbitrary order); any other value is treated as the\n",
    "    path of a single text file.\n",
    "    \"\"\"\n",
    "    if os.path.isdir(source):\n",
    "        names = sorted(name for name in os.listdir(source) if name.endswith('.txt'))\n",
    "        return [os.path.join(source, name) for name in names]\n",
    "    return [source]\n",
    "\n",
    "\n",
    "def load_texts(paths, vocab, progress_every=500000):\n",
    "    \"\"\"Turn every line of every file into the list of distinct vocab terms it contains.\n",
    "\n",
    "    Lines without any vocab term still produce an (empty) document, so the\n",
    "    result has exactly one entry per line read.\n",
    "\n",
    "    Args:\n",
    "        paths: text files to read, in order.\n",
    "        vocab: iterable of terms to keep.\n",
    "        progress_every: print the running document count at this interval.\n",
    "\n",
    "    Returns:\n",
    "        A list of token lists, one per input line.\n",
    "    \"\"\"\n",
    "    vocab = set(vocab)  # constant-time lookups regardless of what the caller passed\n",
    "    docs = []\n",
    "    for path in paths:\n",
    "        with open(path, 'r') as infile:\n",
    "            for line in infile:\n",
    "                tokens = (clean_term(token).lower() for token in line.split(' '))\n",
    "                docs.append(list({token for token in tokens if token in vocab}))\n",
    "                if len(docs) % progress_every == 0:\n",
    "                    print(len(docs))\n",
    "    return docs\n",
    "\n",
    "\n",
    "file_paths = list_text_files(DATASETS[DATASET]['text_source'])\n",
    "texts = load_texts(file_paths, top_term_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10951065"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(texts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transform the raw text into bag of words dictionary and corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "dictionary = Dictionary(texts)\n",
    "corpus = [dictionary.doc2bow(text) for text in texts]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate topic coherence\n",
    "\n",
    "`u_mass` coherence is computed from the bag-of-words corpus; the result holds one score per topic, in the same order as `topic_term_list`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[-4.313408244829818,\n",
       " -6.3413129834533075,\n",
       " -3.905509802967587,\n",
       " -3.2721747805157393,\n",
       " -7.625408311947956,\n",
       " -1.7196023225550179,\n",
       " -4.203992873774845,\n",
       " -4.125434740376362,\n",
       " -3.3245267356686656,\n",
       " -4.243934168684841,\n",
       " -2.885251247261333,\n",
       " -3.274337705415125,\n",
       " -4.09288216257212,\n",
       " -4.924394662078107,\n",
       " -4.149309356654412,\n",
       " -4.484200299110569,\n",
       " -4.2947085374780825,\n",
       " -3.9431711613137117,\n",
       " -3.8064866510481634,\n",
       " -3.3631922605649422,\n",
       " -4.021276276302869,\n",
       " -4.8187575864249625,\n",
       " -3.840477926991953,\n",
       " -4.195053778502089,\n",
       " -3.2573543587006757,\n",
       " -2.948624393117324,\n",
       " -4.613766805713099,\n",
       " -4.522909421247455,\n",
       " -3.3593983229564035,\n",
       " -2.7540591437209723,\n",
       " -3.998741573269264,\n",
       " -3.5019332822601377,\n",
       " -2.8636397550329984,\n",
       " -3.209242248943138,\n",
       " -3.4505860256680485,\n",
       " -2.9636872767448135,\n",
       " -4.671770031055652,\n",
       " -8.15728667880787,\n",
       " -5.170307773349419,\n",
       " -3.7014139374486565,\n",
       " -3.6600284579976434,\n",
       " -3.531379750780082,\n",
       " -6.942124449900354,\n",
       " -3.3191671147482635,\n",
       " -3.926852284714465,\n",
       " -3.881024734399981,\n",
       " -2.9943846741583147,\n",
       " -3.8610574376382782,\n",
       " -4.207393297610942,\n",
       " -4.376751165760487]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cm = CoherenceModel(topics=topic_term_list, corpus=corpus, dictionary=dictionary, coherence='u_mass')\n",
    "coherence_scores = cm.get_coherence_per_topic()\n",
    "coherence_scores"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}