{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate coherence for a pretrained model\n",
    "\n",
    "The following workflow describes how to use gensim to calculate coherence measures for an LDA model that has already identified topics."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract the top 10 terms per topic from the topic-term frequency table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from gensim.corpora.dictionary import Dictionary\n",
    "from gensim.models import CoherenceModel\n",
    "import re\n",
    "\n",
    "# Alternate topic table (swap the two read_csv lines to score that model instead):\n",
    "#TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/mallet_command_line/ct_tidy_topics.csv')\n",
    "TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/twitter/twitter_tidy_topics.csv')\n",
    "\n",
    "num_topics = 50      # topic ids 0 .. num_topics-1 are looked up in the 'topic' column\n",
    "num_top_terms = 10   # number of highest-count terms kept per topic\n",
    "\n",
    "topic_term_list = []  # one list of cleaned top terms per topic, in topic-id order\n",
    "top_terms = []\n",
    "for topic_id in range(num_topics):\n",
    "    # Rows for this topic, highest 'count' first.\n",
    "    topic_rows = TopicTermFreq[TopicTermFreq['topic'] == topic_id]\n",
    "    ranked_terms = topic_rows.sort_values('count', ascending = False)['term'].tolist()\n",
    "    # Strip every run of non-word characters, the same cleaning applied to the corpus tokens.\n",
    "    cleaned_terms = [re.sub(r'\\W+', '', raw_term) for raw_term in ranked_terms[0:num_top_terms]]\n",
    "    top_terms.extend(cleaned_terms)\n",
    "    topic_term_list.append(cleaned_terms)\n",
    "\n",
    "# Collapse to the unique vocabulary across all topics.\n",
    "top_terms = list(set(top_terms))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the raw text files and keep only the words in the top-term vocabulary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: the next cell resets `texts` and rebuilds it from the Twitter hashtag files,\n",
    "# so run only the loader that matches the topic table read above.\n",
    "\n",
    "# A set gives O(1) membership tests; `top_terms` is a list, which would be\n",
    "# scanned linearly for every token of every line.\n",
    "top_term_set = set(top_terms)\n",
    "\n",
    "texts = []\n",
    "counter = 0  # stays 0 if the file is empty\n",
    "with open('/Users/dankoban/Documents/CT_LDA/CT_data/mallet_input_data_crowdtangle.txt','r') as infile:\n",
    "    for counter, raw_line in enumerate(infile, start=1):\n",
    "        # Split on single spaces, strip non-word characters, then lowercase.\n",
    "        tokens = (re.sub(r'\\W+', '', term).lower() for term in raw_line.split(' '))\n",
    "        # Keep each top term at most once per document.\n",
    "        texts.append(list({token for token in tokens if token in top_term_set}))\n",
    "        if counter % 500000 == 0:\n",
    "            print(counter)  # coarse progress indicator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "500000\n",
      "1000000\n",
      "1500000\n",
      "2000000\n",
      "2500000\n",
      "3000000\n",
      "3500000\n",
      "4000000\n",
      "4500000\n",
      "5000000\n",
      "5500000\n",
      "6000000\n",
      "6500000\n",
      "7000000\n",
      "7500000\n",
      "8000000\n",
      "8500000\n",
      "9000000\n",
      "9500000\n",
      "10000000\n",
      "10500000\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "input_dir = '/Users/dankoban/Documents/EM6575/twitter/hashtag model/hashtags'\n",
    "\n",
    "# Collect the .txt files in the input directory. os.listdir returns names in\n",
    "# arbitrary order, so sort them to make the document order reproducible.\n",
    "files = sorted(file for file in os.listdir(input_dir) if file.endswith(\".txt\"))\n",
    "file_paths = [os.path.join(input_dir, file) for file in files]\n",
    "\n",
    "# A set gives O(1) membership tests; `top_terms` is a list, which would be\n",
    "# scanned linearly for every token of every line.\n",
    "top_term_set = set(top_terms)\n",
    "\n",
    "texts = []    # one list of distinct top terms per input line\n",
    "counter = 0   # running line count across all files\n",
    "for file in file_paths:\n",
    "    with open(file,'r') as infile:\n",
    "        for raw_line in infile:\n",
    "            # Split on single spaces, strip non-word characters, then lowercase.\n",
    "            tokens = (re.sub(r'\\W+', '', term).lower() for term in raw_line.split(' '))\n",
    "            # Keep each top term at most once per document.\n",
    "            texts.append(list({token for token in tokens if token in top_term_set}))\n",
    "            counter += 1\n",
    "            if counter % 500000 == 0:\n",
    "                print(counter)  # coarse progress indicator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10951065"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: number of documents read (one entry per input line).\n",
    "len(texts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transform the filtered texts into a gensim dictionary and bag-of-words corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the gensim Dictionary from the filtered documents.\n",
    "dictionary = Dictionary(texts)\n",
    "\n",
    "# Encode every document as a bag of words using that dictionary.\n",
    "corpus = list(map(dictionary.doc2bow, texts))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate per-topic coherence (UMass)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[-4.313408244829818,\n",
       " -6.3413129834533075,\n",
       " -3.905509802967587,\n",
       " -3.2721747805157393,\n",
       " -7.625408311947956,\n",
       " -1.7196023225550179,\n",
       " -4.203992873774845,\n",
       " -4.125434740376362,\n",
       " -3.3245267356686656,\n",
       " -4.243934168684841,\n",
       " -2.885251247261333,\n",
       " -3.274337705415125,\n",
       " -4.09288216257212,\n",
       " -4.924394662078107,\n",
       " -4.149309356654412,\n",
       " -4.484200299110569,\n",
       " -4.2947085374780825,\n",
       " -3.9431711613137117,\n",
       " -3.8064866510481634,\n",
       " -3.3631922605649422,\n",
       " -4.021276276302869,\n",
       " -4.8187575864249625,\n",
       " -3.840477926991953,\n",
       " -4.195053778502089,\n",
       " -3.2573543587006757,\n",
       " -2.948624393117324,\n",
       " -4.613766805713099,\n",
       " -4.522909421247455,\n",
       " -3.3593983229564035,\n",
       " -2.7540591437209723,\n",
       " -3.998741573269264,\n",
       " -3.5019332822601377,\n",
       " -2.8636397550329984,\n",
       " -3.209242248943138,\n",
       " -3.4505860256680485,\n",
       " -2.9636872767448135,\n",
       " -4.671770031055652,\n",
       " -8.15728667880787,\n",
       " -5.170307773349419,\n",
       " -3.7014139374486565,\n",
       " -3.6600284579976434,\n",
       " -3.531379750780082,\n",
       " -6.942124449900354,\n",
       " -3.3191671147482635,\n",
       " -3.926852284714465,\n",
       " -3.881024734399981,\n",
       " -2.9943846741583147,\n",
       " -3.8610574376382782,\n",
       " -4.207393297610942,\n",
       " -4.376751165760487]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from gensim.models import CoherenceModel\n",
    "\n",
    "# Score each topic's top terms with the UMass coherence measure.\n",
    "cm = CoherenceModel(\n",
    "    topics=topic_term_list,\n",
    "    corpus=corpus,\n",
    "    dictionary=dictionary,\n",
    "    coherence='u_mass',\n",
    ")\n",
    "\n",
    "# One score per entry of topic_term_list.\n",
    "coherence_scores = cm.get_coherence_per_topic()\n",
    "coherence_scores"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}