{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Topic Model Diagnostics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Word Counts by Topic / Parse into Tidy Structure" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import re\n", "\n", "def tidy_word_count(data, row):\n", " df = pd.DataFrame({'term': [], 'topic_count':[]})\n", " try:\n", " topic_count = data[row][2:len(data[row])] \n", " df = pd.DataFrame({'topic_count': topic_count}) \n", " df['term'] = data[row][1] \n", " except:\n", " pass\n", " return df\n", "\n", "data = []\n", "#with open('/Users/dankoban/Documents/EM6575/mallet_command_line/ct.wordtopiccounts','r') as infile:\n", "with open('/Users/dankoban/Documents/EM6575/twitter/hashtag model/hashtags/hashtags.wordtopiccounts','r') as infile:\n", " for line in infile: \n", " line = line.split(' ') \n", " data.append(line) \n", "\n", "counter = 0\n", "tidy_dfs = []\n", "for i in range(0, len(data)):\n", " tidy_dfs.append(tidy_word_count(data = data, row = i))\n", " counter += 1\n", " if counter %10000 == 0:\n", " print(str(counter) + ' out of ' + str(len(data)))\n", " \n", "df = pd.concat(tidy_dfs)\n", "\n", "df['topic_count'] = df['topic_count'].apply(lambda x: re.sub(r'\\n', '', x)) \n", "df['topic'] = df['topic_count'].apply(lambda x: x.split(\":\")[0])\n", "df['count'] = df['topic_count'].apply(lambda x: x.split(\":\")[1])\n", "df = df[['term', 'topic', 'count']] \n", "df.reset_index(inplace = True, drop = True)\n", "#df.to_csv('/Users/dankoban/Documents/EM6575/mallet_command_line/ct_tidy_topics.csv', index=False) \n", "df.to_csv('/Users/dankoban/Documents/EM6575/twitter/twitter_tidy_topics.csv', index=False) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Topics by Documents / Parse into Tidy Structure" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "data = []\n", "#with open('/Users/dankoban/Documents/EM6575/mallet_command_line/ct.doctopics_sparse','r') as infile:\n", "with open('/Users/dankoban/Documents/EM6575/twitter/hashtag model/hashtags.doctopics_sparse','r') as infile:\n", " for line in infile: \n", " line = line.rstrip().split('\\t')\n", " data.append(line)\n", "\n", "df = pd.DataFrame(data[1:], columns = ['doc_id', 'name', 'topic_1', 'proportion_1', 'topic_2', 'proportion_2', 'topic_3', 'proportion_3'])\n", "df['topic_1'] = df['topic_1'].astype('float')\n", "df['topic_2'] = df['topic_2'].astype('float')\n", "df['topic_3'] = df['topic_3'].astype('float')\n", "df['proportion_1'] = df['proportion_1'].astype('float')\n", "df['proportion_2'] = df['proportion_2'].astype('float')\n", "df['proportion_3'] = df['proportion_3'].astype('float')\n", "\n", "rank1_docs = df[['doc_id', 'name', 'topic_1', 'proportion_1']]\n", "rank1_docs.columns = ['doc_id', 'name', 'topic', 'proportion']\n", "rank1_docs = rank1_docs.assign(rank = 1)\n", "\n", "rank2_docs = df[['doc_id', 'name', 'topic_2', 'proportion_2']]\n", "rank2_docs.columns = ['doc_id', 'name', 'topic', 'proportion']\n", "rank2_docs = rank2_docs.assign(rank = 2)\n", "\n", "rank3_docs = df[['doc_id', 'name', 'topic_3', 'proportion_3']]\n", "rank3_docs.columns = ['doc_id', 'name', 'topic', 'proportion']\n", "rank3_docs = rank3_docs.assign(rank = 3)\n", "\n", "df = pd.concat([rank1_docs, rank2_docs, rank3_docs])\n", "df = df[df['proportion'].notnull()]\n", "#df = df[df['proportion'].isna() == False]\n", 
"#df.to_csv('/Users/dankoban/Documents/EM6575/mallet_command_line/tidy_docs2topics.csv', index=False) \n", "df.to_csv('/Users/dankoban/Documents/EM6575/twitter/tidy_docs2topics.csv', index=False) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load data from step 1 and 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load the parsed data to save processing time." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>term</th>\n", " <th>topic</th>\n", " <th>count</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>bud</td>\n", " <td>34</td>\n", " <td>179</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>bud</td>\n", " <td>49</td>\n", " <td>133</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>bud</td>\n", " <td>31</td>\n", " <td>117</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>bud</td>\n", " <td>29</td>\n", " <td>106</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>bud</td>\n", " <td>39</td>\n", " <td>76</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " term topic count\n", "0 bud 34 179\n", "1 bud 49 133\n", "2 bud 31 117\n", "3 bud 29 106\n", "4 bud 39 76" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import seaborn as sns\n", "import pandas as pd\n", "import math\n", "import numpy as np\n", "\n", "num_topics = 50\n", "num_top_terms = 10\n", "\n", "#TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/mallet_command_line/tidy_topics.csv')\n", "#TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/mallet_command_line/ct_tidy_topics.csv')\n", "TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/twitter/twitter_tidy_topics.csv')\n", "TopicTermFreq.head()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "699836" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#docs2topic = pd.read_csv('/Users/dankoban/Documents/EM6575/mallet_command_line/tidy_docs2topics.csv')\n", "docs2topic = pd.read_csv('/Users/dankoban/Documents/EM6575/twitter/tidy_docs2topics.csv')\n", "len(docs2topic['doc_id'].unique())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokens per Topic\n", "\n", "These descriptions are designed to make sense without mathematical definitions, but we include them for specificity. We'll use the symbols $w$ for a word, $k$ for a topic, and $d$ for a document. Several metrics count tokens. For these we define variables $N_{d,w,k}$ as the number of times word $w$ occurs in document $d$ assigned to topic $k$, $N_{d,k}$ as $\\sum_wN_{d,w,k}$. Other metrics involve document frequencies. $D(w)$ is the number of documents that contain at least one token of type $w$, and $D(w_1,w_2)$ is the number of documents that contain at least one $w_1$ and one $w_2$. Finally, we define $W_k$ to be the $N$ most probable words in topic $k$. 
$\mathbb{1}(p)$ is equal to 1 if predicate $p$ is true, and 0 otherwise.\n", "\n", "**Tokens.** This metric measures the number of word tokens currently assigned to the topic $N_k=\sum_dN_{d,k}$. Comparing this number to the sum of the token counts for all topics will give you the proportion of the corpus assigned to the topic. If you are using optimized hyperparameters this number will vary considerably across topics; otherwise it will be roughly the same for all topics. We usually find topics most interesting if they are not on the extremes of this range. Small numbers (relative to other topics) are often a sign that a topic may not be reliable: we don't have enough observations to get a good sense of the topic's word distribution. Large numbers may also be bad: extremely frequent topics are often \"not-quite-stopwords\". For example, in the sample model the largest topic is \"time number term part system form\"." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>topic</th>\n", " <th>token_count</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>1322510</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>749839</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2</td>\n", " <td>1972407</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3</td>\n", " <td>10062309</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4</td>\n", " <td>1522195</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " topic token_count\n", "0 0 1322510\n", "1 1 749839\n", "2 2 1972407\n", "3 3 10062309\n", "4 4 1522195" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokens_by_topic = (TopicTermFreq.groupby('topic').\n", " sum().\n", " reset_index().\n", " rename(columns = {'count': 'token_count'}))\n", "tokens_by_topic.head() " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<matplotlib.axes._subplots.AxesSubplot at 0x7fa9904b2640>" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXwAAAESCAYAAAD+GW7gAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3deZxcdZnv8c/T+76lO1tn6c5OyAohQEQBBUUHQQUVcIERLzLq4Myduffq9Q6j3DuvEedex1FGHQTGqCMu6CgqIEEIewJJSEL2hHT2pTvp9L53P/ePOolF052u7lR1VXV9369XvfrUOb865+mTytOnfudXz8/cHRERGfvS4h2AiIiMDiV8EZEUoYQvIpIilPBFRFKEEr6ISIpQwhcRSRFxTfhm9pCZ1ZrZlijs60oz2xj26DCzD0QjThGRscDiOQ7fzN4BtAA/dPcFUdxvGbAHmOLubdHar4hIMovrFb67PwfUh68zs5lm9oSZrTez581s3gh2fSPwuJK9iMifJGIf/v3AX7r7hcDfAt8ZwT5uAh6OalQiIkkuI94BhDOzAmAF8AszO706O9j2IeCeAV522N3fE7aPScBC4A+xjVZEJLkkVMIn9Imjwd2X9N/g7r8CfhXBPj4C/Ke7d0c7OBGRZJZQXTru3gTUmNmHASxk8TB3czPqzhEReYt4D8t8GHgZmGtmh8zsduBjwO1mtgnYClw/jP1VAVOBZ6MfrYhIcovrsEwRERk9CdWlIyIisRO3m7bl5eVeVVUVr8OLiCSl9evXn3D3ipG8Nm4Jv6qqinXr1sXr8CIiScnM9o/0terSERFJEUr4IiIpQglfRCRFKOGLiKQIJXwRkRShhC8ikiKU8EVEUoQSvohIilDCFxFJEYlWD1/C/GTtgajt65aLp0VtXyKSnIa8wjezHDN7xcw2mdlWM/vqAG1uM7M6M9sYPD4dm3BFRGSkIrnC7wTe6e4tZpYJvGBmj7v7mn7tfubun49+iCIiEg1DJnwPFcxvCZ5mBg8V0RcRSTIR3bQ1s3Qz2wjUAqvcfe0AzW4ws81m9oiZTR1kP3eY2TozW1dXV3cOYYuIyHBFlPDdvTeYWHwKsNzMFvRr8lugyt0XAauAlYPs5353X+buyyoqRlTOWURERmhYwzLdvQF4Brim3/qT7t4ZPH0AuDA64YmISLREMkqnwsxKguVc4GpgR782k8KeXgdsj2aQIiJy7iIZpTMJWGlm6YT+QPzc3X9nZvcA69z9UeAuM7sO6AHqgdtiFbCIiIxMJKN0NgNLB1h/d9jyl4AvRTc0ERGJJpVWEBFJEUr4IiIpQglfRCRFKOGLiKQIJXwRkRShhC8ikiKU8EVEUoQSvohIilDCFxFJEUr4IiIpQglfRCRFKOGLiKQIJXwRkRShhC8ikiKU8EVEUoQSvohIilDCFxFJEUr4IiIpQglfRCRFKOGLiKSIIRO+meWY2StmtsnMtprZVwdok21mPzOzPWa21syqYhGsiIiMXCRX+J3AO919MbAEuMbMLunX5nbglLvPAv4ZuDe6YYqIyLkaMuF7SEvwNDN4eL9m1wMrg+VHgHeZmUUtShEROWcR9eGbWbqZbQRqgVXuvrZfk0rgIIC79wCNwLgB9nOHma0zs3V1dXXnFrmIiAxLRAnf3XvdfQkwBVhuZgtGcjB3v9/dl7n7soqKipHsQkRERmhYo3TcvQF4Brim36bDwFQAM8sAioGT0QhQRESiI5JROhVmVhIs5wJXAzv6NXsUuDVYvhF42t379/OLiEgcZUTQZhKw0szSCf2B+Lm7/87M7gHWufujwIPAj8xsD1AP3BSziEVEZESGTPjuvhlYOsD6u8OWO4APRzc0ERGJJn3TVkQkRSjhi4ikCCV8EZEUoYQvIpIilPBFRFKEEr6ISIpQwhcRSRFK+CIiKUIJX0QkRSjhi4ikCCV8EZEUoYQvIpIilPBFRFKEEr6ISIpQwhcRSRFK+CIiKUIJX0QkRSjhi4ikCCV8EZEUoYQvIpIihkz4ZjbVzJ4xs21mttXMvjBAmyvMrNHMNgaPuwfal4iIxE9GBG16gL9x9w1mVgisN7NV7r6tX7vn3f3a6IcoIiLRMOQVvrsfdfcNwXIzsB2ojHVgIiISXcPqwzezKmApsHaAzZea2SYze9zMzh/k9XeY2TozW1dXVzfsYEVEZOQiTvhmVgD8Evgrd2/qt3kDMN3dFwPfBn490D7c/X53X+buyyoqKkYas4iIjEBECd/MMgkl+/9w91/13+7uTe7eEiw/BmSaWXlUIxURkXMSySgdAx4Etrv7NwZpMzFoh5ktD/Z7MpqBiojIuYlklM7bgE8Ar5vZxmDd/wSmAbj794Abgb8wsx6gHbjJ3T0G8YqIyAgNmfDd/QXAhmhzH3BftIISEZHo0zdtRURShBK+iEiKUMIXEUkRSvgiIilCCT/FuDvtXb3xDkNE4iCSYZmS5Hr7nCe3HuOZnXU8t6uOI43tLJlawtXzJ/Du+ROZNb4g3iGKyChQwh/jOrp7+dGa/dScaCU/K523zSrnuiWTeXHPCb7+xE6+/sRO/ubqOXz+nbMIvjsnImOUEv4Y1tTRzcqX9nG8qYN7b1jIB5dOISvjT714RxvbuffxHfy/Vbt4o66Fr92wiJzM9DhGLCKxpIQ/RtW3dvHgC3tp7ezlk5dW0dsHj6w/9JZ2F1WV0drVy683HmHDgQZuvbSK3KyzJ/1bLp4Wq7BFJIZ003YM6u1zfvLKfjq6+7j9smrmTCgctK2ZceXc8dy8fBqHTrXxyw2HUFUMkbFJCX8MemF3HUcaOvjA0kqmluVF9JqFlcW8d8Ekth1t4vndJ2IcoYjEgxL+GFPb3MEfd9Ry/uQiFlYWD+u1K2aOY0FlMU9uO0bNidYYRSgi8aKEP4b0ufOrDYfJTE/jusWTh/16M+NDSyspy8/ip68coLmjOwZRiki8KOGPIWtr6jlQ38afLZpEYU7miPaRk5nOLRdPp627l8e3HItyhCIST0r4Y0R3bx+rd9RSXZ7P0qkl57SviUU5vH12ORsPNqhrR2QMUcIfIzYcOEVzZw/vnDc+Kl+gumLOeEpyM/ntpiP09mnUjshYoIQ/BvT2Oc/tqmNKaS4zyvOjss+sjDTet3ASx5o6WFuj2SpFxgIl/DHg9cONnGrr5oo5FVEtj3D+5CJmjS/gqe3HdQNXZAxQwk9y7qGr+/GF2cybVBTVfZsZ7180ma6ePp7ZWRfVfYvI6FPCT3I7jzVzrKmDd8ypIC0Gxc8qCrO5cHopr+6rp6GtK+r7F5HRM2TCN7OpZvaMmW0zs61m9oUB2piZfcvM9pjZZjO7IDbhSn8v7DlBSW4mi6ec28ics7li7nhweHaXrvJFklkkV/g9wN+4+3zgEuBzZja/X5v3ArODxx3Ad6MapQyovrWLvSdauai6jPS02JU2Ls3LYllVKev2neKUrvJFktaQCd/dj7r7hmC5GdgOVPZrdj3wQw9ZA5SY2aSoRytvsn7/KQzOedx9JK6YOx4MVu+sjfmxRC
Q2htWHb2ZVwFJgbb9NlcDBsOeHeOsfBczsDjNbZ2br6urUPXAu+tx57cApZo0voCQvK+bHK87NZHlVGev3n+LAybaYH09Eoi/ihG9mBcAvgb9y96aRHMzd73f3Ze6+rKKiYiS7kMDeulYa2ru5cHrpqB3z8uDG8Hef3TNqxxSR6Iko4ZtZJqFk/x/u/qsBmhwGpoY9nxKskxhZt7+enMw0zovyUMyzKcrN5IJppfxyw2HqmjtH7bgiEh2RjNIx4EFgu7t/Y5BmjwKfDEbrXAI0uvvRKMYpYdq7etl2pIklU0vITB/dkbWXzS6nu7ePlS/tG9Xjisi5iyRbvA34BPBOM9sYPN5nZnea2Z1Bm8eAvcAe4PvAZ2MTrgBsPtxAT59z4bSyUT92eUE275k/kR++vI/Wzp5RP76IjNyQc9q6+wvAWcf8eWhOvM9FKyg5uw37TzGxKIfJJTlxOf4dl8/gia3H+OmrB7n9suq4xCAiw6dv2iaZhrYuDp5qZ9GU4qjWzRmOC6aVsryqjIdeqKG7ty8uMYjI8CnhJ5mtR0IDpBZMHt70hdH2mctncLihnd9v1q0akWShhJ9kthxpZGJRDuWF2XGN48q545lZkc+DL9QQ6tETkUSnhJ9Emtq7OXCyjQXDnJw8FtLSjNveVs3rhxvZcOBUvMMRkQgo4SeRrUcacWBB5eiNvT+bGy6opDAng4de3BfvUEQkAkr4SWTLkSbGF2YzvjA+o3P6y8vK4KaLpvLElmMcaWiPdzgiMgQl/CTR3NHNvhOtCdGdE+6Tl1bh7vx4zf54hyIiQ1DCTxLbjjYF3TmJlfCnluVx9fwJPPzKATq6e+MdjoichRJ+kthyuJHygmwmxHl0zkBuW1HNqbZufv2ayieJJDIl/CTQ3tVLzYlWzp9cFLcvW53NJTPKmDexkJUv79cQTZEEpoSfBHbXNtPnMG9iYbxDGZCZ8clLq9h+tIn1+zVEUyRRKeEngR3HmsnLSmdqWV68QxnUB5ZOpjAng5Uv6+atSKIasniaxFdvn7PzWDPzJhaSliDdOT9Ze2DA9Ysqi3ls81EWTC6iMCczon3dcvG0aIYmImehK/wEd6C+jfbuXuaN4kQnI3XxjHH0uvPqvvp4hyIiA1DCT3A7jzWRbsbs8QXxDmVI5QXZzB5fwCs19fT26eatSKJRwk9w2481U12eT05merxDicglM8bR1NHDtqMjmvZYRGJICT+BnWzppK65k3mTEnN0zkDmTiykJC+TNXtPxjsUEelHCT+B7TjWDMC8iYnff39amhkXV5VRc6KV400d8Q5HRMIo4SewnceaGV+YTVl+VrxDGZYLq8pITzPW1ujmrUgiUcJPUC2dPdScaGVugn7Z6mwKsjNYWFnMawdO0an6OiIJY8iEb2YPmVmtmW0ZZPsVZtZoZhuDx93RDzP1vLjnBL3uzJ2QfAkf4JLqMjp7+th4qCHeoYhIIJIr/B8A1wzR5nl3XxI87jn3sGT1zjqyM9KYPi4/3qGMyNSyPCYV57B2b73q64gkiCETvrs/B6gzdhS5O8/urGVmRQHpaYnx7drhMjMumTGOY00d7DvZFu9wRITo9eFfamabzOxxMzt/sEZmdoeZrTOzdXV1dVE69Nizu7aFI40dSdudc9riKSXkZKZpiKZIgohGwt8ATHf3xcC3gV8P1tDd73f3Ze6+rKKiIgqHHptW76wFYE4S3rANl5WRxgXTStl2pInmju54hyOS8s454bt7k7u3BMuPAZlmVn7OkaWw1TvrmDuhkOLcyAqQJbJLqlVfRyRRnHPCN7OJFszKYWbLg33qM/wItXT28Oq+eq6YOzY+AZUXqr6OSKKIZFjmw8DLwFwzO2Rmt5vZnWZ2Z9DkRmCLmW0CvgXc5BqWMWIv7TlBd69z+RhJ+PCn+jrbVV9HJK6GrIfv7jcPsf0+4L6oRZTiVu+qIz8rnWXTy9h3YmyMbgmvr5Nok7CLpBJ90zaBhIZj1nHZ7HKyMsbOP02aGRdXj2Ov6uuIxNXYySpjwJ7aFg43tHP5nPHxDiXqlk0vJSPNNERTJI6U8BPIs7tC300YS/33p+VnZ7BoSjGvHWygQ/V1ROJCCT+BrN5Zx+zxBVSW5MY7lJi4dEY5XT19rN9/Kt6hiKQkJfwE0dbVwys1Y2c45kAqS3OZVpbHy3tP0qeBXCKjTgk/QazZe5Ku3r4x2X8fbsXMcdS3drHreHO8QxFJOUr4CWL1zjpyM9O5qLo03qHE1PmTiynKyeDlN3TzVmS0KeEniGd31bFi5jiyM5JjsvKRSk8zllePY3dtC7XNGqIpMpqU8BNAzYlW9p9sG5OjcwayvDo0BaKGaIqMLiX8BPBsUB3zijHef39aQXYGi6cUs2F/A41tqqIpMlqU8BPAs7vqqC7PZ9q4vHiHMmpWzCynq7ePn756IN6hiKQMJfw46+ju5eW9J7l8Tmp055w2uSSXGRX5/OClfXT39sU7HJGUoIQfZ2v2nqSjuy9l+u/DXTaznKONHTz2+tF4hyKSEpTw42z1zjpyMtO4dMa4eIcy6uZMLGRGeT4PvlCjic5FRoESfhy5O0/vqGXFzHJyMsf2cMyBpJnxqcuq2XyokXUqtyASc0r4cbT3RCsH6tu4cl5qjM4ZyA0XTKEkL5MHnt8b71BExjwl/Dh6Zsfp4Zip139/Wm5WOh+/eDpPbjvO3rqWeIcjMqYp4cfR6eqYU8tSZzjmQG5dUUVmehrf11W+SEwp4cdJS2cPa2tO8s4U7s45raIwmw9fOIVfrj9MrWbEEokZJfw4eTGYrPyKuUr4AHe8YwY9fX08+GJNvEMRGbOGTPhm9pCZ1ZrZlkG2m5l9y8z2mNlmM7sg+mGOPc/sqKUwO4NlVWO7Omakpo/L530LJ/GTNQdo6lC5BZFYiOQK/wfANWfZ/l5gdvC4A/juuYc1trk7z+ys5e1zyslM14es0+68fCbNnT38eM3+eIciMiYNmW3c/Tmg/ixNrgd+6CFrgBIzmxStAMeirUeaON7Uqe6cfhZUFvP22eU89MI+zXsrEgPRuLysBA6GPT8UrHsLM7vDzNaZ2bq6urooHDo5rdp2HDN0w3YAn7tyFidaOnn4FRVVE4m2Ue1PcPf73X2Zuy+rqEjdseerth3nwmmllBdkxzuUhHPJjHEsry7je8++oat8kSiLRsI/DEwNez4lWCcDOHSqjW1Hm7h6/oR4h5Kw/uqq2Rxv6uRnrx4curGIRCwaCf9R4JPBaJ1LgEZ3V/nDQTy17TiAEv5ZXDpjHMuryvju6jfo7NFVvki0RDIs82HgZWCumR0ys9vN7E4zuzNo8hiwF9gDfB/4bMyiHQNWbT/OzIp8ZlQUxDuUhGVmfOGq2Rxr6uDnusoXiZqMoRq4+81DbHfgc1GLaAxrbO9m7d56Pv32GfEOJeGtmDmOZdNL+c7qN/jIRVPH/OTuIqNBg8BH0eqdtfT0ubpzImBm/PXVczja2MF/rNGIHZFoUMIfRU9uO055QTZLp5bEO5Sk8LZZ5bxt1jjue2YPzfr2rcg5U8IfJZ09vTy7s
46rzhtPWprFO5yk8T+umUd9axfff06VNEXOlRL+KHnpjZO0dPaoO2eYFk0p4c8WTeKBF2qoa+6MdzgiSU0Jf5T8btNRCnMyuGx2ebxDSTp/++65dPX08e2nd8c7FJGkpoQ/Cjp7enly2zHePX+iRpuMQHV5Ph+9aCo/WXuAfSda4x2OSNJSwh8Fz+86QXNHD9cuVk25kfrCVbPJyUznf/9uW7xDEUlaSvij4PevH6U4N5PLZqk7Z6TGF+Zw17tm8ccdtWfmAhaR4VHCj7GO7l5WbTvONedPVO37c3TbimpmlOdzz++20dXTF+9wRJKOMlCMPburjpZOdedEQ1ZGGne/fz41J1r5d02FKDJsSvgx9rvNRynLz+LSGePiHcqYcMXc8Vx13gS+9cfdHNeE5yLDooQfQ+1dvfxx+3GuWTCRDHXnRM3fXXsePX3O3/16C6FSTiISCWWhGHpq+3Haunq5dpG6c6Jp+rh8/ubdc3hy23F+/7oqcYtESgk/hn6x/hCVJblcUq3unGj71NuqWTylmL//zVbqW7viHY5IUlDCj5EjDe08v7uOGy6coto5MZCRnsbXb1xMU0c3X/3t1niHI5IUlPBj5JfrD+EOH75wSrxDGbPmTizkc1fO4jcbj/DElmPxDkck4Snhx0Bfn/OL9Ye4dMY4ppblxTucMe2zV8xiYWUx//2RTRw61RbvcEQSmhJ+DLyyr54D9W185CJd3cdaVkYa992ylD6Hux5+je5efSFLZDBK+DHw83UHKczO4JrzNTpnNEwfl8/XbljIhgMN/L8nd8U7HJGENeSctjI8zR3dPPb6UT64dAq5WaqMOZSfrI3e9IW3XDyN7z37BhdVlfKu8zTvgEh/EV3hm9k1ZrbTzPaY2RcH2H6bmdWZ2cbg8enoh5ocfrPxCB3dfXx4mbpzRtvd185nQWURdz38GtuONMU7HJGEM2TCN7N04F+B9wLzgZvNbP4ATX/m7kuCxwNRjjMp9PU5D71Yw6IpxZq3Ng5yMtN58NaLKMrN5FM/eJVjjSq9IBIukiv85cAed9/r7l3AT4HrYxtWclq9q5a9da3cflk1Zhp7Hw8TinJ46LaLaO7o5vaVr9La2RPvkEQSRiQJvxI4GPb8ULCuvxvMbLOZPWJmUwfakZndYWbrzGxdXV3dCMJNbA++UMPEohzet1A3a+PpvElF/OvHLmDHsWY+vXIdbV1K+iIQvVE6vwWq3H0RsApYOVAjd7/f3Ze5+7KKioooHToxbD/axIt7TnLriirVvU8AV8wdzzc+spi1NSe57d9fpUVX+iIRJfzDQPgV+5Rg3RnuftLdO4OnDwAXRie85PHgCzXkZqZzy/Jp8Q5FAtcvqeRfblrK+v2nuO2hV2ju6I53SCJxFUnCfxWYbWbVZpYF3AQ8Gt7AzML7MK4DtkcvxMRX29zBoxuP8OFlUyjOy4x3OBLm/Ysn8+2bl7LxYAMf/t7LHKzXt3EldQ2Z8N29B/g88AdCifzn7r7VzO4xs+uCZneZ2VYz2wTcBdwWq4AT0YMv1NDd18efv6063qHIAN63cBIP3nYRRxraue6+F3hpz4l4hyQSFxF1Nrv7Y+4+x91nuvs/BOvudvdHg+Uvufv57r7Y3a909x2xDDqRHGvs4Acv7uODSyqpLs+PdzgyiMvnVPCbz1/GuIJsPvHQK3zv2Tfo7dPkKZJadHfxHH3r6d30ufPXV8+JdygyhOryfP7zsyu4+rwJfO3xHXzoOy+y81hzvMMSGTUqrXAOak608rNXD/Lxi6epKmaSKMzJ5Lsfv4Dfbj7KVx7dyrXffp7/8vYZfOYdM990/yXaJR/GMp2r5KEr/HPwjVW7yM5I4/PvnB3vUGQYzIzrFk/mqf96Oe9fNJnvrH6Dy+59mm8+tYsmjeSRMUxX+CO05XAjv910hM9fOYuKwux4hyMjUJafxTc+uoQ7Lp/BP6/axTef2s39z+3l2kWTKM3LYlpZnr4xLWOKEv4I9PY5d/9mC6V5mdxx+Yx4hyPnaN7EIv7tE8vYcriRH6/Zz6ObjtDW1cu4/CzmTixkzoRCqsvz9YU6SXpK+COw8qV9bDjQwDc/uoSiHI27HysWVBbztRsW8XfXzufvfr2FLUcaeaWmnpfeOEl6mjGpOIeppXlUluRSUZhNRWE2OZkqgS3JQwl/mA7Wt/FPf9jJlXMruH7J5HiHIzGQn53BsqoyllWV0d3bx966VvbWtXCooZ31+0/x8t6Tb2pbmpdJcW4mJbmZFOZkUpiTceZnQXYGfX2uiewlISjhD4O786VfvU56mvEPH1yo/t0UkJmextyJhcydWAhAnzsnWjo50dwV+tnSSWN7N7VNnew63kx371vH9t/7xA4qCrMZX5TD+MJsJhXnMKk4l8klOUwty2N6WR5l+Vl6P0nMKeEPw8OvHOSFPSf4Px9YwOSS3HiHI3GQZsb4whzGF+a8ZZu709nTR3NHD80d3bR09tDc0cO0cXnUNnVS29zB/pOtrNl7kuaONxdzK8jOoLo8n1njC5hZkc+cCYWcN6mIKaW5+kMgUaOEH6ENB07xlUe3ctmschVIkwGZGTmZ6eRkpr9p5NZAY8tbOns40tDOwfo29p9sY//JVvaeaGXt3pP852t/qk1YmJ3BeZOLWFhZzMLKYhZNKaZqXL66iGRElPAjcLSxnc/8aD0Ti3P49s1L9Z9NzllBdgZzJoRGAPXX2tnDzuPNbD/axPajTWw90sSP1+yns6cPgKKcDBZPLWHp1BKWTCthydRSyvKzRvtXkCSkhD+Eju5e7vjheto6e/iPT19Mqf5jSYzlZ2dwwbRSLphWemZdT28fu2tb2HyogY0HG3jtQAP3PbOH0+WApo/LY8nUEhZPKWHx1BLmTyoiN0sjiOTNlPDPorOnl7sefo0tRxr5/ieWDXg1Jokjml/xj6Zox7WwsoSFlSV09vRyuKGdg/WhrqFndtTym41HADCgojCbySW5TCjKYUJRNhOKcijOzSTNTCUMUpQS/iDaunr4zI/W8/zuE3z1uvO5av6EeIck8ibZGenMKC9gRnnBmXWN7d0cPtXOkcZ2jjS0s7euhY0HG85sz0gzSvOz+OP240wszmFiUQ4TinIozc+iJC80tDQvO4OcjDRys9LJSEsjPc1Is9A9CnfHHXrd6e0LPTq6e3EHJ/RxwzDMID3NgteqCzRRKOEPoLG9m9t/8CobDpzi6zcu4iPLBpyiVyThFOeGvhMwf3LRmXXtXb3UNndwvKmTk62dnGzp4nBDOxsOnOJUW+xrB6WbkZlh5GSEbmjnZqVTkJ1x5vsKpXmZlOZlUZqfhbtrVFIMKeH3s+VwI1/46WscqG/jvlsu0ITkkvRys9KZPi6f6eP+NF/D6S6dju5e6po7OdXWRUNbNw3t3bR39dDR3Ud7dy89vX309oWu6CHUVWQWSuJpwRX85oMNb0nSHnwC6AkeXb19dHb30t7dR3tXD0cb29l1vOfMjejT/vWZPcysyGdmRQHzJhVx3qRC5k8qoiRP986iQQk/0NfnfP/5vfzfJ3dSlp/Fyk8t
Z8XM8niHJRJTOZnpTC3LO6fy3udyj6Kzp5dTbd00tHZxsrWL0vxM9tS28MzOWn6x/tCZdlNKc0NDU6cUs3hKCYumFFOosibDpoQPvFJTz71P7GD9/lNcc/5E/vFDCzUaR2QUZGekM7EonYlFoS+yhd9MrmvuZPvRJrYdbeL1w41sOdzI41uOAaFPGbMqClgytYSl00pZMrWEORMKyFCBu7NK2YTv7mw40MC//HE3z+2qo6Iwm3+6cRE3XjhFfYgiCSBUoK6Cd8ypOLOuoa2LTYca2XiggY0HT/HU9uNnPgnkZqazsLKYxVOLWTSlhIWVxUwfpxLX4VIu4R9paOc3G4/wyw2H2FPbQkleJl967zw+eWmVxi2LJLiSvCwun1PB5cEfAXfnQH0bGw82nHmsfHk/XT01ABTmZDB/UhHnTSpi/qQi5k4sZOb4AgqyUy71AREmfBWvkx8AAAtkSURBVDO7BvgXIB14wN2/1m97NvBD4ELgJPBRd98X3VCHr7fP2Xeyle1Hm1i7t56X3jjBG3WtAFw4vZR//NBCrl00SX2BIknKzM7ckL5+SSUAXT197DrezJbDjbx+uJFtR5v4+bqDtHX1nnnd5OIcZo4vYFpZHlXj8plaFip7PbkkZ0wXshsy4ZtZOvCvwNXAIeBVM3vU3beFNbsdOOXus8zsJuBe4KOxCLi7N1ScqrWzh7auXlo6e2ho66K+tYtTbV0caejgSEM7h06180Zdy5lRAHlZ6SyvLuMjy6by7vMnUl2eP8SRRCQZZWWksaCymAWVxdwUrOvrc/bXt7HreDN7alvYfbyZmpNt/P71ozT0G5qalZHG+GC+g4qCbErzsijJDw0dPV3yuignk9ysdPKCR3ZGOtmZaWSnp5OVkUZmemgEU6L94YjkCn85sMfd9wKY2U+B64HwhH898JVg+RHgPjMzd39rrdhz9PiWY9z18GuDbs/PSqeyNJfJJbmsmDmOuRMLmTexiHmTCjVjkUiKSkszqsvzqS7P5z3nv3lbQ1sXB+v/9GW1o40dnGjupLa5kwP1bWw61MCp1m66evsG3vlZnE784cNY08y4bUUVd71r9OfCjiThVwIHw54fAi4erI2795hZIzAOOBHeyMzuAO4InraY2c6RBD2I8tPH2zZEwwRyJuYkkWzxQvLFPCrxfiy6u4tazFGOazBxf0+8BnxheC8Jj3n6SI87qncu3P1+4P5Y7NvM1rn7sljsO1aSLeZkixeSL+ZkixeSL+ZkixeiF3MkfRyHgfDaAlOCdQO2MbMMoJjQzVsREUkQkST8V4HZZlZtZlnATcCj/do8CtwaLN8IPB2L/nsRERm5Ibt0gj75zwN/IDQs8yF332pm9wDr3P1R4EHgR2a2B6iHMzfHR1NMuopiLNliTrZ4IfliTrZ4IfliTrZ4IUoxmy7ERURSg8YpioikCCV8EZEUkVQJ38zKzGyVme0OfpYO0GaJmb1sZlvNbLOZfTRs2w/MrMbMNgaPJTGK8xoz22lme8zsiwNszzaznwXb15pZVdi2LwXrd5rZe2IR3whj/q9mti04p380s+lh23rDzmn/G/rxivc2M6sLi+vTYdtuDd5Du83s1v6vjWPM/xwW7y4zawjbFo9z/JCZ1ZrZlkG2m5l9K/h9NpvZBWHbRv0cRxDvx4I4Xzezl8xscdi2fcH6jWa2bjTijTDmK8ysMezf/u6wbWd9Pw0oNGVZcjyArwNfDJa/CNw7QJs5wOxgeTJwFCgJnv8AuDHGMaYDbwAzgCxgEzC/X5vPAt8Llm8CfhYszw/aZwPVwX7SR+G8RhLzlUBesPwXp2MOnreM8vsgknhvA+4b4LVlwN7gZ2mwXJoIMfdr/5eEBkjE5RwHx3wHcAGwZZDt7wMeJzQvyiXA2jif46HiXXE6DuC9p+MNnu8DyhPwHF8B/O5c30+nH0l1hU+ohMPKYHkl8IH+Ddx9l7vvDpaPALVARf92MXSmFIW7dwGnS1GEC/89HgHeZWYWrP+pu3e6ew2wJ9hf3GN292fcvS14uobQ9zHiJZJzPJj3AKvcvd7dTwGrgGtiFGe44cZ8M/DwKMQ1KHd/jtCou8FcD/zQQ9YAJWY2iTid46HidfeXgngg/u9hIKJzPJgR/R9ItoQ/wd2PBsvHgLPOLG5mywn99XsjbPU/BB/r/tlCVT6jbaBSFJWDtXH3HuB0KYpIXhsLwz3u7YSu7E7LMbN1ZrbGzN7yRzgGIo33huDf+hEzO/3lwYQ/x0F3WTXwdNjq0T7HkRjsd4rXOR6O/u9hB540s/UWKgGTSC41s01m9riZna4ENKJznHBFoc3sKWDiAJu+HP7E3d3MBh1TGlxp/Ai41d1PVz36EqE/FFmExrX+D+CeaMSdKszs48Ay4PKw1dPd/bCZzQCeNrPX3f2Ngfcwan4LPOzunWb2GUKfqN4Z55gidRPwiLv3hq1LxHOclMzsSkIJ/7Kw1ZcF53c8sMrMdgRX3/G2gdC/fYuZvQ/4NTDiqmsJd4Xv7le5+4IBHr8BjgeJ/HRCrx1oH2ZWBPwe+HLwUfP0vo8GHz87gX8nNt0l51KKIpLXxkJExzWzqwj94b0uOIcAuPvh4OdeYDWwNJbBEkG87n4yLMYHCM3VENFrY2Q4x72Jft05cTjHkRjsd4rXOR6SmS0i9H643t3PlH8JO7+1wH8yOl2pQ3L3JndvCZYfAzLNrJyRnuPRvklxLg/gn3jzTduvD9AmC/gj8FcDbJsU/DTgm8DXYhBjBqGbVNX86WbK+f3afI4337T9ebB8Pm++abuX0blpG0nMSwl1jc3ut74UyA6Wy4HdRHDzaBTinRS2/EFgTbBcBtQEcZcGy2WJcI6DdvMI3UC0eJ7jsGNXMfgNxT/jzTdtX4nnOY4g3mmE7out6Lc+HygMW34JuGY04o0g5omn3wuE/ggdCM53RO+nt+xvtH6pKJ2YcYSS+W7gqdNvIkJdDA8Eyx8HuoGNYY8lwbangdeBLcCPgYIYxfk+YFeQIL8crLuH0JUxQA7wi+DN9wowI+y1Xw5etxN47yie26Fifgo4HnZOHw3WrwjO6abg5+0JEu8/AluDuJ4B5oW99lPBud8D/HminOPg+VfodyESx3P8MKFRbt2E+ohvB+4E7gy2G6HJkd4I4loWz3McQbwPAKfC3sPrgvUzgnO7KXjPfHkU3xNDxfz5sPfxGsL+WA30fhrqodIKIiIpIuH68EVEJDaU8EVEUoQSvohIilDCFxFJEUr4IiKjYKhCaf3aDlpI75xi0CgdEZHYM7N3AC2E6g8tGMbr/hJY6u6fOtcYdIUvCcvMSszss0O0ucLMfjdaMUVTJL+fjB0+QKE0M5tpZk8ENXyeN7N5A7w0aoX0lPAlkZUQKiU9Vo3130+Gdj/wl+5+IfC3wHfCNw5SSG/ElPAlkX0NmBn0Y/5T8NgSTFTx0f6NzewiM3stuGq60MyeDa6c/hBWg2m1md1rZq8EfaNvH+zgZpZuZv83OObm4KM1Zvau4DivB/2y2cH6fUGdE8xsmZmtDpa/ErRbbWZ7zeyugX6/aJ4
4SXxmVkDoW9S/MLONwL8Bk/o1G6iQ3oglXLVMkTBfBBa4+xIzu4HQV84XE6on86qZnalmaGYrgG8Tqgl+lFCl1OvdvS744/APhL7uD5Dh7suD6oN/D1w1yPHvIFTnZIm791hoxrUcQhPpvMvdd5nZDwlNCPPNIX6XeYQmkSkEdprZd8N/v8hPiYwhaUDDEP/+NxGqvRW1A4okg8sIlTvudffjwLPARcG28wh9NH6/ux8A5gILCJW53Qj8L9482cWvgp/rCSX0wVwF/JuH5izA3euDfde4+66gzUpCsxYN5fcemtjmBKEqr2edy0HGPndvAmrM7MNwZsrI8GkX5xEqPvdytI6pK3wZC44SKki3FDhCqKjXVne/dJD2p8sm9xLd/wM9/OkiKmeQY8biuJIEzOxhQlMWlpvZIUKfLj8GfNfM/heQSWjmqk3BS24iNANe1IZS6k0niayZUBcIwPPAZ8xsJaHyu+8A/huhrpIGQlUGV5lZK6HythVmdqm7v2xmmcAcd986zOOvCo75zOkuHUJVTKvMbJa77wE+QejTBoTKGl9IqGTwDcP8/WSMc/ebB9k04PSP7v6VaMegLh1JWB6aoOLF4IsqlwKbCV39PA38d3c/Ftb2OHAtoXK9S4EbgXvNbBOhUrgrRhDCA4Tqj28O9nOLu3cAf07oRtvrQB/wvaD9V4F/MbN1hK7iI/79dNNWRoO+eCUikiJ0hS8ikiLUhy8pz8zeA9zbb3WNu38wHvGIxIq6dEREUoS6dEREUoQSvohIilDCFxFJEUr4IiIp4v8DzqkPhtm0ZYgAAAAASUVORK5CYII=\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "sns.distplot(tokens_by_topic[\"token_count\"])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>topic</th>\n", " <th>top_n_terms</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>coronavirus, covid, travel, japan, airlines, aviation, uae, flights, tourism, amp</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>covid, coronavirus, join, bitcoin, contest, ecoins, crypto, free, earn, contestalert</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2</td>\n", " <td>covid, support, food, amp, coronavirus, community, donate, ireland, local, people</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3</td>\n", " <td>coronavirus, covid, health, pandemic, news, lockdown, outbreak, amp, virus, government</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4</td>\n", " <td>covid, coronavirus, follow, stayhome, bts, bbb, highriskcovid, day, mondaythoughts, survival</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>5</td>\n", " <td>economy, people, urge, coronavirus, million, debt, package, needed, student, stimulate</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>6</td>\n", " <td>usa, covid, coronavirus, america, project, trump, amp, tuesdaythoughts, topics, amazing</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>7</td>\n", " <td>coronavirus, iran, covid, amp, yemen, israel, russia, people, health, syria</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>8</td>\n", " <td>covid, workfromhome, home, quarantinelife, make, coronavirus, extra, online, wfh, work</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>9</td>\n", " <td>covid, coronavirus, easter, god, hope, jesus, love, ramadan, prayer, pray</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>10</td>\n", " <td>coronavirus, redbubble, covid, art, findyourthing, support, products, awesome, printed, rbandme</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>11</td>\n", " <td>china, coronavirus, covid, wuhan, chinesevirus, wuhanvirus, chinese, chinavirus, world, ccp</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>12</td>\n", " 
<td>covid, coronavirus, u.s, education, students, school, trump, kids, pandemic, learning</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>13</td>\n", " <td>covid, coronavirus, australia, auspol, government, alert, world, australian, community, aus</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>14</td>\n", " <td>covid, coronavirus, dogs, cats, amp, animals, dog, pets, cat, animal</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>15</td>\n", " <td>covid, coronavirus, art, design, artist, corona, pandemic, drawing, quedateencasa, confinement</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>16</td>\n", " <td>covid, pakistan, coronavirus, kashmir, cases, amp, karachi, positive, lahore, islamabad</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>17</td>\n", " <td>pandemic, covid, coronavirus, amp, world, global, earthday, climatechange, crisis, virus</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>18</td>\n", " <td>covid, coronavirus, memes, italia, italy, tiktok, funny, corona, news, meme</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>19</td>\n", " <td>covid, coronavirus, nhs, coronavirusuk, borisjohnson, lockdown, amp, boris, people, news</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>20</td>\n", " <td>coronavirus, covid, economy, oil, stocks, market, markets, stockmarket, trading, recession</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>21</td>\n", " <td>coronavirus, covid, live, youtube, watch, twitch, gaming, video, gta, xbox</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>22</td>\n", " <td>covid, coronavirus, technology, tech, data, cybersecurity, google, apple, app, pandemic</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>23</td>\n", " <td>covid, music, coronavirus, love, nyc, london, nowplaying, unite, paris, hiphop</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>24</td>\n", " <td>covid, lockdown, coronavirus, india, indiafightscorona, corona, stayhomestaysafe, coronavirusindia, fight, stay</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>25</td>\n", " <td>covid, stayhome, staysafe, stayathome, coronavirus, stay, home, safe, stayhomesavelives, flattenthecurve</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>26</td>\n", " <td>coronavirus, covid, nba, sports, football, due, season, nfl, mlb, olympics</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>27</td>\n", " <td>covid, coronavirus, vaccine, patients, amp, sarscov, treatment, testing, test, cdc</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>28</td>\n", " <td>covid, coronavirus, cases, india, positive, total, amp, delhi, state, lockdown</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>29</td>\n", " <td>covid, coronavirus, corona, coronavirusoutbreak, coronaviruspandemic, coronavirusupdate, coronavirusupdates, virus, coronaoutbreak, cases</td>\n", " </tr>\n", " <tr>\n", " <th>30</th>\n", " <td>30</td>\n", " <td>covid, coronavirus, amp, sign, relief, petition, give, american, stimulus, pandemic</td>\n", " </tr>\n", " <tr>\n", " <th>31</th>\n", " <td>31</td>\n", " <td>covid, coronavirus, horny, sex, porn, sexy, ass, onlyfans, nudes, cum</td>\n", " </tr>\n", " <tr>\n", " <th>32</th>\n", " <td>32</td>\n", " <td>covid, coronavirus, quarantine, lockdown, quarantinelife, stayhome, day, socialdistancing, stayathome, home</td>\n", " </tr>\n", " <tr>\n", " <th>33</th>\n", " <td>33</td>\n", " <td>socialdistancing, covid, coronavirus, coronalockdown, stayathomeandstaysafe, listen, great, coronaupdate, click, 
music</td>\n", " </tr>\n", " <tr>\n", " <th>34</th>\n", " <td>34</td>\n", " <td>coronavirus, covid, people, amp, time, it's, don't, dont, good, i'm</td>\n", " </tr>\n", " <tr>\n", " <th>35</th>\n", " <td>35</td>\n", " <td>covid, amp, healthcare, workers, nurses, doctors, coronavirus, ppe, health, care</td>\n", " </tr>\n", " <tr>\n", " <th>36</th>\n", " <td>36</td>\n", " <td>covid, coronavirus, read, life, poetry, book, books, blog, free, motivation</td>\n", " </tr>\n", " <tr>\n", " <th>37</th>\n", " <td>37</td>\n", " <td>covid, coronavirus, maga, qanon, wwg, kag, fakenews, wga, amp, billgates</td>\n", " </tr>\n", " <tr>\n", " <th>38</th>\n", " <td>38</td>\n", " <td>coronavirus, covid, due, news, facebook, marketing, movie, twitter, socialmedia, film</td>\n", " </tr>\n", " <tr>\n", " <th>39</th>\n", " <td>39</td>\n", " <td>covid, coronavirus, lockdown, day, photography, nature, socialdistancing, love, stayhome, beautiful</td>\n", " </tr>\n", " <tr>\n", " <th>40</th>\n", " <td>40</td>\n", " <td>covid, coronavirus, health, mentalhealth, amp, anxiety, pandemic, care, support, tips</td>\n", " </tr>\n", " <tr>\n", " <th>41</th>\n", " <td>41</td>\n", " <td>covid, coronavirus, mask, masks, face, facemask, amp, facemasks, hands, virus</td>\n", " </tr>\n", " <tr>\n", " <th>42</th>\n", " <td>42</td>\n", " <td>covid, nigeria, lockdown, africa, coronavirus, day, oflockdown, lagos, kenya, ghana</td>\n", " </tr>\n", " <tr>\n", " <th>43</th>\n", " <td>43</td>\n", " <td>covid, georgia, atlanta, realestate, news, coronavirus, conspiracy, university, truth, medical</td>\n", " </tr>\n", " <tr>\n", " <th>44</th>\n", " <td>44</td>\n", " <td>coronavirus, covid, breaking, cases, nyc, state, newyork, florida, california, positive</td>\n", " </tr>\n", " <tr>\n", " <th>45</th>\n", " <td>45</td>\n", " <td>covid, coronavirus, amp, business, latest, crisis, pandemic, webinar, impact, read</td>\n", " </tr>\n", " <tr>\n", " <th>46</th>\n", " <td>46</td>\n", " <td>cases, covid, coronavirus, deaths, death, total, confirmed, italy, recovered, spain</td>\n", " </tr>\n", " <tr>\n", " <th>47</th>\n", " <td>47</td>\n", " <td>coronavirus, covid, trump, amp, trumpvirus, americans, president, cnn, donaldtrump, people</td>\n", " </tr>\n", " <tr>\n", " <th>48</th>\n", " <td>48</td>\n", " <td>covid, canada, news, coronavirus, latest, cdnpoli, ontario, daily, toronto, covidcanada</td>\n", " </tr>\n", " <tr>\n", " <th>49</th>\n", " <td>49</td>\n", " <td>covid, coronavirus, food, healthy, cannabis, health, coffee, lockdown, stayhome, immunity</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " topic \\\n", "0 0 \n", "1 1 \n", "2 2 \n", "3 3 \n", "4 4 \n", "5 5 \n", "6 6 \n", "7 7 \n", "8 8 \n", "9 9 \n", "10 10 \n", "11 11 \n", "12 12 \n", "13 13 \n", "14 14 \n", "15 15 \n", "16 16 \n", "17 17 \n", "18 18 \n", "19 19 \n", "20 20 \n", "21 21 \n", "22 22 \n", "23 23 \n", "24 24 \n", "25 25 \n", "26 26 \n", "27 27 \n", "28 28 \n", "29 29 \n", "30 30 \n", "31 31 \n", "32 32 \n", "33 33 \n", "34 34 \n", "35 35 \n", "36 36 \n", "37 37 \n", "38 38 \n", "39 39 \n", "40 40 \n", "41 41 \n", "42 42 \n", "43 43 \n", "44 44 \n", "45 45 \n", "46 46 \n", "47 47 \n", "48 48 \n", "49 49 \n", "\n", " top_n_terms \n", "0 coronavirus, covid, travel, japan, airlines, aviation, uae, flights, tourism, amp \n", "1 covid, coronavirus, join, bitcoin, contest, ecoins, crypto, free, earn, contestalert \n", "2 covid, support, food, amp, coronavirus, community, donate, ireland, local, people \n", "3 coronavirus, covid, health, pandemic, 
news, lockdown, outbreak, amp, virus, government \n", "4 covid, coronavirus, follow, stayhome, bts, bbb, highriskcovid, day, mondaythoughts, survival \n", "5 economy, people, urge, coronavirus, million, debt, package, needed, student, stimulate \n", "6 usa, covid, coronavirus, america, project, trump, amp, tuesdaythoughts, topics, amazing \n", "7 coronavirus, iran, covid, amp, yemen, israel, russia, people, health, syria \n", "8 covid, workfromhome, home, quarantinelife, make, coronavirus, extra, online, wfh, work \n", "9 covid, coronavirus, easter, god, hope, jesus, love, ramadan, prayer, pray \n", "10 coronavirus, redbubble, covid, art, findyourthing, support, products, awesome, printed, rbandme \n", "11 china, coronavirus, covid, wuhan, chinesevirus, wuhanvirus, chinese, chinavirus, world, ccp \n", "12 covid, coronavirus, u.s, education, students, school, trump, kids, pandemic, learning \n", "13 covid, coronavirus, australia, auspol, government, alert, world, australian, community, aus \n", "14 covid, coronavirus, dogs, cats, amp, animals, dog, pets, cat, animal \n", "15 covid, coronavirus, art, design, artist, corona, pandemic, drawing, quedateencasa, confinement \n", "16 covid, pakistan, coronavirus, kashmir, cases, amp, karachi, positive, lahore, islamabad \n", "17 pandemic, covid, coronavirus, amp, world, global, earthday, climatechange, crisis, virus \n", "18 covid, coronavirus, memes, italia, italy, tiktok, funny, corona, news, meme \n", "19 covid, coronavirus, nhs, coronavirusuk, borisjohnson, lockdown, amp, boris, people, news \n", "20 coronavirus, covid, economy, oil, stocks, market, markets, stockmarket, trading, recession \n", "21 coronavirus, covid, live, youtube, watch, twitch, gaming, video, gta, xbox \n", "22 covid, coronavirus, technology, tech, data, cybersecurity, google, apple, app, pandemic \n", "23 covid, music, coronavirus, love, nyc, london, nowplaying, unite, paris, hiphop \n", "24 covid, lockdown, coronavirus, india, indiafightscorona, corona, stayhomestaysafe, coronavirusindia, fight, stay \n", "25 covid, stayhome, staysafe, stayathome, coronavirus, stay, home, safe, stayhomesavelives, flattenthecurve \n", "26 coronavirus, covid, nba, sports, football, due, season, nfl, mlb, olympics \n", "27 covid, coronavirus, vaccine, patients, amp, sarscov, treatment, testing, test, cdc \n", "28 covid, coronavirus, cases, india, positive, total, amp, delhi, state, lockdown \n", "29 covid, coronavirus, corona, coronavirusoutbreak, coronaviruspandemic, coronavirusupdate, coronavirusupdates, virus, coronaoutbreak, cases \n", "30 covid, coronavirus, amp, sign, relief, petition, give, american, stimulus, pandemic \n", "31 covid, coronavirus, horny, sex, porn, sexy, ass, onlyfans, nudes, cum \n", "32 covid, coronavirus, quarantine, lockdown, quarantinelife, stayhome, day, socialdistancing, stayathome, home \n", "33 socialdistancing, covid, coronavirus, coronalockdown, stayathomeandstaysafe, listen, great, coronaupdate, click, music \n", "34 coronavirus, covid, people, amp, time, it's, don't, dont, good, i'm \n", "35 covid, amp, healthcare, workers, nurses, doctors, coronavirus, ppe, health, care \n", "36 covid, coronavirus, read, life, poetry, book, books, blog, free, motivation \n", "37 covid, coronavirus, maga, qanon, wwg, kag, fakenews, wga, amp, billgates \n", "38 coronavirus, covid, due, news, facebook, marketing, movie, twitter, socialmedia, film \n", "39 covid, coronavirus, lockdown, day, photography, nature, socialdistancing, love, stayhome, beautiful \n", "40 covid, 
coronavirus, health, mentalhealth, amp, anxiety, pandemic, care, support, tips \n", "41 covid, coronavirus, mask, masks, face, facemask, amp, facemasks, hands, virus \n", "42 covid, nigeria, lockdown, africa, coronavirus, day, oflockdown, lagos, kenya, ghana \n", "43 covid, georgia, atlanta, realestate, news, coronavirus, conspiracy, university, truth, medical \n", "44 coronavirus, covid, breaking, cases, nyc, state, newyork, florida, california, positive \n", "45 covid, coronavirus, amp, business, latest, crisis, pandemic, webinar, impact, read \n", "46 cases, covid, coronavirus, deaths, death, total, confirmed, italy, recovered, spain \n", "47 coronavirus, covid, trump, amp, trumpvirus, americans, president, cnn, donaldtrump, people \n", "48 covid, canada, news, coronavirus, latest, cdnpoli, ontario, daily, toronto, covidcanada \n", "49 covid, coronavirus, food, healthy, cannabis, health, coffee, lockdown, stayhome, immunity " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def top_n_terms(k, n = 10):\n", " result = (TopicTermFreq[TopicTermFreq['topic'] == k].\n", " sort_values('count', ascending=False).head(n))\n", " return result\n", "\n", "pd.set_option('display.max_colwidth', None)\n", "topic_top_n = []\n", "for i in range(0, num_topics):\n", " concat_terms = ', '.join(top_n_terms(i, n = num_top_terms)['term'].tolist())\n", " topic_top_n.append([i, concat_terms])\n", "\n", "topic_tbl = pd.DataFrame(topic_top_n, columns = ['topic', 'top_n_terms'])\n", "topic_tbl" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Document Entropy\n", "\n", "We usually think of the probability of a topic given a document. For this metric we calculate the probability of documents given a topic. <br> <br>\n", "<center>\n", "$P(d|k)=\\frac{N_{d,k}}{\\sum_{dā²}N_{dā²,k}}$\n", "</center>\n", "<br> \n", "We count the frequency of a topic over all documents, and normalize to get a distribution, then calculate the entropy of that distribution $H(P(d|k))$. A topic with low entropy (i.e., high predictability) will be concentrated in a few documents, while a topic with higher entropy will be spread evenly over many documents. In the example topic model, the topics \"list links history external information site\" and \"game team league player players football games\" have roughly the same number of tokens, but the \"list\" topic has much higher entropy. It occurs to a small degree in many documents, while the \"game\" topic occurs a lot in a smaller number of documents. Low entropy isn't necessarily good: it can indicate unusual documents (did you accidentally import the Mac .DS_Store file?) or the presence of documents in other languages." 
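, "\n", "A rough sketch of this calculation from the parsed tables (an approximation: the tidy docs2topic frame stores topic proportions rather than raw token counts $N_{d,k}$, so the proportion is used here as a stand-in; the commented-out cell below takes a related approach):\n", "```python\n", "from scipy.stats import entropy\n", "\n", "def calc_doc_entropy(docs2topic, k):\n", "    # P(d|k): each document's weight in topic k, normalized over all documents\n", "    weights = docs2topic.loc[docs2topic['topic'] == k, 'proportion'].to_numpy()\n", "    return entropy(weights / weights.sum())\n", "\n", "doc_entropy = pd.DataFrame([[k, calc_doc_entropy(docs2topic, k)] for k in range(num_topics)],\n", "                           columns=['topic', 'doc_entropy'])\n", "```"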
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# from scipy.stats import entropy\n", "# k = 1\n", "# docs2topic_nonull = docs2topic[docs2topic['topic'].isnull() == False]\n", "# topic_freq_over_all_docs = docs2topic[docs2topic['topic'] == k]['proportion'].tolist()\n", "\n", "# def calc_doc_entropy(topics, base=None):\n", "# value,counts = np.unique(topics, return_counts=True)\n", "# return entropy(counts, base=base)\n", "\n", "# doc_entropy = []\n", "# for k in range(0, num_topics):\n", "# topic_freq_over_all_docs = docs2topic[docs2topic['topic'] == k]['proportion'].tolist()\n", "# doc_entropy.append([k, calc_doc_entropy(topic_freq_over_all_docs)]) \n", "# doc_entropy = pd.DataFrame(doc_entropy, columns = ['topic', 'doc_entropy'])\n", "# doc_entropy.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Word Length \n", "\n", "The average length, in characters, of the displayed top words. <br><br>\n", "<center>\n", "$\\frac{1}{|W|}\\sum_{wāW}len(w)$ \n", "</center>\n", "<br>\n", "Words are not weighted by probablity or rank position. This is a useful proxy for topic specificity. Longer words often carry more specific meaning, so if the topic brings together lots of short words, it's probably not a very specific topic. This can also be a good way to pick up the \"hey, looks like we've got some Spanish!\" topic because we often pick up the short words that would be stopwords if we were modeling a corpus in that language. The example topic \"de french la france spanish spain italian paris el le\" has a number of this type of short words, but they seem to be due to Wikipedia articles talking about French, Spanish, and Italian things rather than actual text in those languages." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>topic</th>\n", " <th>word-length</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>29</th>\n", " <td>29</td>\n", " <td>11.9</td>\n", " </tr>\n", " <tr>\n", " <th>33</th>\n", " <td>33</td>\n", " <td>10.0</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>24</td>\n", " <td>9.3</td>\n", " </tr>\n", " <tr>\n", " <th>32</th>\n", " <td>32</td>\n", " <td>8.9</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>25</td>\n", " <td>8.6</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " topic word-length\n", "29 29 11.9\n", "33 33 10.0\n", "24 24 9.3\n", "32 32 8.9\n", "25 25 8.6" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "topics = TopicTermFreq['topic'].unique().tolist()\n", "\n", "topic_ids = []\n", "avg_word_lengths = []\n", "for i in range(0, num_topics):\n", " top_n = top_n_terms(k = i, n = num_top_terms)\n", " avg_len = top_n['term'].apply(len).mean()\n", " avg_word_lengths.append(avg_len)\n", " topic_ids.append(i)\n", "\n", "word_len_by_topic = pd.DataFrame({'topic': topic_ids,\n", " 'word-length': avg_word_lengths})\n", "\n", "word_len_by_topic = word_len_by_topic.sort_values('word-length', ascending = False)\n", "word_len_by_topic.head()" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "## 4. Coherence\n", "\n", "This metric measures whether the words in a topic tend to co-occur together. We add up a score for each distinct pair of top ranked words. The score is the log of the probability that a document containing at least one instance of the higher-ranked word also contains at least one instance of the lower-ranked word. <br><br>\n", "<center>\n", "$\\sum_i\\sum_jlog\\frac{D(w_j,w_i)+Ī²}{D(w_i)}$\n", "</center><br>\n", "To avoid log zero errors we add the \"beta\" topic-word smoothing parameter specified when you calculate diagnostics. Since these scores are log probabilities they are negative. Large negative values indicate words that don't co-occur often; values closer to zero indicate that words tend to co-occur more often. The least coherent topic in the sample file is \"polish poland danish denmark sweden swedish na norway norwegian sk red iceland bj baltic copenhagen cave greenland krak gda faroese\". This topic seems to be about Northern and Eastern European countries, but the short abbreviations \"na\" and \"sk\" and the words \"red\" and \"cave\" don't really fit.\n", "\n", "$D(w_j,w_i)$ = number of documents containing $w_i$ and $w_j$\n", "\n", "$D(w_i)$ = number of topics containing $w_i$\n", "\n", "$W$ = set of words describing the topic" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# See coherence_calculation.ipynb" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Uniform Dist\n", "\n", "We want topics to be specific. This metric measures the distance from a topic's distribution over words to a uniform distribution. We calculate distance using Kullback-Leibler divergence.<br><br>\n", "<center>\n", "$\\sum_wP(w|k)log\\frac{P(w|k)}{\\frac{1}{|v|}}$\n", "</center><br>\n", "Larger values indicate more specificity." 
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Term</th>\n", " <th>uniform_dist</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>coronavirus</td>\n", " <td>0.646345</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>covid</td>\n", " <td>0.549688</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>travel</td>\n", " <td>0.311497</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>japan</td>\n", " <td>0.155116</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>airlines</td>\n", " <td>0.099394</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Term uniform_dist\n", "0 coronavirus 0.646345\n", "1 covid 0.549688\n", "2 travel 0.311497\n", "3 japan 0.155116\n", "4 airlines 0.099394" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def calc_KL_diverg_terms(TopicTermFreq, k, n):\n", " term_count_corpus = len(TopicTermFreq['term'].unique()) \n", " token_count_topic_k = sum(TopicTermFreq[(TopicTermFreq['topic'] == k)]['count'])\n", " topic_k_terms = top_n_terms(k = k, n = n)['term'].tolist()\n", " KL_divergence_terms = [] \n", " for term in topic_k_terms: \n", " term_count = TopicTermFreq[(TopicTermFreq['topic'] == k) & \n", " (TopicTermFreq['term'] == term)]['count'].tolist()[0]\n", " P_w_k = term_count/token_count_topic_k\n", " KL_diverg_term = P_w_k*math.log(P_w_k/(1/term_count_corpus))\n", " KL_divergence_terms.append([term, KL_diverg_term])\n", " KL_divergence_terms = pd.DataFrame(KL_divergence_terms, columns = ['Term', 'uniform_dist'])\n", " return KL_divergence_terms\n", "\n", "KL_divergence_terms = calc_KL_diverg_terms(TopicTermFreq, k = 0, n = num_top_terms)\n", "KL_divergence_terms.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>topic</th>\n", " <th>uniform_dist</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>6.889917</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>8.155044</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2</td>\n", " <td>6.823867</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3</td>\n", " <td>6.630374</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4</td>\n", " <td>6.574626</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " topic uniform_dist\n", "0 0 6.889917\n", "1 1 8.155044\n", "2 2 6.823867\n", "3 3 6.630374\n", "4 4 6.574626" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def calc_KL_diverg_unif(TopicTermFreq, k):\n", " # calc distribution of topic k\n", " token_counts_topic_k = 
TopicTermFreq[(TopicTermFreq['topic'] == k)]['count'].tolist()\n", " token_count_topic_k = sum(token_counts_topic_k) \n", " P_w_k = [count/token_count_topic_k for count in token_counts_topic_k]\n", " \n", " # determine number of terms in the corpus to calc uniform dist (1/term_count_corpus)\n", " term_count_corpus = len(TopicTermFreq['term'].unique()) \n", " KL_diverg_topic = sum([p*math.log(p/(1/term_count_corpus)) for p in P_w_k])\n", " return KL_diverg_topic\n", "\n", "unif_dist = []\n", "for k in range(0, num_topics):\n", " unif_dist.append([k, calc_KL_diverg_unif(TopicTermFreq, k = k)])\n", "unif_dist = pd.DataFrame(unif_dist, columns = ['topic', 'uniform_dist']) \n", "unif_dist.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Corpus Dist\n", "\n", "This metric measures how far a topic is from the overall distribution of words in the corpus, essentially what you would get if you \"trained\" a model with one topic. We calculate distance using Kullback-Leibler divergence. A greater distance means the topic is more distinct; a smaller distance means that the topic is more similar to the corpus distribution. Not surprisingly, it correlates with number of tokens since a topic that makes up a large proportion of the tokens in the corpus is likely to be more similar to the overall corpus distribution. The closest topic to the corpus distribution is \"time number term part system form\". " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>topic</th>\n", " <th>corpus_dist</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>2.343306</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>3.408515</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2</td>\n", " <td>1.863144</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3</td>\n", " <td>0.708723</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4</td>\n", " <td>2.564010</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " topic corpus_dist\n", "0 0 2.343306\n", "1 1 3.408515\n", "2 2 1.863144\n", "3 3 0.708723\n", "4 4 2.564010" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def calc_KL_diverg_corpus(TopicTermFreq, k):\n", " # count terms across topics and corpus\n", " num_tokens_in_corpus = TopicTermFreq['count'].sum()\n", " num_tokens_in_topic = TopicTermFreq['count'][TopicTermFreq['topic']==k].sum()\n", " term_count_corpus_df = TopicTermFreq.groupby('term')['count'].sum().reset_index()\n", " term_count_corpus_df.columns = ['term', 'corpus_count']\n", " term_count_topic_k_df = TopicTermFreq[TopicTermFreq['topic'] == k].groupby('term')['count'].sum().reset_index()\n", " dist_df = term_count_topic_k_df.merge(term_count_corpus_df, how = 'left', on = 'term')\n", " \n", " # calc corpus distribution\n", " dist_df['corpus_dist'] = dist_df['corpus_count'].apply(lambda x: x/num_tokens_in_corpus)\n", " \n", " # calc topic distribution\n", " dist_df['topic_dist'] = dist_df['count'].apply(lambda x: x/num_tokens_in_topic)\n", " \n", " # calc 
KL-divergence\n", " KL_diverg = []\n", " for i in range(0, len(dist_df)):\n", " KL_diverg.append(dist_df['topic_dist'][i]*math.log(dist_df['topic_dist'][i]/dist_df['corpus_dist'][i]))\n", " KL_diverg = sum(KL_diverg) \n", " return KL_diverg\n", "\n", "corpus_dist = []\n", "for k in range(0, num_topics):\n", " corpus_dist.append([k, calc_KL_diverg_corpus(TopicTermFreq, k = k)])\n", "corpus_dist = pd.DataFrame(corpus_dist, columns = ['topic', 'corpus_dist']) \n", "corpus_dist.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7. Effective Number of Words\n", "\n", "This metric is equivalent to the effective number of parties metric in Political Science. For each word we calculate the inverse of the squared probability of the word in the topic, and then add those numbers up. Larger numbers indicate more specificity. It is similar to distance from the uniform distribution, but produces a value that may be more interpretable." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>topic</th>\n", " <th>eff_num_words</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>126.926597</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>81.466377</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2</td>\n", " <td>128.929208</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3</td>\n", " <td>140.626300</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4</td>\n", " <td>115.824467</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " topic eff_num_words\n", "0 0 126.926597\n", "1 1 81.466377\n", "2 2 128.929208\n", "3 3 140.626300\n", "4 4 115.824467" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def calc_effective_num_words(TopicTermFreq, k):\n", " num_tokens_in_topic = TopicTermFreq['count'][TopicTermFreq['topic']==k].sum()\n", " topic_dist = TopicTermFreq[['term', 'count']][TopicTermFreq['topic'] == k]\n", " topic_dist['freq'] = topic_dist['count'].apply(lambda x: x/num_tokens_in_topic) \n", " topic_dist['sq_prob'] = topic_dist['freq'].apply(lambda x: (x*x))\n", " effective_num_words = 1/sum(topic_dist['sq_prob'])\n", " return effective_num_words\n", "\n", "effective_words = []\n", "for k in range(0, num_topics):\n", " effective_words.append([k, calc_effective_num_words(TopicTermFreq, k = k)])\n", "effective_words = pd.DataFrame(effective_words, columns = ['topic', 'eff_num_words']) \n", "effective_words.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 8. Token/Document Discrepancy\n", "\n", "This metric looks for \"bursty\" words within topics. We compare two distributions over words using Jensen-Shannon distance. The first distribution $P(w|k)āN_{k,w}$ is the usual topic-word distribution proportional to the number of tokens of type w that are assigned to the topic. The second distribution $Q(w|k)ā\\sum_dš(N_{d,w,k}>0)$ is proportional to the number of documents that contain at least one token of type w that is assigned to the topic. 
A word that occurs many times in only a few documents may appear prominently in the sorted list of words, but may not be a good representative word for the topic. This metric compares the number of times a word occurs in a topic (measured in tokens) and the number of documents the word occurs in as that topic (instances of the word assigned to other topics are not counted). The highest ranked topic in this metric is the \"polish poland danish denmark sweden swedish na norway norwegian sk red\" topic, suggesting that those ill-fitting words may be isolated in a few documents. Although this metric has the same goal as coherence, the two don't appear to correlate well: bursty words aren't necessarily unrelated to the topic, they're just unusually frequent in certain contexts." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 9. Rank 1 Documents\n", "\n", "Some topics are specific, while others aren't really \"topics\" but language that comes up because we are writing in a certain context. Academic writing will talk about \"paper abstract data\", and a Wikipedia article will talk about \"list links history\". The difference is often measurable in terms of burstiness. A content-ful topic will occur in relatively few documents, but when it does, it will produce a lot of tokens. A \"background\" topic will occur in many documents and have a high overall token count, but never produce many tokens in any single document. This metric counts the frequency at which a given topic is the single most frequent topic in a document. The rank_1 metric shows how many times each topic is ranked first in terms of document proportion (hence rank_1) for the documents in which it occurs. Specific topics like \"music album band song\" or \"cell cells disease dna blood treatment\" are the \"rank 1\" topic in many documents. High token-count topics often have few rank-1 documents. This metric is often useful as a way to identify corpus-specific stopwords. But rarer topics can also have few rank-1 documents: \"day year calendar days years month\" is a representative example; topics for days of the week and units of measurement often appear in documents as a distinct discourse, but they are rarely the focus of a document."
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>topic</th>\n", " <th>rank_1_docs</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>0.800777</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>0.784936</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2</td>\n", " <td>0.804646</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3</td>\n", " <td>0.766618</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4</td>\n", " <td>0.835698</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " topic rank_1_docs\n", "0 0 0.800777\n", "1 1 0.784936\n", "2 2 0.804646\n", "3 3 0.766618\n", "4 4 0.835698" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def calc_rank_1_docs(docs2topic, k):\n", " rank_1_perc = 0\n", " try:\n", " rank_1_docs = docs2topic[(docs2topic['topic'] == k) & (docs2topic['rank'] == 1)]['doc_id'].unique().tolist()\n", " docs_with_topic_k = docs2topic[docs2topic['topic'] == k]['doc_id'].unique().tolist()\n", " rank_1_perc = len(rank_1_docs)/len(docs_with_topic_k)\n", " except ZeroDivisionError:\n", " # topic k never appears in docs2topic, so leave the share at 0\n", " pass\n", " return rank_1_perc\n", "\n", "rank1_perc = []\n", "for k in range(0, num_topics):\n", " rank1_perc.append([k, calc_rank_1_docs(docs2topic, k)])\n", "rank1_perc = pd.DataFrame(rank1_perc, columns = ['topic', 'rank_1_docs']) \n", "rank1_perc.head() " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 10. Allocation Count\n", "\n", "This metric has a similar motivation to the rank-1-docs metric. For each document we can calculate the percentage of that document assigned to a given topic. We can then observe how often that percentage is above a certain threshold, by default 30%. The cell below checks topic 1 at that threshold." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "docs with topic k proportion >= 0.3: 4222\n", "docs with topic k proportion >= 0: 4222\n" ] } ], "source": [ "print(\"docs with topic k proportion >= 0.3: \" + str(len(docs2topic[(docs2topic['topic'] == 1) & (docs2topic['proportion'] >= 0.3)])))\n", "print(\"docs with topic k proportion >= 0: \" + str(len(docs2topic[(docs2topic['topic'] == 1) & (docs2topic['proportion'] >= 0)])))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 11. Allocation Ratio\n", "\n", "This metric reports the ratio of allocation counts at two different thresholds. MALLET's defaults are 50% and 2%; the cell below uses 50% and 30% for topic 1." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "docs with topic k proportion >= 0.5: 2425\n", "docs with topic k proportion >= 0.3: 4222\n", "allocation ratio: 0.5743723353860729\n" ] } ], "source": [ "numerator = len(docs2topic[(docs2topic['topic'] == 1) & (docs2topic['proportion'] >= 0.5)])\n", "denominator = len(docs2topic[(docs2topic['topic'] == 1) & (docs2topic['proportion'] >= 0.3)])\n", "print(\"docs with topic k proportion >= 0.5: \" + str(len(docs2topic[(docs2topic['topic'] == 1) & (docs2topic['proportion'] >= 0.5)])))\n", "print(\"docs with topic k proportion >= 0.3: \" + str(len(docs2topic[(docs2topic['topic'] == 1) & (docs2topic['proportion'] >= 0.3)])))\n", "print(\"allocation ratio: \" + str(numerator/denominator))" ] },
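{ "cell_type": "markdown", "metadata": {}, "source": [ "The two cells above check a single topic at hard-coded thresholds. As a minimal sketch, the next cell computes the allocation count at both thresholds and the resulting 0.5/0.3 allocation ratio for every topic from the `docs2topic` table; because that table keeps only each document's top three topics, the counts cover only those documents." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "def allocation_count(docs2topic, threshold):\n", "    # number of documents per topic with proportion >= threshold\n", "    above = docs2topic[docs2topic['proportion'] >= threshold]\n", "    return above.groupby('topic')['doc_id'].nunique()\n", "\n", "# counts at the two thresholds used in the cells above\n", "alloc_50 = allocation_count(docs2topic, 0.5)\n", "alloc_30 = allocation_count(docs2topic, 0.3)\n", "\n", "allocation_tbl = pd.DataFrame({'alloc_50': alloc_50, 'alloc_30': alloc_30}).fillna(0)\n", "# topics with no documents at the lower threshold get a missing ratio rather than a division error\n", "allocation_tbl['allocation_ratio'] = allocation_tbl['alloc_50']/allocation_tbl['alloc_30'].replace(0, float('nan'))\n", "allocation_tbl = allocation_tbl.reset_index()\n", "allocation_tbl.head()" ] },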
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 12. Exclusivity\n", "\n", "This metric measures the extent to which the top words for this topic do not appear as top words in other topics, i.e., the extent to which its top words are 'exclusive.' The value is the average, over each top word, of the probability of that word in the topic divided by the sum of the probabilities of that word in all topics. In other words, of the top words in the topic, how often do they occur in other topics? Exclusivity correlates (negatively) with token count: large topics tend to be less exclusive, and low exclusivity also tends to indicate vaguer, more general topics. \"black hand back body cross man\" is about the same size as \"isbn book press published work books\", but is much less exclusive." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>topic</th>\n", " <th>exclusivity</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>0.556460</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>0.650400</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2</td>\n", " <td>0.235125</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3</td>\n", " <td>0.059785</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4</td>\n", " <td>0.379354</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " topic exclusivity\n", "0 0 0.556460\n", "1 1 0.650400\n", "2 2 0.235125\n", "3 3 0.059785\n", "4 4 0.379354" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "from numpy import mean\n", "\n", "def calc_topic_exclusivity(TopicTermFreq, k, n):\n", " # top_n_terms() is defined in an earlier cell\n", " top_terms = top_n_terms(k = k, n = n)['term'].tolist() \n", " topic_exclusivity = []\n", " for term in top_terms:\n", " # calc probability of term in topic k\n", " count_term_in_topic_k = TopicTermFreq[(TopicTermFreq['term'] == term) & \n", " (TopicTermFreq['topic'] == k)]['count'].sum()\n", " tokens_topic_k = TopicTermFreq[TopicTermFreq['topic'] == k]['count'].sum()\n", " prob_term_in_topic_k = count_term_in_topic_k/tokens_topic_k \n", " \n", " # calc probability of term in all topics\n", " tokens_by_topic = TopicTermFreq.groupby('topic').sum().reset_index().rename(columns = {'count': 'token_count'})\n", " 
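# the denominator: this term's probability summed over all topics\n", " 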
count_term_all_topics = TopicTermFreq[TopicTermFreq['term'] == term]\n", " prob_term_k_all_topics = count_term_all_topics.merge(tokens_by_topic, how = 'left', on = 'topic')\n", " prob_term_k_all_topics['freq'] = prob_term_k_all_topics['count']/prob_term_k_all_topics['token_count']\n", " prob_term_k_all_topics = sum(prob_term_k_all_topics['freq']) \n", " \n", " # calc exclusivity \n", " term_exclusivity = prob_term_in_topic_k/prob_term_k_all_topics\n", " #print(term + \": \" + str(term_exclusivity))\n", " topic_exclusivity.append(term_exclusivity)\n", " topic_exclusivity = mean(topic_exclusivity)\n", " return topic_exclusivity\n", "\n", "topic_exclusivity = []\n", "for k in range(0, num_topics):\n", " topic_exclusivity.append([k, calc_topic_exclusivity(TopicTermFreq, k, n = num_top_terms)])\n", "topic_exclusivity = pd.DataFrame(topic_exclusivity, columns = ['topic', 'exclusivity']) \n", "topic_exclusivity.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>topic</th>\n", " <th>token_count</th>\n", " <th>word-length</th>\n", " <th>uniform_dist</th>\n", " <th>corpus_dist</th>\n", " <th>eff_num_words</th>\n", " <th>rank_1_docs</th>\n", " <th>exclusivity</th>\n", " <th>top_n_terms</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>1322510</td>\n", " <td>6.3</td>\n", " <td>6.889917</td>\n", " <td>2.343306</td>\n", " <td>126.926597</td>\n", " <td>0.800777</td>\n", " <td>0.556460</td>\n", " <td>coronavirus, covid, travel, japan, airlines, aviation, uae, flights, tourism, amp</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>749839</td>\n", " <td>6.6</td>\n", " <td>8.155044</td>\n", " <td>3.408515</td>\n", " <td>81.466377</td>\n", " <td>0.784936</td>\n", " <td>0.650400</td>\n", " <td>covid, coronavirus, join, bitcoin, contest, ecoins, crypto, free, earn, contestalert</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2</td>\n", " <td>1972407</td>\n", " <td>6.3</td>\n", " <td>6.823867</td>\n", " <td>1.863144</td>\n", " <td>128.929208</td>\n", " <td>0.804646</td>\n", " <td>0.235125</td>\n", " <td>covid, support, food, amp, coronavirus, community, donate, ireland, local, people</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3</td>\n", " <td>10062309</td>\n", " <td>6.8</td>\n", " <td>6.630374</td>\n", " <td>0.708723</td>\n", " <td>140.626300</td>\n", " <td>0.766618</td>\n", " <td>0.059785</td>\n", " <td>coronavirus, covid, health, pandemic, news, lockdown, outbreak, amp, virus, government</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4</td>\n", " <td>1522195</td>\n", " <td>7.4</td>\n", " <td>6.574626</td>\n", " <td>2.564010</td>\n", " <td>115.824467</td>\n", " <td>0.835698</td>\n", " <td>0.379354</td>\n", " <td>covid, coronavirus, follow, stayhome, bts, bbb, highriskcovid, day, mondaythoughts, survival</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " topic token_count word-length uniform_dist corpus_dist eff_num_words \\\n", "0 0 1322510 6.3 6.889917 2.343306 126.926597 \n", "1 1 749839 6.6 8.155044 3.408515 81.466377 \n", "2 2 1972407 
6.3 6.823867 1.863144 128.929208 \n", "3 3 10062309 6.8 6.630374 0.708723 140.626300 \n", "4 4 1522195 7.4 6.574626 2.564010 115.824467 \n", "\n", " rank_1_docs exclusivity \\\n", "0 0.800777 0.556460 \n", "1 0.784936 0.650400 \n", "2 0.804646 0.235125 \n", "3 0.766618 0.059785 \n", "4 0.835698 0.379354 \n", "\n", " top_n_terms \n", "0 coronavirus, covid, travel, japan, airlines, aviation, uae, flights, tourism, amp \n", "1 covid, coronavirus, join, bitcoin, contest, ecoins, crypto, free, earn, contestalert \n", "2 covid, support, food, amp, coronavirus, community, donate, ireland, local, people \n", "3 coronavirus, covid, health, pandemic, news, lockdown, outbreak, amp, virus, government \n", "4 covid, coronavirus, follow, stayhome, bts, bbb, highriskcovid, day, mondaythoughts, survival " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eval_summary = topic_tbl.merge(corpus_dist, how = 'left', on = 'topic')\n", "eval_summary = eval_summary.merge(unif_dist, how = 'left', on = 'topic')\n", "eval_summary = eval_summary.merge(tokens_by_topic, how = 'left', on = 'topic')\n", "eval_summary = eval_summary.merge(word_len_by_topic, how = 'left', on = 'topic')\n", "eval_summary = eval_summary.merge(effective_words, how = 'left', on = 'topic')\n", "eval_summary = eval_summary.merge(topic_exclusivity, how = 'left', on = 'topic')\n", "eval_summary = eval_summary.merge(rank1_perc, how = 'left', on = 'topic')\n", "eval_summary = eval_summary.sort_values('topic')\n", "eval_summary.reset_index(drop = True, inplace = True)\n", "cols = ['topic', 'token_count', 'word-length', 'uniform_dist', 'corpus_dist', \n", " 'eff_num_words', 'rank_1_docs', 'exclusivity', 'top_n_terms']\n", "eval_summary = eval_summary[cols]\n", "#eval_summary.to_csv('/Users/dankoban/Documents/EM6575/twitter/twitter_topic_diagnostics_koban.csv', index=False) \n", "eval_summary.head()" ] } ], "metadata": { "kernelspec": { "display_name": "forcenexus", "language": "python", "name": "forcenexus" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 4 }