# %% [markdown]
# # Topic Model Diagnostics

# %% [markdown]
# ## Load Word Counts by Topic / Parse into Tidy Structure

# %%
import re  # no longer used in this cell; kept in case later cells rely on it

import pandas as pd

# Word/topic count file to parse. Each line is split on single spaces:
# token 1 is used as the term and tokens 2+ are "topic:count" pairs.
# Token 0 is ignored (presumably the word's type index -- TODO confirm).
WORD_TOPIC_COUNTS_PATH = '/Users/dankoban/Documents/EM6575/twitter/hashtag model/hashtags/hashtags.wordtopiccounts'
# Alternative input used previously:
# '/Users/dankoban/Documents/EM6575/mallet_command_line/ct.wordtopiccounts'

# Destination of the tidy (term, topic, count) table.
TIDY_TOPICS_CSV_PATH = '/Users/dankoban/Documents/EM6575/twitter/twitter_tidy_topics.csv'
# Alternative output used previously:
# '/Users/dankoban/Documents/EM6575/mallet_command_line/ct_tidy_topics.csv'


def tidy_word_count(data, row):
    """Return the raw "topic:count" tokens of one parsed line as a DataFrame.

    Parameters
    ----------
    data : sequence of sequences of str
        Parsed lines; ``data[row][1]`` is the term and ``data[row][2:]`` are
        the "topic:count" tokens.
    row : int
        Index into ``data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``topic_count`` and ``term``, one row per token. If ``row``
        is out of range or the line has fewer than two tokens, an empty
        frame with the same two columns is returned.

    Only ``IndexError`` (missing row / too-short line) is treated as "no
    data". Any other error propagates instead of being silently swallowed.
    """
    try:
        tokens = data[row]
        term = tokens[1]
    except IndexError:
        return pd.DataFrame({'term': [], 'topic_count': []})

    df = pd.DataFrame({'topic_count': tokens[2:]})
    df['term'] = term
    return df


def read_word_topic_counts(path):
    """Read ``path`` and split every line on single spaces.

    The trailing newline is left on the last token of each line; it is
    removed later when the tokens are tidied.
    """
    with open(path, 'r') as infile:
        return [line.split(' ') for line in infile]


def tidy_word_topic_counts(data, progress_every=0):
    """Flatten all parsed lines into one long (term, topic, count) table.

    Parameters
    ----------
    data : sequence of sequences of str
        Parsed lines as produced by ``read_word_topic_counts``.
    progress_every : int, optional
        If non-zero, print "<n> out of <total>" every ``progress_every``
        lines. Default 0 (silent).

    Returns
    -------
    pandas.DataFrame
        Columns ``term``, ``topic``, ``count`` (all strings, exactly as they
        appear in the file) with a fresh 0..n-1 index. Lines with fewer than
        two tokens contribute no rows.

    Raises
    ------
    IndexError
        If a non-empty token does not contain a ":" separator.
    """
    records = []
    total = len(data)
    for done, tokens in enumerate(data, start=1):
        if len(tokens) >= 2:
            term = tokens[1]
            for pair in tokens[2:]:
                # The last token of each line still carries the newline.
                pair = pair.replace('\n', '')
                if not pair:
                    # Empty token (e.g. a doubled space): carries no data.
                    continue
                parts = pair.split(':')
                records.append((term, parts[0], parts[1]))
        if progress_every and done % progress_every == 0:
            print(f'{done} out of {total}')
    # Build the frame once instead of concatenating one frame per line.
    return pd.DataFrame(records, columns=['term', 'topic', 'count'])


# In a notebook kernel __name__ is "__main__", so this runs when the cell is
# executed. The guard only keeps the file I/O from firing if the helpers
# above are imported from elsewhere.
if __name__ == "__main__":
    data = read_word_topic_counts(WORD_TOPIC_COUNTS_PATH)
    df = tidy_word_topic_counts(data, progress_every=10000)
    df.to_csv(TIDY_TOPICS_CSV_PATH, index=False)

# %% [markdown]
# ## Load Topics by Documents / Parse into Tidy Structure
# %%
import pandas as pd

# Tab-separated doc/topic file to parse: doc id, name, then alternating
# topic / proportion fields.
# NOTE(review): the word-count cell reads its input from a 'hashtags/'
# sub-folder of 'hashtag model', this path does not -- confirm both are intended.
DOC_TOPICS_PATH = '/Users/dankoban/Documents/EM6575/twitter/hashtag model/hashtags.doctopics_sparse'
# Alternative input used previously:
# '/Users/dankoban/Documents/EM6575/mallet_command_line/ct.doctopics_sparse'

# Destination of the tidy (doc_id, name, topic, proportion, rank) table.
TIDY_DOCS2TOPICS_CSV_PATH = '/Users/dankoban/Documents/EM6575/twitter/tidy_docs2topics.csv'
# Alternative output used previously:
# '/Users/dankoban/Documents/EM6575/mallet_command_line/tidy_docs2topics.csv'


def read_doc_topics(path):
    """Read ``path``, strip trailing whitespace and split each line on tabs."""
    with open(path, 'r') as infile:
        return [line.rstrip().split('\t') for line in infile]


def tidy_doc_topics(rows):
    """Reshape wide doc/topic rows into one row per (document, rank).

    Parameters
    ----------
    rows : sequence of sequences of str
        Each row is ``[doc_id, name, topic_1, proportion_1, topic_2,
        proportion_2, ...]``. Rows may carry any number of topic/proportion
        pairs (the previous implementation required exactly three columns
        of pairs). A trailing topic without a proportion is ignored, as are
        rows with no complete pair.

    Returns
    -------
    pandas.DataFrame
        Columns ``doc_id``, ``name``, ``topic`` (float), ``proportion``
        (float), ``rank`` (1 for the first pair in a row, 2 for the second,
        ...). Rows are ordered rank-major: every rank-1 row in input order,
        then every rank-2 row, and so on. The index holds each row's
        position in ``rows``, so it repeats across ranks.

    Raises
    ------
    ValueError
        If a topic or proportion field cannot be converted to float.
    """
    by_rank = {}
    for position, fields in enumerate(rows):
        pairs = fields[2:]
        # Step through complete (topic, proportion) pairs only.
        for rank, start in enumerate(range(0, len(pairs) - 1, 2), start=1):
            by_rank.setdefault(rank, []).append(
                (position, fields[0], fields[1],
                 float(pairs[start]), float(pairs[start + 1]), rank)
            )

    ordered = [record for rank in sorted(by_rank) for record in by_rank[rank]]
    return pd.DataFrame(
        [record[1:] for record in ordered],
        columns=['doc_id', 'name', 'topic', 'proportion', 'rank'],
        index=[record[0] for record in ordered],
    )


# In a notebook kernel __name__ is "__main__", so this runs when the cell is
# executed. The guard only keeps the file I/O from firing if the helpers
# above are imported from elsewhere.
if __name__ == "__main__":
    data = read_doc_topics(DOC_TOPICS_PATH)
    # The first line is dropped unconditionally, as before.
    # NOTE(review): presumably a header line -- confirm the file has one,
    # otherwise the first document is lost.
    df = tidy_doc_topics(data[1:])
    df.to_csv(TIDY_DOCS2TOPICS_CSV_PATH, index=False)

# %% [markdown]
# ## Load data from step 1 and 2

# %% [markdown]
# Load the parsed data to save processing time.
] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | term | \n", "topic | \n", "count | \n", "
|---|---|---|---|
| 0 | \n", "bud | \n", "34 | \n", "179 | \n", "
| 1 | \n", "bud | \n", "49 | \n", "133 | \n", "
| 2 | \n", "bud | \n", "31 | \n", "117 | \n", "
| 3 | \n", "bud | \n", "29 | \n", "106 | \n", "
| 4 | \n", "bud | \n", "39 | \n", "76 | \n", "
| \n", " | topic | \n", "token_count | \n", "
|---|---|---|
| 0 | \n", "0 | \n", "1322510 | \n", "
| 1 | \n", "1 | \n", "749839 | \n", "
| 2 | \n", "2 | \n", "1972407 | \n", "
| 3 | \n", "3 | \n", "10062309 | \n", "
| 4 | \n", "4 | \n", "1522195 | \n", "
| \n", " | topic | \n", "top_n_terms | \n", "
|---|---|---|
| 0 | \n", "0 | \n", "coronavirus, covid, travel, japan, airlines, aviation, uae, flights, tourism, amp | \n", "
| 1 | \n", "1 | \n", "covid, coronavirus, join, bitcoin, contest, ecoins, crypto, free, earn, contestalert | \n", "
| 2 | \n", "2 | \n", "covid, support, food, amp, coronavirus, community, donate, ireland, local, people | \n", "
| 3 | \n", "3 | \n", "coronavirus, covid, health, pandemic, news, lockdown, outbreak, amp, virus, government | \n", "
| 4 | \n", "4 | \n", "covid, coronavirus, follow, stayhome, bts, bbb, highriskcovid, day, mondaythoughts, survival | \n", "
| 5 | \n", "5 | \n", "economy, people, urge, coronavirus, million, debt, package, needed, student, stimulate | \n", "
| 6 | \n", "6 | \n", "usa, covid, coronavirus, america, project, trump, amp, tuesdaythoughts, topics, amazing | \n", "
| 7 | \n", "7 | \n", "coronavirus, iran, covid, amp, yemen, israel, russia, people, health, syria | \n", "
| 8 | \n", "8 | \n", "covid, workfromhome, home, quarantinelife, make, coronavirus, extra, online, wfh, work | \n", "
| 9 | \n", "9 | \n", "covid, coronavirus, easter, god, hope, jesus, love, ramadan, prayer, pray | \n", "
| 10 | \n", "10 | \n", "coronavirus, redbubble, covid, art, findyourthing, support, products, awesome, printed, rbandme | \n", "
| 11 | \n", "11 | \n", "china, coronavirus, covid, wuhan, chinesevirus, wuhanvirus, chinese, chinavirus, world, ccp | \n", "
| 12 | \n", "12 | \n", "covid, coronavirus, u.s, education, students, school, trump, kids, pandemic, learning | \n", "
| 13 | \n", "13 | \n", "covid, coronavirus, australia, auspol, government, alert, world, australian, community, aus | \n", "
| 14 | \n", "14 | \n", "covid, coronavirus, dogs, cats, amp, animals, dog, pets, cat, animal | \n", "
| 15 | \n", "15 | \n", "covid, coronavirus, art, design, artist, corona, pandemic, drawing, quedateencasa, confinement | \n", "
| 16 | \n", "16 | \n", "covid, pakistan, coronavirus, kashmir, cases, amp, karachi, positive, lahore, islamabad | \n", "
| 17 | \n", "17 | \n", "pandemic, covid, coronavirus, amp, world, global, earthday, climatechange, crisis, virus | \n", "
| 18 | \n", "18 | \n", "covid, coronavirus, memes, italia, italy, tiktok, funny, corona, news, meme | \n", "
| 19 | \n", "19 | \n", "covid, coronavirus, nhs, coronavirusuk, borisjohnson, lockdown, amp, boris, people, news | \n", "
| 20 | \n", "20 | \n", "coronavirus, covid, economy, oil, stocks, market, markets, stockmarket, trading, recession | \n", "
| 21 | \n", "21 | \n", "coronavirus, covid, live, youtube, watch, twitch, gaming, video, gta, xbox | \n", "
| 22 | \n", "22 | \n", "covid, coronavirus, technology, tech, data, cybersecurity, google, apple, app, pandemic | \n", "
| 23 | \n", "23 | \n", "covid, music, coronavirus, love, nyc, london, nowplaying, unite, paris, hiphop | \n", "
| 24 | \n", "24 | \n", "covid, lockdown, coronavirus, india, indiafightscorona, corona, stayhomestaysafe, coronavirusindia, fight, stay | \n", "
| 25 | \n", "25 | \n", "covid, stayhome, staysafe, stayathome, coronavirus, stay, home, safe, stayhomesavelives, flattenthecurve | \n", "
| 26 | \n", "26 | \n", "coronavirus, covid, nba, sports, football, due, season, nfl, mlb, olympics | \n", "
| 27 | \n", "27 | \n", "covid, coronavirus, vaccine, patients, amp, sarscov, treatment, testing, test, cdc | \n", "
| 28 | \n", "28 | \n", "covid, coronavirus, cases, india, positive, total, amp, delhi, state, lockdown | \n", "
| 29 | \n", "29 | \n", "covid, coronavirus, corona, coronavirusoutbreak, coronaviruspandemic, coronavirusupdate, coronavirusupdates, virus, coronaoutbreak, cases | \n", "
| 30 | \n", "30 | \n", "covid, coronavirus, amp, sign, relief, petition, give, american, stimulus, pandemic | \n", "
| 31 | \n", "31 | \n", "covid, coronavirus, horny, sex, porn, sexy, ass, onlyfans, nudes, cum | \n", "
| 32 | \n", "32 | \n", "covid, coronavirus, quarantine, lockdown, quarantinelife, stayhome, day, socialdistancing, stayathome, home | \n", "
| 33 | \n", "33 | \n", "socialdistancing, covid, coronavirus, coronalockdown, stayathomeandstaysafe, listen, great, coronaupdate, click, music | \n", "
| 34 | \n", "34 | \n", "coronavirus, covid, people, amp, time, it's, don't, dont, good, i'm | \n", "
| 35 | \n", "35 | \n", "covid, amp, healthcare, workers, nurses, doctors, coronavirus, ppe, health, care | \n", "
| 36 | \n", "36 | \n", "covid, coronavirus, read, life, poetry, book, books, blog, free, motivation | \n", "
| 37 | \n", "37 | \n", "covid, coronavirus, maga, qanon, wwg, kag, fakenews, wga, amp, billgates | \n", "
| 38 | \n", "38 | \n", "coronavirus, covid, due, news, facebook, marketing, movie, twitter, socialmedia, film | \n", "
| 39 | \n", "39 | \n", "covid, coronavirus, lockdown, day, photography, nature, socialdistancing, love, stayhome, beautiful | \n", "
| 40 | \n", "40 | \n", "covid, coronavirus, health, mentalhealth, amp, anxiety, pandemic, care, support, tips | \n", "
| 41 | \n", "41 | \n", "covid, coronavirus, mask, masks, face, facemask, amp, facemasks, hands, virus | \n", "
| 42 | \n", "42 | \n", "covid, nigeria, lockdown, africa, coronavirus, day, oflockdown, lagos, kenya, ghana | \n", "
| 43 | \n", "43 | \n", "covid, georgia, atlanta, realestate, news, coronavirus, conspiracy, university, truth, medical | \n", "
| 44 | \n", "44 | \n", "coronavirus, covid, breaking, cases, nyc, state, newyork, florida, california, positive | \n", "
| 45 | \n", "45 | \n", "covid, coronavirus, amp, business, latest, crisis, pandemic, webinar, impact, read | \n", "
| 46 | \n", "46 | \n", "cases, covid, coronavirus, deaths, death, total, confirmed, italy, recovered, spain | \n", "
| 47 | \n", "47 | \n", "coronavirus, covid, trump, amp, trumpvirus, americans, president, cnn, donaldtrump, people | \n", "
| 48 | \n", "48 | \n", "covid, canada, news, coronavirus, latest, cdnpoli, ontario, daily, toronto, covidcanada | \n", "
| 49 | \n", "49 | \n", "covid, coronavirus, food, healthy, cannabis, health, coffee, lockdown, stayhome, immunity | \n", "
| \n", " | topic | \n", "word-length | \n", "
|---|---|---|
| 29 | \n", "29 | \n", "11.9 | \n", "
| 33 | \n", "33 | \n", "10.0 | \n", "
| 24 | \n", "24 | \n", "9.3 | \n", "
| 32 | \n", "32 | \n", "8.9 | \n", "
| 25 | \n", "25 | \n", "8.6 | \n", "
| \n", " | Term | \n", "uniform_dist | \n", "
|---|---|---|
| 0 | \n", "coronavirus | \n", "0.646345 | \n", "
| 1 | \n", "covid | \n", "0.549688 | \n", "
| 2 | \n", "travel | \n", "0.311497 | \n", "
| 3 | \n", "japan | \n", "0.155116 | \n", "
| 4 | \n", "airlines | \n", "0.099394 | \n", "
| \n", " | topic | \n", "uniform_dist | \n", "
|---|---|---|
| 0 | \n", "0 | \n", "6.889917 | \n", "
| 1 | \n", "1 | \n", "8.155044 | \n", "
| 2 | \n", "2 | \n", "6.823867 | \n", "
| 3 | \n", "3 | \n", "6.630374 | \n", "
| 4 | \n", "4 | \n", "6.574626 | \n", "
| \n", " | topic | \n", "corpus_dist | \n", "
|---|---|---|
| 0 | \n", "0 | \n", "2.343306 | \n", "
| 1 | \n", "1 | \n", "3.408515 | \n", "
| 2 | \n", "2 | \n", "1.863144 | \n", "
| 3 | \n", "3 | \n", "0.708723 | \n", "
| 4 | \n", "4 | \n", "2.564010 | \n", "
| \n", " | topic | \n", "eff_num_words | \n", "
|---|---|---|
| 0 | \n", "0 | \n", "126.926597 | \n", "
| 1 | \n", "1 | \n", "81.466377 | \n", "
| 2 | \n", "2 | \n", "128.929208 | \n", "
| 3 | \n", "3 | \n", "140.626300 | \n", "
| 4 | \n", "4 | \n", "115.824467 | \n", "
| \n", " | topic | \n", "rank_1_docs | \n", "
|---|---|---|
| 0 | \n", "0 | \n", "0.800777 | \n", "
| 1 | \n", "1 | \n", "0.784936 | \n", "
| 2 | \n", "2 | \n", "0.804646 | \n", "
| 3 | \n", "3 | \n", "0.766618 | \n", "
| 4 | \n", "4 | \n", "0.835698 | \n", "
| \n", " | topic | \n", "exclusivity | \n", "
|---|---|---|
| 0 | \n", "0 | \n", "0.556460 | \n", "
| 1 | \n", "1 | \n", "0.650400 | \n", "
| 2 | \n", "2 | \n", "0.235125 | \n", "
| 3 | \n", "3 | \n", "0.059785 | \n", "
| 4 | \n", "4 | \n", "0.379354 | \n", "
| \n", " | topic | \n", "token_count | \n", "word-length | \n", "uniform_dist | \n", "corpus_dist | \n", "eff_num_words | \n", "rank_1_docs | \n", "exclusivity | \n", "top_n_terms | \n", "
|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0 | \n", "1322510 | \n", "6.3 | \n", "6.889917 | \n", "2.343306 | \n", "126.926597 | \n", "0.800777 | \n", "0.556460 | \n", "coronavirus, covid, travel, japan, airlines, aviation, uae, flights, tourism, amp | \n", "
| 1 | \n", "1 | \n", "749839 | \n", "6.6 | \n", "8.155044 | \n", "3.408515 | \n", "81.466377 | \n", "0.784936 | \n", "0.650400 | \n", "covid, coronavirus, join, bitcoin, contest, ecoins, crypto, free, earn, contestalert | \n", "
| 2 | \n", "2 | \n", "1972407 | \n", "6.3 | \n", "6.823867 | \n", "1.863144 | \n", "128.929208 | \n", "0.804646 | \n", "0.235125 | \n", "covid, support, food, amp, coronavirus, community, donate, ireland, local, people | \n", "
| 3 | \n", "3 | \n", "10062309 | \n", "6.8 | \n", "6.630374 | \n", "0.708723 | \n", "140.626300 | \n", "0.766618 | \n", "0.059785 | \n", "coronavirus, covid, health, pandemic, news, lockdown, outbreak, amp, virus, government | \n", "
| 4 | \n", "4 | \n", "1522195 | \n", "7.4 | \n", "6.574626 | \n", "2.564010 | \n", "115.824467 | \n", "0.835698 | \n", "0.379354 | \n", "covid, coronavirus, follow, stayhome, bts, bbb, highriskcovid, day, mondaythoughts, survival | \n", "