{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from lda2vec import preprocess, Corpus\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "%matplotlib inline\n", "sns.set_context('poster')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You must be using a very recent version of pyLDAvis to use the lda2vec outputs. \n", "As of this writing, anything past Jan 6 2016 or this commit `14e7b5f60d8360eb84969ff08a1b77b365a5878e` should work.\n", "You can do this quickly by installing it directly from master like so:\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# pip install -U git+https://github.com/bmabey/pyLDAvis.git@master#egg=pyLDAvis" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pyLDAvis\n", "pyLDAvis.enable_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading in the saved model story topics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After runnning `lda2vec_run.py` script in `examples/hacker_news/lda2vec` directory `topics.story.pyldavis.npz` and `topics.author.pyldavis.npz` will be created that contain the topic-to-word probabilities and frequencies. What's left is to visualize and label each topic from the it's prevalent words." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "npz = np.load(open('topics.story.pyldavis.npz', 'r'))\n", "dat = {k: v for (k, v) in npz.iteritems()}\n", "dat['vocab'] = dat['vocab'].tolist()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Topic 1 rent control gentrification basic income more housing home ownership housing affordable housing gentrifying housing prices rents\n", "Topic 2 trackpoint xmonad mbp. macports thinkpad mbp sizeup out_of_vocabulary crashplan mechanical keyboard\n", "Topic 3 algebra calculus ebonics adhd reading speed math education meditations new words common core math classes\n", "Topic 4 cree top gear charging stations model s b&n 1gbps mattresses at&t broder starz\n", "Topic 5 google+ bing default search engine ddg g+ igoogle !g g+. google+. google reader\n", "Topic 6 cyclists f-35 tesla's hyperloop cyclist electric cars nest protect pedestrians autonomous cars fuel costs\n", "Topic 7 ender dawkins asperger ramanujan atheists savages gladwell isaacson alan turing psychopathy\n", "Topic 8 bitcoins bitcoin btc bitcoin price mtgox bitcoin economy btc. index funds liquidity bitcoin exchanges\n", "Topic 9 college education mba program idea guys business degree college dropouts gpa graduates higher education rock star grad schools\n", "Topic 10 morning person melatonin cardio naps adderall sleep schedule caffeine pullups weight training little sleep\n", "Topic 11 first language sicp. sicp ror. 
"Topic 12 current salary hiring managers hiring manager technical interviews performance reviews 60+ hours interviewing interviewer interviewers recruiter\n",
"Topic 13 helmet cardio carbs fasting diet lasik biking soylent vitamin d veggies\n",
"Topic 14 horvath ortiz eich eich's swartz adria adria richards whistleblower kerr edward snowden\n",
"Topic 15 2fa gpg fastmail factor authentication abp lastpass factor auth https encrypt pgp\n",
"Topic 16 tau quantum effects neutrinos qm asimov particles galaxies consciousness particle cosine\n",
"Topic 17 asian parents grades ap courses gpa grade inflation college experience good grades khan majoring hs\n",
"Topic 18 factor authentication fbi icann search warrant tor encrypting passwords privacy rights encrypt us jurisdiction\n",
"Topic 19 apple pay apple music whatsapp at&t ad blockers moto g patreon fire phone google play music prime video\n",
"Topic 20 slicehost yes\n",
"willing seeking freelancer - remote request\n",
"email work - remote remote\n",
"i yes\n",
"technologies remote\n",
"i'm no\n",
"technologies work - remote\n",
"i\n",
"Topic 21 chargify padmapper spreedly godaddy merchant account namecheap recurly paypal free users cc details\n",
"Topic 22 monotouch wp7 .net. bizspark .net stack .net webos microsoft stack 3.3.1 tizen\n",
"Topic 23 aclima backend engineers laundry delivery service team. we great communication skills top-floor office small engineering team\n",
"Topic 36 rim elop plurk zynga pincus patent system crunchpad software patents nortel patents htc\n",
"Topic 37 apple watch the surface pro 16:9 hdmi mac pro winamp good battery life upgradable big iphone steam box\n",
"Topic 38 snowden real terrorists nsa's terrorism whistleblower edward snowden assange terrorists 9/11 "war\n",
"Topic 39 consolas st2 inconsolata .vimrc vim zsh vim bindings iterm2 arrow keys svg\n",
"Topic 40 cloudfront docker dockerfile docker container graphql gitlab docker containers coreos dokku gogs\n"
]
}
],
"source": [
"top_n = 10\n",
"topic_to_topwords = {}\n",
"for j, topic_to_word in enumerate(dat['topic_term_dists']):\n",
" top = np.argsort(topic_to_word)[::-1][:top_n]\n",
" msg = 'Topic %i ' % (j+ 1)\n",
" top_words = [dat['vocab'][i].strip()[:35] for i in top]\n",
" msg += ' '.join(top_words)\n",
" print msg\n",
" topic_to_topwords[j] = top_words"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualize story topics"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"prepared_data_story = pyLDAvis.prepare(dat['topic_term_dists'], dat['doc_topic_dists'], \n",
" dat['doc_lengths'] * 1.0, dat['vocab'], dat['term_frequency'] * 1.0, sort_topics=False)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Reconstructed cell: renders the interactive pyLDAvis widget for the story\n",
"# topics. The original widget HTML output is not preserved in this copy.\n",
"pyLDAvis.display(prepared_data_story)"
]
},
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" story_id \n",
" story_time \n",
" story_url \n",
" story_text \n",
" story_author \n",
" comment_id \n",
" comment_text \n",
" comment_author \n",
" comment_ranking \n",
" author_comment_count \n",
" story_comment_count \n",
" story_id_codes \n",
" author_id_codes \n",
" time_id_codes \n",
" days_since \n",
" story_dt \n",
" \n",
" \n",
" 1165434 \n",
" 1013531 \n",
" 1261638606 \n",
" NaN \n",
" For the year 2010, I plan to:\\n1. Learn Clojur... \n",
" aitoehigie \n",
" 1013543 \n",
" I plan to end 2010 with 10x as many customers ... \n",
" cperciva \n",
" 2 \n",
" 1346 \n",
" 51 \n",
" 5787 \n",
" 5787 \n",
" 1037 \n",
" 1037.401030 \n",
" 2009-12-24 07:10:06 \n",
" \n",
" \n",
" 1165435 \n",
" 1013531 \n",
" 1261638606 \n",
" NaN \n",
" For the year 2010, I plan to:\\n1. Learn Clojur... \n",
" aitoehigie \n",
" 1013710 \n",
" Being very close to graduate in the end of the... \n",
" infrequent_author \n",
" 39 \n",
" 74 \n",
" 51 \n",
" 5787 \n",
" 5787 \n",
" 1037 \n",
" 1037.401030 \n",
" 2009-12-24 07:10:06 \n",
" \n",
" \n",
" 1165436 \n",
" 4312761 \n",
" 1343662100 \n",
" http://code.google.com/p/chromium/issues/detai... \n",
" NaN \n",
" eranation \n",
" 4313810 \n",
" Not surprising. The amount of <i>aggressive</i... \n",
" infrequent_author \n",
" 1 \n",
" 46 \n",
" 11 \n",
" 27058 \n",
" 27058 \n",
" 1986 \n",
" 1986.747025 \n",
" 2012-07-30 15:28:20 \n",
" \n",
" \n",
" 1165437 \n",
" 9804349 \n",
" 1435663051 \n",
" http://blogs.aws.amazon.com/security/post/TxCK... \n",
" NaN \n",
" ukj \n",
" 9804795 \n",
" If I counted right:<p><pre><code> OCaml TLS: ... \n",
" edwintorok \n",
" 0 \n",
" 93 \n",
" 17 \n",
" 62571 \n",
" 62571 \n",
" 3051 \n",
" 3051.572847 \n",
" 2015-06-30 11:17:31 \n",
" \n",
" \n",
" \n",
"1165438 \n",
" 6765099 \n",
" 1384901786 \n",
" http://www.theatlantic.com/technology/archive/... \n",
" NaN \n",
" sinak \n",
" 6767538 \n",
" We are educated to speak well our language (en... \n",
" infrequent_author \n",
" 10 \n",
" 91 \n",
" 41 \n",
" 43699 \n",
" 43699 \n",
" 2464 \n",
" 2464.058206 \n",
" 2013-11-19 22:56:26 \n",
"
nothingcan'twe've great web software regular retrospectives feature nuances round. OCaml TLS: ... \n",
"1165438 6767538 We are educated to speak well our language (en... \n",
"\n",
" comment_author comment_ranking author_comment_count \\\n",
"1165434 cperciva 2 1346 \n",
"1165435 infrequent_author 39 74 \n",
"1165436 infrequent_author 1 46 \n",
"1165437 edwintorok 0 93 \n",
"1165438 infrequent_author 10 91 \n",
"\n",
" story_comment_count story_id_codes author_id_codes time_id_codes \\\n",
"1165434 51 5787 5787 1037 \n",
"1165435 51 5787 5787 1037 \n",
"1165436 11 27058 27058 1986 \n",
"1165437 17 62571 62571 3051 \n",
"1165438 41 43699 43699 2464 \n",
"\n",
" days_since story_dt \n",
"1165434 1037.401030 2009-12-24 07:10:06 \n",
"1165435 1037.401030 2009-12-24 07:10:06 \n",
"1165436 1986.747025 2012-07-30 15:28:20 \n",
"1165437 3051.572847 2015-06-30 11:17:31 \n",
"1165438 2464.058206 2013-11-19 22:56:26 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"features.tail()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"features.to_pickle(\"../data/features.pd\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Individual documents"
]
},
{
"cell_type": "code",
"execution_count": 353,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"top_urls = features['story_url'].value_counts().index\n",
"mask = features['story_url'] == top_urls[1]\n",
"story_id_code = features[mask].story_id_codes.values[0]\n",
"story_id_url = features[mask].story_url.values[0]"
]
},
{
"cell_type": "code",
"execution_count": 354,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u'http://googleblog.blogspot.com/2013/03/a-second-spring-of-cleaning.html'"
]
},
"execution_count": 354,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"story_id_url"
]
},
{
"cell_type": "code",
"execution_count": 355,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"topics=dat['doc_topic_dists'][story_id_code]"
]
},
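{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cells refer to a `labels` list holding a short human-readable name for each topic; the cell that defined it did not survive in this copy. In the original, the names were written by hand from the top words printed above (e.g. 'bing, google, facebook, search engines'). The sketch below builds stand-in labels automatically from those top words -- replace them with hand-written names where you can."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Stand-in topic names, one per topic, built from the top words printed\n",
"# earlier. Replace these with hand-written labels after reading the topics.\n",
"n_topics = dat['doc_topic_dists'].shape[1]\n",
"labels = [', '.join(topic_to_topwords[j][:4]) for j in range(n_topics)]"
]
},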
{
"cell_type": "code",
"execution_count": 356,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"27% bing, google, facebook, search engines\n",
"15% karma, votes, comments, stories, rss\n",
"08% online payments, banking, domain registration, user accounts\n",
"07% internet security, passwords, authentication\n",
"05% computer hardware and monitors\n"
]
}
],
"source": [
"msg = \"{fraction:02d}% {text:s}\"\n",
"for idx in np.argsort(topics)[::-1][:5]:\n",
" print msg.format(fraction=int(100.0 * topics[idx]), text=labels[idx])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Looking at these topics and then reading the [HN article comments](u'http://googleblog.blogspot.com/2013/03/a-second-spring-of-cleaning.html') this is about Google Reader shutting down -- it's appropriate that the top topic is about Google itself and the second topic is about RSS."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Plots of topics vs time"
]
},
{
"cell_type": "code",
"execution_count": 359,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"cols = [u'story_comment_count', 'story_time', 'story_url', 'story_text', 'days_since', 'story_dt']\n",
"stories = features.groupby('story_id_codes')[cols].min().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 524,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"stories = stories.rename(columns={'story_dt': 'Article Date'})"
]
},
{
"cell_type": "code",
"execution_count": 527,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"story_topics = pd.DataFrame(dict(story_id_codes=np.arange(dat['doc_topic_dists'].shape[0])))\n",
"for idx in range(len(labels)):\n",
" story_topics[labels[idx]] = dat['doc_topic_dists'][:, idx]\n",
"trends = stories.merge(story_topics, on='story_id_codes')\n",
"trends['day'] = np.floor(trends['days_since'].values)\n",
"by_day = pd.pivot_table(trends, index=['day', 'story_time'])"
]
},
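{
"cell_type": "markdown",
"metadata": {},
"source": [
"The plotting cells did not survive in this copy. Below is a minimal sketch of one topic-versus-time curve, assuming `labels` holds the topic names used as columns above: take the daily mean weight for a topic, smooth it with a rolling window, and plot it against the day index."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Daily mean weight for a single topic, smoothed over a 30-day window.\n",
"topic = labels[0]  # pick any topic name\n",
"daily = by_day[topic].groupby(level='day').mean()\n",
"ax = daily.rolling(window=30).mean().plot(figsize=(12, 6))\n",
"ax.set_xlabel('Days since first story')\n",
"ax.set_ylabel('Mean topic weight')\n",
"ax.set_title(topic)"
]
},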
{
"cell_type": "code",
"execution_count": 528,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"
/code>
--------------------------\n",
"Topic 14 twiddla out_of_vocabulary
ie- 401k twiddla "rent er nurse /\n"
]
}
],
"source": [
"npz = np.load(open('topics.author.pyldavis.npz', 'r'))\n",
"dat = {k: v for (k, v) in npz.iteritems()}\n",
"dat['vocab'] = dat['vocab'].tolist()\n",
"top_n = 10\n",
"topic_to_topwords = {}\n",
"for j, topic_to_word in enumerate(dat['topic_term_dists']):\n",
" top = np.argsort(topic_to_word)[::-1][:top_n]\n",
" msg = 'Topic %i ' % j\n",
" top_words = [dat['vocab'][i].strip()[:35] for i in top]\n",
" msg += ' '.join(top_words)\n",
" print msg\n",
" topic_to_topwords[j] = top_words"
]
},
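{
"cell_type": "markdown",
"metadata": {},
"source": [
"No `prepare` call for the author model survives in this copy of the notebook; mirroring the story-topics call above should reconstruct it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"prepared_data_author = pyLDAvis.prepare(dat['topic_term_dists'], dat['doc_topic_dists'],\n",
"                                        dat['doc_lengths'] * 1.0, dat['vocab'], dat['term_frequency'] * 1.0, sort_topics=False)"
]
},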
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"\n",
"\n",
"\n",
""
],
"text/plain": [
"