{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from __future__ import division\n", "\n", "import graphlab as gl\n", "import pandas as pd\n", "import pyLDAvis\n", "import pyLDAvis.graphlab\n", "\n", "pyLDAvis.enable_notebook()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "stories_sf = gl.load_sframe(\"hn_processed.sframe\")\n", "bows = stories_sf['bow']" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [], "source": [ "topic_model = gl.topic_model.create(bows, num_topics=100, num_iterations=200)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "PreparedData(topic_coordinates= Freq cluster topics x y\n", "topic \n", "75 1.688977 1 1 -0.057952 -0.217365\n", "12 1.641991 1 2 -0.206117 -0.094519\n", "59 1.565019 1 3 0.094331 -0.175334\n", "28 1.520073 1 4 0.125690 -0.049068\n", "66 1.435865 1 5 -0.229133 -0.056118\n", "84 1.395605 1 6 -0.237074 -0.035809\n", "40 1.373063 1 7 -0.121392 -0.164682\n", "31 1.347493 1 8 -0.002535 -0.124955\n", "36 1.335604 1 9 -0.016238 -0.134871\n", "71 1.331412 1 10 -0.218830 -0.008796\n", "... ... ... ... ... ...\n", "20 0.678453 1 91 0.038958 0.044677\n", "21 0.668880 1 92 0.074634 0.070662\n", "67 0.637927 1 93 0.058855 0.015525\n", "99 0.619009 1 94 0.049423 0.042785\n", "44 0.613617 1 95 0.009581 0.082976\n", "9 0.609615 1 96 0.072137 0.041631\n", "96 0.601214 1 97 0.030744 0.038754\n", "17 0.593251 1 98 0.047218 0.046379\n", "78 0.582672 1 99 0.050412 0.122635\n", "48 0.525714 1 100 0.079501 0.037440\n", "\n", "[100 rows x 5 columns], topic_info= Category Freq Term Total loglift logprob\n", "41200 Default 42497.000000 ask hn 42497 30.0000 30.0000\n", "41346 Default 40745.000000 google 40745 29.0000 29.0000\n", "56649 Default 72069.000000 company 72069 28.0000 28.0000\n", "15422 Default 52906.000000 data 52906 27.0000 27.0000\n", "22492 Default 28441.000000 game 28441 26.0000 26.0000\n", "42334 Default 65200.000000 user 65200 25.0000 25.0000\n", "16588 Default 83550.000000 time 83550 24.0000 24.0000\n", "18294 Default 41087.000000 business 41087 23.0000 23.0000\n", "26443 Default 48357.000000 service 48357 22.0000 22.0000\n", "47426 Default 24391.000000 facebook 24391 21.0000 21.0000\n", "... ... ... ... ... ... ...\n", "7582 Topic100 2903.205952 idea 30253 2.8118 -2.9715\n", "7912 Topic100 584.092980 suggestion 2857 3.5400 -4.6031\n", "18581 Topic100 384.524906 dev 1686 3.6499 -5.0206\n", "16398 Topic100 313.169492 landing page 1238 3.7319 -5.2475\n", "31825 Topic100 650.288275 feedback 6338 2.8510 -4.4953\n", "31956 Topic100 345.635781 recommendation 2107 3.3437 -5.1039\n", "18385 Topic100 575.958091 founder 6336 2.6728 -4.6738\n", "45892 Topic100 320.128463 thought 4153 2.6022 -5.1668\n", "19037 Topic100 174.483005 ive 645 3.8156 -5.8157\n", "22903 Topic100 242.203101 hacker 7556 1.7456 -5.4249\n", "\n", "[9270 rows x 6 columns], token_table= Topic Freq Term\n", "term \n", "29975 18 0.600000 id\n", "14552 5 0.958678 # define\n", "14552 57 0.004132 # define\n", "32640 13 0.922280 # fff\n", "32640 34 0.025907 # fff\n", "49081 33 0.984104 # ifihadglass\n", "49081 37 0.001445 # ifihadglass\n", "10663 6 0.952239 % %\n", "10663 15 0.020896 % %\n", "47304 51 0.823529 % conversion rate\n", "... ... ... ...\n", "19578 43 0.003049 | |\n", "19578 51 0.003049 | |\n", "19578 67 0.030488 | |\n", "19578 79 0.003049 | |\n", "19578 88 0.746951 | |\n", "32899 88 0.750000 | |\n", "10915 2 0.015385 | | |\n", "10915 88 0.915385 | | |\n", "54801 29 0.024390 | | | |\n", "54801 88 0.934959 | | | |\n", "\n", "[46165 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[76, 13, 60, 29, 67, 85, 41, 32, 37, 72, 74, 52, 95, 2, 12, 58, 54, 89, 31, 98, 66, 8, 69, 78, 39, 4, 11, 20, 14, 26, 38, 81, 50, 6, 43, 59, 88, 48, 19, 47, 30, 33, 16, 75, 27, 71, 7, 86, 1, 99, 83, 73, 62, 53, 84, 87, 70, 34, 25, 57, 3, 94, 23, 44, 90, 56, 35, 96, 77, 61, 24, 17, 9, 63, 91, 46, 92, 55, 5, 64, 28, 82, 42, 36, 40, 15, 51, 93, 65, 80, 21, 22, 68, 100, 45, 10, 97, 18, 79, 49])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pyLDAvis.graphlab.prepare(topic_model, bows)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can fit more topics and the topics become more fine-grained. They become difficult to visualize in the intertopic map tough." ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "PreparedData(topic_coordinates= Freq cluster topics x y\n", "topic \n", "60 1.350765 1 1 -0.179230 0.119197\n", "68 1.336744 1 2 -0.203056 0.158559\n", "5 1.280542 1 3 0.054403 0.214696\n", "100 1.023161 1 4 0.100992 0.068121\n", "90 1.004791 1 5 0.129613 0.130627\n", "139 0.993098 1 6 0.027925 0.089292\n", "47 0.931807 1 7 -0.079042 0.164084\n", "64 0.921636 1 8 0.152395 -0.104804\n", "109 0.911188 1 9 -0.219534 0.048261\n", "94 0.911147 1 10 -0.184294 0.033838\n", "... ... ... ... ... ...\n", "142 0.440408 1 141 0.057679 -0.099049\n", "149 0.420437 1 142 0.037182 -0.020640\n", "102 0.415476 1 143 0.028267 -0.059498\n", "110 0.406851 1 144 0.007387 -0.155208\n", "39 0.405661 1 145 0.067892 -0.022455\n", "62 0.402688 1 146 0.040854 -0.144485\n", "83 0.387910 1 147 0.040055 -0.048369\n", "43 0.339908 1 148 0.034850 -0.069558\n", "143 0.326145 1 149 0.059832 -0.067740\n", "55 0.309987 1 150 0.034592 -0.062266\n", "\n", "[150 rows x 5 columns], topic_info= Category Freq Term Total loglift logprob\n", "41200 Default 42497.000000 ask hn 42497 30.0000 30.0000\n", "41346 Default 40745.000000 google 40745 29.0000 29.0000\n", "56649 Default 72069.000000 company 72069 28.0000 28.0000\n", "15422 Default 52906.000000 data 52906 27.0000 27.0000\n", "42334 Default 65200.000000 user 65200 26.0000 26.0000\n", "16588 Default 83550.000000 time 83550 25.0000 25.0000\n", "22492 Default 28441.000000 game 28441 24.0000 24.0000\n", "48004 Default 27353.000000 apple 27353 23.0000 23.0000\n", "26443 Default 48357.000000 service 48357 22.0000 22.0000\n", "18294 Default 41087.000000 business 41087 21.0000 21.0000\n", "... ... ... ... ... ... ...\n", "22903 Topic150 542.379737 hacker 7556 3.0420 -4.1285\n", "7713 Topic150 451.517748 custom 6725 2.9716 -4.3154\n", "22983 Topic150 111.727644 right place 627 3.9724 -5.6872\n", "19792 Topic150 194.379264 more detail 2788 3.0294 -5.1381\n", "49781 Topic150 242.469407 advice 4792 2.6586 -4.9673\n", "48290 Topic150 102.713511 nsa 546 4.0202 -5.7778\n", "32102 Topic150 154.752884 info 2679 2.8575 -5.3499\n", "11269 Topic150 158.008776 alternative 3582 2.5893 -5.3277\n", "34245 Topic150 55.731805 saas product 134 4.8465 -6.3563\n", "23611 Topic150 61.077239 3rd party 667 3.3297 -6.2681\n", "\n", "[12230 rows x 6 columns], token_table= Topic Freq Term\n", "term \n", "26750 52 0.067797 # ccc\n", "26750 53 0.711864 # ccc\n", "26750 125 0.016949 # ccc\n", "27501 53 0.812500 # ddd\n", "14552 2 0.847107 # define\n", "14552 19 0.090909 # define\n", "14552 111 0.004132 # define\n", "32640 2 0.005181 # fff\n", "32640 9 0.020725 # fff\n", "32640 52 0.020725 # fff\n", "... ... ... ...\n", "19578 70 0.006098 | |\n", "19578 90 0.091463 | |\n", "19578 117 0.003049 | |\n", "32899 144 0.019231 | |\n", "10915 4 0.023077 | | |\n", "10915 64 0.869231 | | |\n", "10915 94 0.015385 | | |\n", "54801 10 0.004065 | | | |\n", "54801 64 0.918699 | | | |\n", "54801 67 0.024390 | | | |\n", "\n", "[79381 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[61, 69, 6, 101, 91, 140, 48, 65, 110, 95, 8, 36, 94, 19, 105, 73, 132, 87, 18, 128, 24, 96, 138, 139, 20, 102, 5, 51, 133, 57, 49, 114, 109, 136, 21, 80, 59, 53, 14, 45, 13, 85, 135, 47, 147, 68, 32, 86, 23, 58, 99, 25, 106, 125, 52, 30, 50, 146, 119, 34, 43, 129, 1, 7, 113, 11, 42, 79, 64, 89, 28, 137, 123, 2, 122, 35, 66, 148, 26, 121, 12, 93, 10, 88, 46, 120, 75, 67, 70, 97, 33, 107, 142, 112, 16, 54, 38, 41, 74, 60, 62, 17, 92, 116, 39, 117, 145, 4, 134, 72, 22, 27, 9, 37, 82, 98, 130, 3, 76, 149, 141, 90, 77, 78, 83, 104, 127, 115, 31, 71, 100, 15, 55, 81, 29, 108, 126, 124, 131, 118, 143, 150, 103, 111, 40, 63, 84, 44, 144, 56])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "topic_model150 = gl.topic_model.create(bows, num_topics=150, num_iterations=200)\n", "pyLDAvis.graphlab.prepare(topic_model150, bows)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import re\n", "pos_re = re.compile(r'/(NOUN|ADJ|VERB|ADV)')\n", "\n", "def extract_dists(model, sf=stories_sf):\n", " data = pyLDAvis.graphlab._extract_data(model, sf['bow'])\n", " vocab = data['vocab'] = [pos_re.sub('', t).replace('_', ' ') for t in data['vocab']]\n", " vis_data = pyLDAvis.prepare(**data)\n", " vis_topic_order = vis_data.topic_order\n", " new_order = np.array(vis_topic_order) - 1\n", " topic_ids = range(1, len(new_order) + 1) \n", " data['topic_term_dists'] = pd.DataFrame(data['topic_term_dists'].T, index=vocab)[new_order]\n", " data['topic_term_dists'].columns = topic_ids\n", " data['doc_topic_dists'] = pd.DataFrame(data['doc_topic_dists'], index=sf['title'])[new_order]\n", " data['doc_topic_dists'].columns = topic_ids\n", " if vis_data:\n", " data['vis'] = vis_data\n", " return data" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [], "source": [ "model_data = extract_dists(topic_model)\n", "\n", "def topics_for(doc_name, doc_dist=model_data['doc_topic_dists']):\n", " return doc_dist.ix[doc_name].order(ascending=False)\n", "\n", "def _sort_cols(df, cols):\n", " res = df[cols].apply(lambda probs: probs.order(ascending=False).index)\n", " return res.reset_index(drop=True)\n", "\n", "def top_topic_terms(topic_ids, topic_term_dists=model_data['topic_term_dists']):\n", " return _sort_cols(topic_term_dists, topic_ids)\n", "\n", "def top_docs(topic_ids, doc_topic_dists=model_data['doc_topic_dists']):\n", " return _sort_cols(doc_topic_dists, topic_ids)\n", "\n", "def top_term_topics(term, topic_term_dists=model_data['topic_term_dists']):\n", " df = topic_term_dists.T[term].order(ascending=False)\n", " return df#.reset_index(drop=True)\n", "\n", "def all_top_terms(topic_term_dists=model_data['topic_term_dists']):\n", " return top_topic_terms(topic_term_dists.columns)\n", "\n", "def topic_docs(topic_id, doc_topic_dists=model_data['doc_topic_dists']):\n", " return doc_topic_dists[topic_id].order(ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Building a focused model around 'code'" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "59 0.075427\n", "16 0.053946\n", "5 0.040998\n", "2 0.024466\n", "51 0.024436\n", "6 0.021635\n", "48 0.020752\n", "68 0.019684\n", "12 0.009131\n", "61 0.004977\n", "Name: code, dtype: float64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "code_topics = top_term_topics('code')[0:10]\n", "code_topics" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "code_topics = code_topics[code_topics > 0.01]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "8" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(code_topics)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
5916525164868
0Static Analysis isnt Development TestingA Concept Design for C++Elucidating all about Code Analysis in Visual C++The Javascript GardenClarifying the Roles of the .gemspec and GemfileWorking with Design Patterns in JAVAASF comment on JSR#336 (Java 7) review ballotA Hacker\\u2019s Guide to Git
1Test Driven Development Really Works - The Hil...It Is Not Called The \"STL\", Mmkay?Collection of Examples of 64-bit Errors in Rea...JavaScript GardenClarifying the Roles of the .gemspec and GemfileWorking with interfaces in JAVAWhy Open Source misses the point of Free SoftwareGit Workflows & tutorials by Atlassian
2My love affair with code reviewsInteresting Standard Libraries to Study (Ltu)Improving and Fixing C CodeJavascript GardenRunning Pure Django Projects on Google App EngineDependency Injection - An Introductory Tutoria...Qt to ship standard in Ubuntu 11.10Cheat git
3Thoughts on Developer TestingA Brief, Incomplete, and Mostly Wrong History ...The Art of Picking Intel Registers (2003)Understanding Python DecoratorsBeginning Ember.js on Rails: Part 1Tinyweb Does MonoJSRs for Java 7 and Java 8 ApprovedDevelop Faster: Set Up Your Git Fork and Merge...
4The Problems with Unit Testing FrameworksMoving from Java to Scala - One year later...A Collection of Examples of 64-bit Errors in R...Understanding Python DecoratorsBackbone vs EmberUsing MEF to expose interfaces in your Silverl...Control Points and Steering Mechanisms in Open...Git: Merging the right way
\n", "
" ], "text/plain": [ " 59 16 \\\n", "0 Static Analysis isnt Development Testing A Concept Design for C++ \n", "1 Test Driven Development Really Works - The Hil... It Is Not Called The \"STL\", Mmkay? \n", "2 My love affair with code reviews Interesting Standard Libraries to Study (Ltu) \n", "3 Thoughts on Developer Testing A Brief, Incomplete, and Mostly Wrong History ... \n", "4 The Problems with Unit Testing Frameworks Moving from Java to Scala - One year later... \n", "\n", " 5 2 51 \\\n", "0 Elucidating all about Code Analysis in Visual C++ The Javascript Garden Clarifying the Roles of the .gemspec and Gemfile \n", "1 Collection of Examples of 64-bit Errors in Rea... JavaScript Garden Clarifying the Roles of the .gemspec and Gemfile \n", "2 Improving and Fixing C Code Javascript Garden Running Pure Django Projects on Google App Engine \n", "3 The Art of Picking Intel Registers (2003) Understanding Python Decorators Beginning Ember.js on Rails: Part 1 \n", "4 A Collection of Examples of 64-bit Errors in R... Understanding Python Decorators Backbone vs Ember \n", "\n", " 6 48 \\\n", "0 Working with Design Patterns in JAVA ASF comment on JSR#336 (Java 7) review ballot \n", "1 Working with interfaces in JAVA Why Open Source misses the point of Free Software \n", "2 Dependency Injection - An Introductory Tutoria... Qt to ship standard in Ubuntu 11.10 \n", "3 Tinyweb Does Mono JSRs for Java 7 and Java 8 Approved \n", "4 Using MEF to expose interfaces in your Silverl... Control Points and Steering Mechanisms in Open... \n", "\n", " 68 \n", "0 A Hacker\\u2019s Guide to Git \n", "1 Git Workflows & tutorials by Atlassian \n", "2 Cheat git \n", "3 Develop Faster: Set Up Your Git Fork and Merge... \n", "4 Git: Merging the right way " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top_docs(code_topics.index).head(5)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "code_topics = code_topics[code_topics > 0.02]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
59165251648
0Static Analysis isnt Development TestingA Concept Design for C++Elucidating all about Code Analysis in Visual C++The Javascript GardenClarifying the Roles of the .gemspec and GemfileWorking with Design Patterns in JAVAASF comment on JSR#336 (Java 7) review ballot
1Test Driven Development Really Works - The Hil...It Is Not Called The \"STL\", Mmkay?Collection of Examples of 64-bit Errors in Rea...JavaScript GardenClarifying the Roles of the .gemspec and GemfileWorking with interfaces in JAVAWhy Open Source misses the point of Free Software
2My love affair with code reviewsInteresting Standard Libraries to Study (Ltu)Improving and Fixing C CodeJavascript GardenRunning Pure Django Projects on Google App EngineDependency Injection - An Introductory Tutoria...Qt to ship standard in Ubuntu 11.10
3Thoughts on Developer TestingA Brief, Incomplete, and Mostly Wrong History ...The Art of Picking Intel Registers (2003)Understanding Python DecoratorsBeginning Ember.js on Rails: Part 1Tinyweb Does MonoJSRs for Java 7 and Java 8 Approved
4The Problems with Unit Testing FrameworksMoving from Java to Scala - One year later...A Collection of Examples of 64-bit Errors in R...Understanding Python DecoratorsBackbone vs EmberUsing MEF to expose interfaces in your Silverl...Control Points and Steering Mechanisms in Open...
\n", "
" ], "text/plain": [ " 59 16 \\\n", "0 Static Analysis isnt Development Testing A Concept Design for C++ \n", "1 Test Driven Development Really Works - The Hil... It Is Not Called The \"STL\", Mmkay? \n", "2 My love affair with code reviews Interesting Standard Libraries to Study (Ltu) \n", "3 Thoughts on Developer Testing A Brief, Incomplete, and Mostly Wrong History ... \n", "4 The Problems with Unit Testing Frameworks Moving from Java to Scala - One year later... \n", "\n", " 5 2 51 \\\n", "0 Elucidating all about Code Analysis in Visual C++ The Javascript Garden Clarifying the Roles of the .gemspec and Gemfile \n", "1 Collection of Examples of 64-bit Errors in Rea... JavaScript Garden Clarifying the Roles of the .gemspec and Gemfile \n", "2 Improving and Fixing C Code Javascript Garden Running Pure Django Projects on Google App Engine \n", "3 The Art of Picking Intel Registers (2003) Understanding Python Decorators Beginning Ember.js on Rails: Part 1 \n", "4 A Collection of Examples of 64-bit Errors in R... Understanding Python Decorators Backbone vs Ember \n", "\n", " 6 48 \n", "0 Working with Design Patterns in JAVA ASF comment on JSR#336 (Java 7) review ballot \n", "1 Working with interfaces in JAVA Why Open Source misses the point of Free Software \n", "2 Dependency Injection - An Introductory Tutoria... Qt to ship standard in Ubuntu 11.10 \n", "3 Tinyweb Does Mono JSRs for Java 7 and Java 8 Approved \n", "4 Using MEF to expose interfaces in your Silverl... Control Points and Steering Mechanisms in Open... " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top_docs(code_topics.index).head(5)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [], "source": [ "docs_ordred_by_code = model_data['doc_topic_dists'][code_topics.index].sum(axis=1).order(ascending=False)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def fit_focused_model(ordered_docs, num_topics, num_iters=100, threshold=0.1):\n", " subset = set(ordered_docs[ordered_docs > threshold].index)\n", " print('Keeping %.2f%% of the corpus...' % (100 * (len(subset) / len(ordered_docs))))\n", " # I should have kept the doc index around, oh well..\n", " stories_subset = stories_sf[stories_sf['title'].apply(lambda t: t in subset)]\n", " bows = stories_subset['bow'] \n", " print('Fitting model...')\n", " tm = gl.topic_model.create(bows, num_topics, num_iterations=num_iters)\n", " print('Creating vis data...')\n", " data = extract_dists(tm, stories_subset)\n", " data['model'] = tm\n", " return data" ] }, { "cell_type": "code", "execution_count": 206, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Keeping 1.55% of the corpus...\n", "Fitting model...\n", "PROGRESS: Learning a topic model\n", "PROGRESS: Number of documents 4051\n", "PROGRESS: Vocabulary size 23879\n", "PROGRESS: Running collapsed Gibbs sampling\n", "PROGRESS: +-----------+---------------+----------------+-----------------+\n", "PROGRESS: | Iteration | Elapsed Time | Tokens/Second | Est. Perplexity |\n", "PROGRESS: +-----------+---------------+----------------+-----------------+\n", "PROGRESS: | 10 | 1.00s | 4.22476e+06 | 0 |\n", "PROGRESS: | 20 | 1.93s | 3.79484e+06 | 0 |\n", "PROGRESS: | 30 | 2.85s | 4.20434e+06 | 0 |\n", "PROGRESS: | 40 | 3.71s | 4.29245e+06 | 0 |\n", "PROGRESS: | 50 | 4.61s | 4.1157e+06 | 0 |\n", "PROGRESS: | 60 | 5.50s | 3.64592e+06 | 0 |\n", "PROGRESS: | 70 | 6.40s | 3.58882e+06 | 0 |\n", "PROGRESS: | 80 | 7.32s | 3.33084e+06 | 0 |\n", "PROGRESS: | 90 | 8.27s | 4.14121e+06 | 0 |\n", "PROGRESS: | 100 | 9.18s | 4.06099e+06 | 0 |\n", "PROGRESS: | 110 | 10.08s | 4.37966e+06 | 0 |\n", "PROGRESS: | 120 | 10.96s | 3.95687e+06 | 0 |\n", "PROGRESS: | 130 | 11.88s | 4.38291e+06 | 0 |\n", "PROGRESS: | 140 | 12.73s | 4.06117e+06 | 0 |\n", "PROGRESS: | 150 | 13.59s | 4.11654e+06 | 0 |\n", "PROGRESS: | 160 | 14.50s | 3.57834e+06 | 0 |\n", "PROGRESS: | 170 | 15.41s | 3.93633e+06 | 0 |\n", "PROGRESS: | 180 | 16.30s | 4.14158e+06 | 0 |\n", "PROGRESS: | 190 | 17.18s | 4.14951e+06 | 0 |\n", "PROGRESS: | 200 | 18.08s | 4.19639e+06 | 0 |\n", "PROGRESS: | 210 | 18.94s | 4.28476e+06 | 0 |\n", "PROGRESS: | 220 | 19.82s | 3.92746e+06 | 0 |\n", "PROGRESS: | 230 | 20.67s | 4.25762e+06 | 0 |\n", "PROGRESS: | 240 | 21.58s | 4.15865e+06 | 0 |\n", "PROGRESS: | 250 | 22.46s | 4.28757e+06 | 0 |\n", "PROGRESS: | 260 | 23.42s | 3.50532e+06 | 0 |\n", "PROGRESS: | 270 | 24.34s | 4.14928e+06 | 0 |\n", "PROGRESS: | 280 | 25.21s | 4.27971e+06 | 0 |\n", "PROGRESS: | 290 | 26.09s | 3.83545e+06 | 0 |\n", "PROGRESS: | 300 | 26.97s | 4.07273e+06 | 0 |\n", "PROGRESS: | 310 | 27.89s | 4.24095e+06 | 0 |\n", "PROGRESS: | 320 | 28.78s | 4.0067e+06 | 0 |\n", "PROGRESS: | 330 | 29.67s | 4.07831e+06 | 0 |\n", "PROGRESS: | 340 | 30.58s | 4.29804e+06 | 0 |\n", "PROGRESS: | 350 | 31.52s | 3.49479e+06 | 0 |\n", "PROGRESS: | 360 | 32.45s | 4.16546e+06 | 0 |\n", "PROGRESS: | 370 | 33.39s | 2.94196e+06 | 0 |\n", "PROGRESS: | 380 | 34.29s | 3.39923e+06 | 0 |\n", "PROGRESS: | 390 | 35.34s | 3.587e+06 | 0 |\n", "PROGRESS: | 400 | 36.25s | 4.28361e+06 | 0 |\n", "PROGRESS: | 410 | 37.20s | 4.29764e+06 | 0 |\n", "PROGRESS: | 420 | 38.10s | 4.22593e+06 | 0 |\n", "PROGRESS: | 430 | 39.02s | 3.66926e+06 | 0 |\n", "PROGRESS: | 440 | 39.91s | 4.34361e+06 | 0 |\n", "PROGRESS: | 450 | 40.80s | 4.39185e+06 | 0 |\n", "PROGRESS: | 460 | 41.79s | 3.65108e+06 | 0 |\n", "PROGRESS: | 470 | 42.76s | 3.85248e+06 | 0 |\n", "PROGRESS: | 480 | 43.71s | 3.8152e+06 | 0 |\n", "PROGRESS: | 490 | 44.69s | 2.877e+06 | 0 |\n", "PROGRESS: | 500 | 45.72s | 3.40368e+06 | 0 |\n", "PROGRESS: +-----------+---------------+----------------+-----------------+\n", "Creating vis data...\n" ] } ], "source": [ "code_model = fit_focused_model(docs_ordred_by_code, 40, num_iters=500, threshold=0.25)" ] }, { "cell_type": "code", "execution_count": 207, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "PreparedData(topic_coordinates= Freq cluster topics x y\n", "topic \n", "25 3.629588 1 1 0.025235 -0.050535\n", "23 3.502043 1 2 0.085295 0.200824\n", "21 3.336403 1 3 0.191491 0.080827\n", "36 3.283851 1 4 0.179033 -0.131652\n", "6 3.237957 1 5 0.250050 -0.019619\n", "13 3.206140 1 6 -0.061180 -0.181079\n", "29 3.184069 1 7 0.201272 -0.078189\n", "7 3.171838 1 8 0.066442 -0.129942\n", "32 3.111444 1 9 -0.043864 0.064970\n", "33 3.107598 1 10 0.285290 0.021495\n", "... ... ... ... ... ...\n", "3 1.950338 1 31 -0.143769 0.108843\n", "9 1.940796 1 32 0.049890 0.041037\n", "5 1.935411 1 33 -0.080844 -0.014692\n", "22 1.921433 1 34 -0.031126 0.053605\n", "28 1.844085 1 35 0.071105 0.056181\n", "24 1.770056 1 36 -0.046568 -0.054045\n", "35 1.743148 1 37 0.052070 0.015846\n", "37 1.719299 1 38 -0.054085 -0.046161\n", "15 1.708756 1 39 -0.010949 0.018293\n", "20 1.665883 1 40 -0.004957 0.026704\n", "\n", "[40 rows x 5 columns], topic_info= Category Freq Term Total loglift logprob\n", "1107 Default 10656.000000 function 10656 30.0000 30.0000\n", "5737 Default 7158.000000 class 7158 29.0000 29.0000\n", "21163 Default 7673.000000 object 7673 28.0000 28.0000\n", "17141 Default 6715.000000 type 6715 27.0000 27.0000\n", "17101 Default 3997.000000 test 3997 26.0000 26.0000\n", "1370 Default 15340.000000 code 15340 25.0000 25.0000\n", "19531 Default 6574.000000 language 6574 24.0000 24.0000\n", "15158 Default 6572.000000 method 6572 23.0000 23.0000\n", "8624 Default 5466.000000 file 5466 22.0000 22.0000\n", "22279 Default 2962.000000 java 2962 21.0000 21.0000\n", "... ... ... ... ... ... ...\n", "12403 Topic40 127.471889 post 1285 1.6826 -4.7608\n", "17069 Topic40 77.354899 fun 355 2.4771 -5.2527\n", "5967 Topic40 68.291470 concurrency 268 2.6341 -5.3769\n", "6909 Topic40 170.870957 time 6165 0.3960 -4.4792\n", "10512 Topic40 81.826365 piece 555 2.0684 -5.2146\n", "7916 Topic40 59.659799 vector 273 2.4738 -5.5186\n", "10504 Topic40 47.215294 play 114 3.1412 -5.7246\n", "1023 Topic40 57.634714 detail 537 1.7629 -5.5530\n", "8695 Topic40 52.821073 note 456 1.8724 -5.6070\n", "5737 Topic40 68.636146 class 7158 -0.6807 -5.4067\n", "\n", "[5922 rows x 6 columns], token_table= Topic Freq Term\n", "term \n", "6064 7 0.969072 # define\n", "6064 8 0.010309 # define\n", "4408 16 0.346320 % %\n", "4408 22 0.004329 % %\n", "4408 35 0.632035 % %\n", "662 38 0.812500 % market share\n", "18595 11 0.500000 % marketshare\n", "17502 10 0.836364 + b\n", "17502 17 0.018182 + b\n", "17502 21 0.036364 + b\n", "... ... ... ...\n", "7743 34 0.037037 zlib\n", "7584 22 0.032258 zombie\n", "7584 25 0.870968 zombie\n", "4794 23 0.700000 zoo\n", "13665 20 0.700000 zuckerberg\n", "19186 6 0.833333 | category\n", "10898 13 0.993528 | login\n", "14517 27 0.625000 | march\n", "12680 9 0.727273 | permalink\n", "10031 35 0.727273 |f|\n", "\n", "[7637 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[26, 24, 22, 37, 7, 14, 30, 8, 33, 34, 31, 17, 15, 3, 35, 18, 5, 40, 20, 11, 9, 32, 19, 27, 1, 39, 28, 13, 12, 2, 4, 10, 6, 23, 29, 25, 36, 38, 16, 21])" ] }, "execution_count": 207, "metadata": {}, "output_type": "execute_result" } ], "source": [ "code_model['vis']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Example of looking at a document" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "57 0.141026\n", "38 0.132479\n", "65 0.072650\n", "70 0.072650\n", "Name: Game written by 14 year old passes Angry Birds as the top free iphone app, dtype: float64" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top_topics = topics_for('Game written by 14 year old passes Angry Birds as the top free iphone app').head(4)\n", "top_topics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Without LDAvis you would then look at the top words for those docs.. something like this:" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
57386570
0appgametimemovie
1appsplayerdayfilm
2developervideo gameworkshow
3applicationgaminghourstory
4userdeveloperweekepisode
\n", "
" ], "text/plain": [ " 57 38 65 70\n", "0 app game time movie\n", "1 apps player day film\n", "2 developer video game work show\n", "3 application gaming hour story\n", "4 user developer week episode" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top_topic_terms(top_topics.index)[0:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To look at the all of the topics you are reduced to looking at a wall of words or tables:" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
12345678910111213141516171819202122232425...767778798081828384858687888990919293949596979899100
0problemfunctionworldgovernmentcodeclassnumberscientistresearchsystemmarketbrowsercountryfilewikileakslanguagetimestartupdataandroidhealthlawenergyideacompany...experiencegooglecardesignbusinessvideoseoeventserviceblogemailmusicservicephotoquestionatauhomefebruaryshoefeatureindiaask hnask hnask hnask hn
1systemtypelifemoneybitmethodpointsciencestudyprocesscompanyjavascriptchinapackageinformationcodemancompanydatabasedevicedrugstatewatertimemarket...developersearchvehicledesignercompanycontentbrick marketingconferencephonepostmessageartistcustomerimageanswerandaservicejanuaryfashionlaptoptravelerrorshow hnwebsitestartup
2modelobjectmaneconomydataobjectalgorithmresearcherbrainapplicationsalefirefoxunited statedirectorygovernmentprogrammingyear agoentrepreneurtablephonepatientcourtpowergoalstock...skilluserdriverwebsitemoneyyoutubewebsitepresentationcallsitemailsoundcompanypicturetimedalamreal estatesitestylebatteryhoteldercommentdomainidea
3questioncodebooktaxbugcodeproblemcellresearcheruserproductelementworldcommanddocumentjavadayfounderquerygoogletreatmentobamamaterialproblemshare...companychromesystemweb designclientmediumcompanytalkcommunicationwordpressemail addressaudiobusinesscameratopicituhouseonlinesalephonematchrequest urlhacker newssiteadvice
4pointvargodbankprogramdataresultanimalpaperdatarevenuehtmlgovernmentversionassangeprogrammerdoginvestorsystemappledoctorlawyerplantlifeinvestor...engineerproductmodellogoideaflashmarketingdaytechnologyreaderservicebandsolutionwallpapercommunityuntukpropertytipqualityvideosportmitpostdomain nameweb app
\n", "

5 rows × 100 columns

\n", "
" ], "text/plain": [ " 1 2 3 4 5 6 7 8 9 10 11 12 13 \\\n", "0 problem function world government code class number scientist research system market browser country \n", "1 system type life money bit method point science study process company javascript china \n", "2 model object man economy data object algorithm researcher brain application sale firefox united state \n", "3 question code book tax bug code problem cell researcher user product element world \n", "4 point var god bank program data result animal paper data revenue html government \n", "\n", " 14 15 16 17 18 19 20 21 22 23 24 25 ... \\\n", "0 file wikileaks language time startup data android health law energy idea company ... \n", "1 package information code man company database device drug state water time market ... \n", "2 directory government programming year ago entrepreneur table phone patient court power goal stock ... \n", "3 command document java day founder query google treatment obama material problem share ... \n", "4 version assange programmer dog investor system apple doctor lawyer plant life investor ... \n", "\n", " 76 77 78 79 80 81 82 83 84 85 86 87 \\\n", "0 experience google car design business video seo event service blog email music \n", "1 developer search vehicle designer company content brick marketing conference phone post message artist \n", "2 skill user driver website money youtube website presentation call site mail sound \n", "3 company chrome system web design client medium company talk communication wordpress email address audio \n", "4 engineer product model logo idea flash marketing day technology reader service band \n", "\n", " 88 89 90 91 92 93 94 95 96 97 98 99 100 \n", "0 service photo question atau home february shoe feature india ask hn ask hn ask hn ask hn \n", "1 customer image answer anda service january fashion laptop travel error show hn website startup \n", "2 company picture time dalam real estate site style battery hotel der comment domain idea \n", "3 business camera topic itu house online sale phone match request url hacker news site advice \n", "4 solution wallpaper community untuk property tip quality video sport mit post domain name web app \n", "\n", "[5 rows x 100 columns]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_top_terms().head(5)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.8" }, "name": "Interpreting a model old school.ipynb" }, "nbformat": 4, "nbformat_minor": 0 }