{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pyLDAvis\n", "import pyLDAvis.sklearn\n", "from pyLDAvis._prepare import (js_PCoA, js_MMDS, js_TSNE)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.datasets import fetch_20newsgroups\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.decomposition import LatentDirichletAllocation" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pyLDAvis.enable_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## load data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11314\n" ] } ], "source": [ "# Strip headers, footers and quoted replies so only the body text of each post is kept.\n", "newsgroup = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))\n", "docs_raw = newsgroup.data\n", "# Number of documents loaded.\n", "print(len(docs_raw))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## document-term matrix" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Keep alphabetic tokens of 2+ letters only; drop English stop words, terms found in\n", "# more than half of the documents (max_df) and terms found in fewer than 10 documents (min_df).\n", "tfidf_vectorizer = TfidfVectorizer(\n", "    strip_accents='unicode',\n", "    stop_words='english',\n", "    token_pattern=r'\\b[a-zA-Z]{2,}\\b',\n", "    max_df=0.5,\n", "    min_df=10,\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(11314, 9597)\n" ] } ], "source": [ "# Document-term matrix: one row per document, one column per vocabulary term.\n", "dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)\n", "print(dtm_tfidf.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## latent dirichlet allocation" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,\n", " 
evaluate_every=-1, learning_decay=0.7,\n", " learning_method='batch', learning_offset=10.0,\n", " max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,\n", " n_jobs=1, n_topics=20, perp_tol=0.1, random_state=0,\n", " topic_word_prior=None, total_samples=1000000.0, verbose=0)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lda = LatentDirichletAllocation(n_topics=20, learning_method='batch', random_state=0)\n", "lda.fit(dtm_tfidf)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## pyLDAvis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### PCoA / CMDS" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "PreparedData(topic_coordinates= Freq cluster topics x y\n", "topic \n", "0 10.503341 1 1 0.171704 0.164112\n", "1 28.438815 1 2 0.201800 -0.089922\n", "2 0.994623 1 3 -0.065989 0.015844\n", "3 0.948237 1 4 -0.063565 0.018540\n", "4 1.029461 1 5 -0.064392 0.004507\n", "5 10.277859 1 6 0.141423 0.089090\n", "6 11.955222 1 7 0.143332 -0.023628\n", "7 0.931453 1 8 -0.051261 0.014630\n", "8 1.315214 1 9 -0.106246 0.023164\n", "9 1.325113 1 10 -0.069525 -0.020487\n", "10 4.498473 1 11 0.120252 -0.247388\n", "11 17.741730 1 12 0.149401 0.115454\n", "12 1.824888 1 13 -0.028772 -0.098892\n", "13 1.057498 1 14 -0.071256 0.027029\n", "14 0.981087 1 15 -0.067756 0.020545\n", "15 1.122539 1 16 -0.072022 0.017797\n", "16 0.988056 1 17 -0.066088 0.012687\n", "17 1.656893 1 18 -0.045095 -0.091089\n", "18 1.472447 1 19 -0.091967 0.031726\n", "19 0.937052 1 20 -0.063979 0.016282, topic_info= Category Freq Term Total loglift logprob\n", "term \n", "2638 Default 67.000000 drive 67.000000 30.0000 30.0000\n", "3656 Default 81.000000 god 81.000000 29.0000 29.0000\n", "8618 Default 108.000000 thanks 108.000000 28.0000 28.0000\n", "9398 Default 76.000000 windows 76.000000 27.0000 27.0000\n", "1203 Default 56.000000 card 56.000000 26.0000 26.0000\n", "3263 Default 54.000000 file 54.000000 25.0000 25.0000\n", "3266 Default 40.000000 files 40.000000 24.0000 24.0000\n", "2741 Default 87.000000 edu 87.000000 23.0000 23.0000\n", "7588 Default 31.000000 scsi 31.000000 22.0000 22.0000\n", "3483 Default 31.000000 ftp 31.000000 21.0000 21.0000\n", "9159 Default 40.000000 version 40.000000 20.0000 20.0000\n", "9397 Default 41.000000 window 41.000000 19.0000 19.0000\n", "6659 Default 54.000000 program 54.000000 18.0000 18.0000\n", "2569 Default 125.000000 does 125.000000 17.0000 17.0000\n", "9178 Default 35.000000 video 35.000000 16.0000 16.0000\n", "5488 Default 34.000000 monitor 34.000000 15.0000 15.0000\n", "2499 Default 34.000000 disk 34.000000 14.0000 14.0000\n", "2593 
Default 42.000000 dos 42.000000 13.0000 13.0000\n", "5120 Default 68.000000 mail 68.000000 12.0000 12.0000\n", "124 Default 38.000000 address 38.000000 11.0000 11.0000\n", "1561 Default 61.000000 com 61.000000 10.0000 10.0000\n", "3527 Default 48.000000 game 48.000000 9.0000 9.0000\n", "1105 Default 26.000000 bus 26.000000 8.0000 8.0000\n", "8530 Default 40.000000 team 40.000000 7.0000 7.0000\n", "8598 Default 24.000000 test 24.000000 6.0000 6.0000\n", "3837 Default 45.000000 hard 45.000000 5.0000 5.0000\n", "4574 Default 38.000000 jesus 38.000000 4.0000 4.0000\n", "3948 Default 46.000000 hi 46.000000 3.0000 3.0000\n", "2642 Default 23.000000 drives 23.000000 2.0000 2.0000\n", "1861 Default 22.000000 controller 22.000000 1.0000 1.0000\n", "... ... ... ... ... ... ...\n", "7364 Topic20 0.958735 robotics 2.231489 3.8254 -6.3447\n", "2669 Topic20 2.012382 duke 4.928634 3.7744 -5.6032\n", "2898 Topic20 0.808980 envelopes 2.029844 3.7502 -6.5145\n", "8683 Topic20 3.866897 ticket 9.774426 3.7429 -4.9501\n", "9529 Topic20 1.119211 xputimage 3.144183 3.6373 -6.1899\n", "5750 Topic20 2.096357 nonsense 6.293258 3.5709 -5.5624\n", "7490 Topic20 0.727430 samuel 2.272779 3.5309 -6.6208\n", "6833 Topic20 0.679060 quarterly 2.136343 3.5240 -6.6896\n", "4565 Topic20 1.299791 jeep 4.101494 3.5210 -6.0404\n", "1122 Topic20 0.498698 bw 1.623146 3.4901 -6.9983\n", "870 Topic20 0.885381 bite 2.892924 3.4862 -6.4243\n", "3560 Topic20 1.082239 geez 3.630871 3.4597 -6.2235\n", "2382 Topic20 1.070237 devoted 3.599249 3.4573 -6.2347\n", "7456 Topic20 0.469594 rx 1.589034 3.4512 -7.0585\n", "9594 Topic20 0.350972 zq 1.188957 3.4501 -7.3496\n", "7459 Topic20 0.528406 rz 1.818195 3.4345 -6.9405\n", "6231 Topic20 0.467564 perl 1.614043 3.4312 -7.0628\n", "8773 Topic20 0.312267 tp 1.128945 3.3850 -7.4665\n", "3537 Topic20 1.903473 gary 6.913224 3.3804 -5.6589\n", "8538 Topic20 1.309161 technician 4.886299 3.3531 -6.0332\n", "9122 Topic20 0.914287 variant 3.339278 3.3748 -6.3922\n", "8845 Topic20 
1.281730 tree 5.006041 3.3078 -6.0544\n", "8598 Topic20 4.318902 test 24.794514 2.9226 -4.8396\n", "7422 Topic20 0.873161 rr 3.403104 3.3099 -6.4382\n", "2153 Topic20 3.273311 david 28.411704 2.5092 -5.1168\n", "6392 Topic20 1.331068 pm 7.337767 2.9631 -6.0166\n", "2495 Topic20 1.007643 diseases 5.368270 2.9973 -6.2950\n", "2714 Topic20 1.009780 eating 6.424809 2.8198 -6.2928\n", "380 Topic20 0.964631 apollo 5.794740 2.8772 -6.3386\n", "2131 Topic20 0.879309 danny 3.686446 3.2369 -6.4312\n", "\n", "[1016 rows x 6 columns], token_table= Topic Freq Term\n", "term \n", "17 19 0.597014 abs\n", "22 14 0.666034 absolutes\n", "39 11 0.836374 accelerator\n", "116 2 0.527937 adb\n", "116 5 0.351958 adb\n", "124 2 0.625637 address\n", "124 7 0.052136 address\n", "124 10 0.104273 address\n", "124 11 0.052136 address\n", "124 12 0.078205 address\n", "124 18 0.078205 address\n", "138 12 0.888627 administration\n", "145 14 0.411956 admitting\n", "150 18 0.719837 adress\n", "154 2 0.588642 advance\n", "154 6 0.024527 advance\n", "154 7 0.049054 advance\n", "154 10 0.024527 advance\n", "154 11 0.245268 advance\n", "154 13 0.024527 advance\n", "154 18 0.049054 advance\n", "170 4 0.498933 advocacy\n", "237 2 0.303395 al\n", "237 6 0.424753 al\n", "237 12 0.121358 al\n", "237 16 0.121358 al\n", "256 7 0.460420 aliases\n", "256 15 0.460420 aliases\n", "278 6 0.403502 alomar\n", "278 16 0.538003 alomar\n", "... ... ... 
...\n", "9519 13 0.657460 xdm\n", "9529 9 0.318048 xputimage\n", "9529 12 0.318048 xputimage\n", "9529 20 0.318048 xputimage\n", "9544 2 0.171415 yankees\n", "9544 7 0.171415 yankees\n", "9544 10 0.514245 yankees\n", "9550 1 0.407677 yea\n", "9550 8 0.407677 yea\n", "9552 1 0.015859 year\n", "9552 2 0.190311 year\n", "9552 6 0.555072 year\n", "9552 7 0.111014 year\n", "9552 12 0.126874 year\n", "9554 1 0.085130 years\n", "9554 2 0.238364 years\n", "9554 6 0.221338 years\n", "9554 7 0.170260 years\n", "9554 12 0.272416 years\n", "9568 6 0.218476 yo\n", "9568 12 0.218476 yo\n", "9568 17 0.436952 yo\n", "9575 19 0.780507 yr\n", "9578 1 0.148381 yup\n", "9578 8 0.593524 yup\n", "9578 12 0.148381 yup\n", "9590 2 0.073832 zip\n", "9590 11 0.073832 zip\n", "9590 18 0.738321 zip\n", "9590 19 0.073832 zip\n", "\n", "[2101 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pyLDAvis.sklearn.prepare(lda, dtm_tfidf, tfidf_vectorizer, sort_topics=False, mds=js_PCoA)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### MMDS" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "PreparedData(topic_coordinates= Freq cluster topics x y\n", "topic \n", "0 10.503341 1 1 -0.293776 -0.123281\n", "1 28.438815 1 2 -0.260388 0.012623\n", "2 0.994623 1 3 0.032939 -0.016665\n", "3 0.948237 1 4 0.005734 -0.038208\n", "4 1.029461 1 5 0.105541 0.039861\n", "5 10.277859 1 6 -0.142686 0.256511\n", "6 11.955222 1 7 -0.223201 0.130338\n", "7 0.931453 1 8 -0.014763 -0.005552\n", "8 1.315214 1 9 0.035621 -0.129733\n", "9 1.325113 1 10 0.133553 -0.109759\n", "10 4.498473 1 11 0.027108 -0.323114\n", "11 17.741730 1 12 0.020211 0.287044\n", "12 1.824888 1 13 -0.090007 -0.189933\n", "13 1.057498 1 14 0.030210 0.066760\n", "14 0.981087 1 15 -0.009940 0.031133\n", "15 1.122539 1 16 0.060389 0.107625\n", "16 0.988056 1 17 0.087234 -0.027060\n", "17 1.656893 1 18 0.235478 -0.072457\n", "18 1.472447 1 19 0.193287 0.093513\n", "19 0.937052 1 20 0.067459 0.010357, topic_info= Category Freq Term Total loglift logprob\n", "term \n", "2638 Default 67.000000 drive 67.000000 30.0000 30.0000\n", "3656 Default 81.000000 god 81.000000 29.0000 29.0000\n", "8618 Default 108.000000 thanks 108.000000 28.0000 28.0000\n", "9398 Default 76.000000 windows 76.000000 27.0000 27.0000\n", "1203 Default 56.000000 card 56.000000 26.0000 26.0000\n", "3263 Default 54.000000 file 54.000000 25.0000 25.0000\n", "3266 Default 40.000000 files 40.000000 24.0000 24.0000\n", "2741 Default 87.000000 edu 87.000000 23.0000 23.0000\n", "7588 Default 31.000000 scsi 31.000000 22.0000 22.0000\n", "3483 Default 31.000000 ftp 31.000000 21.0000 21.0000\n", "9159 Default 40.000000 version 40.000000 20.0000 20.0000\n", "9397 Default 41.000000 window 41.000000 19.0000 19.0000\n", "6659 Default 54.000000 program 54.000000 18.0000 18.0000\n", "2569 Default 125.000000 does 125.000000 17.0000 17.0000\n", "9178 Default 35.000000 video 35.000000 16.0000 16.0000\n", "5488 Default 34.000000 monitor 34.000000 15.0000 15.0000\n", "2499 Default 34.000000 disk 34.000000 14.0000 14.0000\n", "2593 
Default 42.000000 dos 42.000000 13.0000 13.0000\n", "5120 Default 68.000000 mail 68.000000 12.0000 12.0000\n", "124 Default 38.000000 address 38.000000 11.0000 11.0000\n", "1561 Default 61.000000 com 61.000000 10.0000 10.0000\n", "3527 Default 48.000000 game 48.000000 9.0000 9.0000\n", "1105 Default 26.000000 bus 26.000000 8.0000 8.0000\n", "8530 Default 40.000000 team 40.000000 7.0000 7.0000\n", "8598 Default 24.000000 test 24.000000 6.0000 6.0000\n", "3837 Default 45.000000 hard 45.000000 5.0000 5.0000\n", "4574 Default 38.000000 jesus 38.000000 4.0000 4.0000\n", "3948 Default 46.000000 hi 46.000000 3.0000 3.0000\n", "2642 Default 23.000000 drives 23.000000 2.0000 2.0000\n", "1861 Default 22.000000 controller 22.000000 1.0000 1.0000\n", "... ... ... ... ... ... ...\n", "7364 Topic20 0.958735 robotics 2.231489 3.8254 -6.3447\n", "2669 Topic20 2.012382 duke 4.928634 3.7744 -5.6032\n", "2898 Topic20 0.808980 envelopes 2.029844 3.7502 -6.5145\n", "8683 Topic20 3.866897 ticket 9.774426 3.7429 -4.9501\n", "9529 Topic20 1.119211 xputimage 3.144183 3.6373 -6.1899\n", "5750 Topic20 2.096357 nonsense 6.293258 3.5709 -5.5624\n", "7490 Topic20 0.727430 samuel 2.272779 3.5309 -6.6208\n", "6833 Topic20 0.679060 quarterly 2.136343 3.5240 -6.6896\n", "4565 Topic20 1.299791 jeep 4.101494 3.5210 -6.0404\n", "1122 Topic20 0.498698 bw 1.623146 3.4901 -6.9983\n", "870 Topic20 0.885381 bite 2.892924 3.4862 -6.4243\n", "3560 Topic20 1.082239 geez 3.630871 3.4597 -6.2235\n", "2382 Topic20 1.070237 devoted 3.599249 3.4573 -6.2347\n", "7456 Topic20 0.469594 rx 1.589034 3.4512 -7.0585\n", "9594 Topic20 0.350972 zq 1.188957 3.4501 -7.3496\n", "7459 Topic20 0.528406 rz 1.818195 3.4345 -6.9405\n", "6231 Topic20 0.467564 perl 1.614043 3.4312 -7.0628\n", "8773 Topic20 0.312267 tp 1.128945 3.3850 -7.4665\n", "3537 Topic20 1.903473 gary 6.913224 3.3804 -5.6589\n", "8538 Topic20 1.309161 technician 4.886299 3.3531 -6.0332\n", "9122 Topic20 0.914287 variant 3.339278 3.3748 -6.3922\n", "8845 Topic20 
1.281730 tree 5.006041 3.3078 -6.0544\n", "8598 Topic20 4.318902 test 24.794514 2.9226 -4.8396\n", "7422 Topic20 0.873161 rr 3.403104 3.3099 -6.4382\n", "2153 Topic20 3.273311 david 28.411704 2.5092 -5.1168\n", "6392 Topic20 1.331068 pm 7.337767 2.9631 -6.0166\n", "2495 Topic20 1.007643 diseases 5.368270 2.9973 -6.2950\n", "2714 Topic20 1.009780 eating 6.424809 2.8198 -6.2928\n", "380 Topic20 0.964631 apollo 5.794740 2.8772 -6.3386\n", "2131 Topic20 0.879309 danny 3.686446 3.2369 -6.4312\n", "\n", "[1016 rows x 6 columns], token_table= Topic Freq Term\n", "term \n", "17 19 0.597014 abs\n", "22 14 0.666034 absolutes\n", "39 11 0.836374 accelerator\n", "116 2 0.527937 adb\n", "116 5 0.351958 adb\n", "124 2 0.625637 address\n", "124 7 0.052136 address\n", "124 10 0.104273 address\n", "124 11 0.052136 address\n", "124 12 0.078205 address\n", "124 18 0.078205 address\n", "138 12 0.888627 administration\n", "145 14 0.411956 admitting\n", "150 18 0.719837 adress\n", "154 2 0.588642 advance\n", "154 6 0.024527 advance\n", "154 7 0.049054 advance\n", "154 10 0.024527 advance\n", "154 11 0.245268 advance\n", "154 13 0.024527 advance\n", "154 18 0.049054 advance\n", "170 4 0.498933 advocacy\n", "237 2 0.303395 al\n", "237 6 0.424753 al\n", "237 12 0.121358 al\n", "237 16 0.121358 al\n", "256 7 0.460420 aliases\n", "256 15 0.460420 aliases\n", "278 6 0.403502 alomar\n", "278 16 0.538003 alomar\n", "... ... ... 
...\n", "9519 13 0.657460 xdm\n", "9529 9 0.318048 xputimage\n", "9529 12 0.318048 xputimage\n", "9529 20 0.318048 xputimage\n", "9544 2 0.171415 yankees\n", "9544 7 0.171415 yankees\n", "9544 10 0.514245 yankees\n", "9550 1 0.407677 yea\n", "9550 8 0.407677 yea\n", "9552 1 0.015859 year\n", "9552 2 0.190311 year\n", "9552 6 0.555072 year\n", "9552 7 0.111014 year\n", "9552 12 0.126874 year\n", "9554 1 0.085130 years\n", "9554 2 0.238364 years\n", "9554 6 0.221338 years\n", "9554 7 0.170260 years\n", "9554 12 0.272416 years\n", "9568 6 0.218476 yo\n", "9568 12 0.218476 yo\n", "9568 17 0.436952 yo\n", "9575 19 0.780507 yr\n", "9578 1 0.148381 yup\n", "9578 8 0.593524 yup\n", "9578 12 0.148381 yup\n", "9590 2 0.073832 zip\n", "9590 11 0.073832 zip\n", "9590 18 0.738321 zip\n", "9590 19 0.073832 zip\n", "\n", "[2101 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pyLDAvis.sklearn.prepare(lda, dtm_tfidf, tfidf_vectorizer, sort_topics=False, mds=js_MMDS)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TSNE" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "PreparedData(topic_coordinates= Freq cluster topics x y\n", "topic \n", "0 10.503341 1 1 -19.076361 19.202361\n", "1 28.438815 1 2 34.508946 -241.254927\n", "2 0.994623 1 3 75.072525 300.275330\n", "3 0.948237 1 4 182.725150 -77.903522\n", "4 1.029461 1 5 -68.233412 -182.249941\n", "5 10.277859 1 6 -23.627804 -83.292323\n", "6 11.955222 1 7 134.466032 462.075964\n", "7 0.931453 1 8 80.976567 -141.300070\n", "8 1.315214 1 9 -54.676999 -453.431738\n", "9 1.325113 1 10 82.161661 83.524152\n", "10 4.498473 1 11 -152.280733 -58.977112\n", "11 17.741730 1 12 -69.876013 187.256978\n", "12 1.824888 1 13 39.312714 184.903147\n", "13 1.057498 1 14 -200.524984 56.533852\n", "14 0.981087 1 15 82.110910 -24.444788\n", "15 1.122539 1 16 200.904107 66.865435\n", "16 0.988056 1 17 164.767475 191.398483\n", "17 1.656893 1 18 -96.840092 84.175119\n", "18 1.472447 1 19 -108.093813 -325.038761\n", "19 0.937052 1 20 -184.160473 -174.006877, topic_info= Category Freq Term Total loglift logprob\n", "term \n", "2638 Default 67.000000 drive 67.000000 30.0000 30.0000\n", "3656 Default 81.000000 god 81.000000 29.0000 29.0000\n", "8618 Default 108.000000 thanks 108.000000 28.0000 28.0000\n", "9398 Default 76.000000 windows 76.000000 27.0000 27.0000\n", "1203 Default 56.000000 card 56.000000 26.0000 26.0000\n", "3263 Default 54.000000 file 54.000000 25.0000 25.0000\n", "3266 Default 40.000000 files 40.000000 24.0000 24.0000\n", "2741 Default 87.000000 edu 87.000000 23.0000 23.0000\n", "7588 Default 31.000000 scsi 31.000000 22.0000 22.0000\n", "3483 Default 31.000000 ftp 31.000000 21.0000 21.0000\n", "9159 Default 40.000000 version 40.000000 20.0000 20.0000\n", "9397 Default 41.000000 window 41.000000 19.0000 19.0000\n", "6659 Default 54.000000 program 54.000000 18.0000 18.0000\n", "2569 Default 125.000000 does 125.000000 17.0000 17.0000\n", "9178 Default 35.000000 video 35.000000 16.0000 16.0000\n", "5488 Default 34.000000 monitor 34.000000 15.0000 15.0000\n", "2499 
Default 34.000000 disk 34.000000 14.0000 14.0000\n", "2593 Default 42.000000 dos 42.000000 13.0000 13.0000\n", "5120 Default 68.000000 mail 68.000000 12.0000 12.0000\n", "124 Default 38.000000 address 38.000000 11.0000 11.0000\n", "1561 Default 61.000000 com 61.000000 10.0000 10.0000\n", "3527 Default 48.000000 game 48.000000 9.0000 9.0000\n", "1105 Default 26.000000 bus 26.000000 8.0000 8.0000\n", "8530 Default 40.000000 team 40.000000 7.0000 7.0000\n", "8598 Default 24.000000 test 24.000000 6.0000 6.0000\n", "3837 Default 45.000000 hard 45.000000 5.0000 5.0000\n", "4574 Default 38.000000 jesus 38.000000 4.0000 4.0000\n", "3948 Default 46.000000 hi 46.000000 3.0000 3.0000\n", "2642 Default 23.000000 drives 23.000000 2.0000 2.0000\n", "1861 Default 22.000000 controller 22.000000 1.0000 1.0000\n", "... ... ... ... ... ... ...\n", "7364 Topic20 0.958735 robotics 2.231489 3.8254 -6.3447\n", "2669 Topic20 2.012382 duke 4.928634 3.7744 -5.6032\n", "2898 Topic20 0.808980 envelopes 2.029844 3.7502 -6.5145\n", "8683 Topic20 3.866897 ticket 9.774426 3.7429 -4.9501\n", "9529 Topic20 1.119211 xputimage 3.144183 3.6373 -6.1899\n", "5750 Topic20 2.096357 nonsense 6.293258 3.5709 -5.5624\n", "7490 Topic20 0.727430 samuel 2.272779 3.5309 -6.6208\n", "6833 Topic20 0.679060 quarterly 2.136343 3.5240 -6.6896\n", "4565 Topic20 1.299791 jeep 4.101494 3.5210 -6.0404\n", "1122 Topic20 0.498698 bw 1.623146 3.4901 -6.9983\n", "870 Topic20 0.885381 bite 2.892924 3.4862 -6.4243\n", "3560 Topic20 1.082239 geez 3.630871 3.4597 -6.2235\n", "2382 Topic20 1.070237 devoted 3.599249 3.4573 -6.2347\n", "7456 Topic20 0.469594 rx 1.589034 3.4512 -7.0585\n", "9594 Topic20 0.350972 zq 1.188957 3.4501 -7.3496\n", "7459 Topic20 0.528406 rz 1.818195 3.4345 -6.9405\n", "6231 Topic20 0.467564 perl 1.614043 3.4312 -7.0628\n", "8773 Topic20 0.312267 tp 1.128945 3.3850 -7.4665\n", "3537 Topic20 1.903473 gary 6.913224 3.3804 -5.6589\n", "8538 Topic20 1.309161 technician 4.886299 3.3531 -6.0332\n", "9122 Topic20 
0.914287 variant 3.339278 3.3748 -6.3922\n", "8845 Topic20 1.281730 tree 5.006041 3.3078 -6.0544\n", "8598 Topic20 4.318902 test 24.794514 2.9226 -4.8396\n", "7422 Topic20 0.873161 rr 3.403104 3.3099 -6.4382\n", "2153 Topic20 3.273311 david 28.411704 2.5092 -5.1168\n", "6392 Topic20 1.331068 pm 7.337767 2.9631 -6.0166\n", "2495 Topic20 1.007643 diseases 5.368270 2.9973 -6.2950\n", "2714 Topic20 1.009780 eating 6.424809 2.8198 -6.2928\n", "380 Topic20 0.964631 apollo 5.794740 2.8772 -6.3386\n", "2131 Topic20 0.879309 danny 3.686446 3.2369 -6.4312\n", "\n", "[1016 rows x 6 columns], token_table= Topic Freq Term\n", "term \n", "17 19 0.597014 abs\n", "22 14 0.666034 absolutes\n", "39 11 0.836374 accelerator\n", "116 2 0.527937 adb\n", "116 5 0.351958 adb\n", "124 2 0.625637 address\n", "124 7 0.052136 address\n", "124 10 0.104273 address\n", "124 11 0.052136 address\n", "124 12 0.078205 address\n", "124 18 0.078205 address\n", "138 12 0.888627 administration\n", "145 14 0.411956 admitting\n", "150 18 0.719837 adress\n", "154 2 0.588642 advance\n", "154 6 0.024527 advance\n", "154 7 0.049054 advance\n", "154 10 0.024527 advance\n", "154 11 0.245268 advance\n", "154 13 0.024527 advance\n", "154 18 0.049054 advance\n", "170 4 0.498933 advocacy\n", "237 2 0.303395 al\n", "237 6 0.424753 al\n", "237 12 0.121358 al\n", "237 16 0.121358 al\n", "256 7 0.460420 aliases\n", "256 15 0.460420 aliases\n", "278 6 0.403502 alomar\n", "278 16 0.538003 alomar\n", "... ... ... 
...\n", "9519 13 0.657460 xdm\n", "9529 9 0.318048 xputimage\n", "9529 12 0.318048 xputimage\n", "9529 20 0.318048 xputimage\n", "9544 2 0.171415 yankees\n", "9544 7 0.171415 yankees\n", "9544 10 0.514245 yankees\n", "9550 1 0.407677 yea\n", "9550 8 0.407677 yea\n", "9552 1 0.015859 year\n", "9552 2 0.190311 year\n", "9552 6 0.555072 year\n", "9552 7 0.111014 year\n", "9552 12 0.126874 year\n", "9554 1 0.085130 years\n", "9554 2 0.238364 years\n", "9554 6 0.221338 years\n", "9554 7 0.170260 years\n", "9554 12 0.272416 years\n", "9568 6 0.218476 yo\n", "9568 12 0.218476 yo\n", "9568 17 0.436952 yo\n", "9575 19 0.780507 yr\n", "9578 1 0.148381 yup\n", "9578 8 0.593524 yup\n", "9578 12 0.148381 yup\n", "9590 2 0.073832 zip\n", "9590 11 0.073832 zip\n", "9590 18 0.738321 zip\n", "9590 19 0.073832 zip\n", "\n", "[2101 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pyLDAvis.sklearn.prepare(lda, dtm_tfidf, tfidf_vectorizer, sort_topics=False, mds=js_TSNE)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }