{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import necessary dependencies" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import text_normalizer as tn\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load and normalize data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " review sentiment\n", "0 One of the other reviewers has mentioned that ... positive\n", "1 A wonderful little production.

The... positive\n", "2 I thought this was a wonderful way to spend ti... positive\n", "3 Basically there's a family where a little boy ... negative\n", "4 Petter Mattei's \"Love in the Time of Money\" is... positive\n" ] } ], "source": [ "dataset = pd.read_csv(r'movie_reviews.csv')\n", "\n", "# take a peek at the data\n", "print(dataset.head())\n", "reviews = np.array(dataset['review'])\n", "sentiments = np.array(dataset['sentiment'])\n", "\n", "# build train and test datasets\n", "train_reviews = reviews[:35000]\n", "train_sentiments = sentiments[:35000]\n", "test_reviews = reviews[35000:]\n", "test_sentiments = sentiments[35000:]\n", "\n", "# normalize datasets\n", "norm_train_reviews = tn.normalize_corpus(train_reviews)\n", "norm_test_reviews = tn.normalize_corpus(test_reviews)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Extract features from positive and negative reviews" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(25000, 331) (25000, 331)\n" ] } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# consolidate all normalized reviews\n", "norm_reviews = norm_train_reviews+norm_test_reviews\n", "# get tf-idf features for only positive reviews\n", "positive_reviews = [review for review, sentiment in zip(norm_reviews, sentiments) if sentiment == 'positive']\n", "ptvf = TfidfVectorizer(use_idf=True, min_df=0.05, max_df=0.95, ngram_range=(1,1), sublinear_tf=True)\n", "ptvf_features = ptvf.fit_transform(positive_reviews)\n", "# get tf-idf features for only negative reviews\n", "negative_reviews = [review for review, sentiment in zip(norm_reviews, sentiments) if sentiment == 'negative']\n", "ntvf = TfidfVectorizer(use_idf=True, min_df=0.05, max_df=0.95, ngram_range=(1,1), sublinear_tf=True)\n", "ntvf_features = ntvf.fit_transform(negative_reviews)\n", "# view feature set dimensions\n", "print(ptvf_features.shape, ntvf_features.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Topic Modeling on Reviews" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pyLDAvis\n", "import pyLDAvis.sklearn\n", "from sklearn.decomposition import NMF\n", "import topic_model_utils as tmu\n", "\n", "pyLDAvis.enable_notebook()\n", "total_topics = 10" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Display and visualize topics for positive reviews" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Topic #1 without weights\n", "['like', 'not', 'think', 'really', 'say', 'would', 'get', 'know', 'thing', 'much', 'bad', 'go', 'lot', 'could', 'even']\n", "\n", "Topic #2 without weights\n", "['movie', 'see', 'watch', 'great', 'good', 'one', 'not', 'time', 'ever', 'enjoy', 'recommend', 'make', 'acting', 'like', 'first']\n", "\n", "Topic #3 without weights\n", "['show', 'episode', 'series', 'tv', 'watch', 'dvd', 'first', 'see', 'time', 'one', 'good', 'year', 'remember', 'ever', 'would']\n", "\n", "Topic #4 without weights\n", "['performance', 'role', 'play', 'actor', 'cast', 'good', 'well', 'great', 'character', 'excellent', 'give', 'also', 'support', 'star', 'job']\n", "\n", "Topic #5 without weights\n", "['man', 'young', 'old', 'two', 'get', 'year', 'woman', 'take', 'go', 'come', 'find', 'back', 'girl', 'father', 'friend']\n", "\n", "Topic #6 without weights\n", "['film', 'see', 'one', 'scene', 'make', 'not', 'time', 'director', 'horror', 'music', 'many', 'cinema', 'release', 'work', 'use']\n", "\n", "Topic #7 without weights\n", "['story', 'tell', 'character', 'true', 'book', 'well', 'line', 'base', 'interesting', 'end', 'simple', 'read', 'beautiful', 'main', 'different']\n", "\n", "Topic #8 without weights\n", "['funny', 'comedy', 'laugh', 'humor', 'fun', 'moment', 'line', 'not', 'guy', 'get', 'make', 'lot', 'one', 'time', 'show']\n", "\n", "Topic #9 without weights\n", "['life', 'world', 'people', 'us', 'real', 'live', 'human', 'war', 'many', 'show', 'not', 'way', 'no', 'make', 'feel']\n", "\n", "Topic #10 without weights\n", "['love', 'fall', 'song', 'wonderful', 'beautiful', 'music', 'heart', 'girl', 'would', 'watch', 'great', 'favorite', 'always', 'family', 'woman']\n", "\n" ] } ], "source": [ "# build topic model on positive sentiment review features\n", "pos_nmf = NMF(n_components=total_topics, \n", " random_state=42, alpha=0.1, l1_ratio=0.2)\n", "pos_nmf.fit(ptvf_features) \n", "# extract features and component weights\n", "pos_feature_names = ptvf.get_feature_names()\n", "pos_weights = pos_nmf.components_\n", "# extract and display topics and their components\n", "pos_topics = tmu.get_topics_terms_weights(pos_weights, pos_feature_names)\n", "tmu.print_topics_udf(topics=pos_topics,\n", " total_topics=total_topics,\n", " num_terms=15,\n", " display_weights=False)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "PreparedData(topic_coordinates= Freq cluster topics x y\n", "topic \n", "0 14.357252 1 1 0.014664 0.005799\n", "4 13.233860 1 2 0.056482 0.065233\n", "5 12.933760 1 3 0.040293 -0.060175\n", "1 11.316494 1 4 -0.222410 -0.076387\n", "8 10.693920 1 5 0.078358 0.151408\n", "3 10.532171 1 6 0.151155 -0.101375\n", "6 7.560320 1 7 0.253692 0.048936\n", "2 7.418161 1 8 -0.101996 -0.057984\n", "9 6.489072 1 9 -0.205223 0.225435\n", "7 5.464990 1 10 -0.065015 -0.200889, topic_info= Category Freq Term Total loglift logprob\n", "term \n", "179 Default 2065.000000 love 2065.000000 15.0000 15.0000\n", "279 Default 1945.000000 story 1945.000000 14.0000 14.0000\n", "195 Default 2565.000000 movie 2565.000000 13.0000 13.0000\n", "117 Default 1355.000000 funny 1355.000000 12.0000 12.0000\n", "55 Default 1055.000000 comedy 1055.000000 11.0000 11.0000\n", "107 Default 1895.000000 film 1895.000000 10.0000 10.0000\n", "162 Default 825.000000 laugh 825.000000 9.0000 9.0000\n", "263 Default 1175.000000 show 1175.000000 8.0000 8.0000\n", "85 Default 681.000000 episode 681.000000 7.0000 7.0000\n", "257 Default 669.000000 series 669.000000 6.0000 6.0000\n", "169 Default 768.000000 life 768.000000 5.0000 5.0000\n", "217 Default 680.000000 performance 680.000000 4.0000 4.0000\n", "245 Default 619.000000 role 619.000000 3.0000 3.0000\n", "223 Default 798.000000 play 798.000000 2.0000 2.0000\n", "286 Default 652.000000 tell 652.000000 1.0000 1.0000\n", "186 Topic1 163.964151 maybe 211.102332 1.6882 -4.8415\n", "228 Topic1 242.612464 pretty 317.111517 1.6731 -4.4497\n", "26 Topic1 309.015145 bad 419.511014 1.6352 -4.2077\n", "247 Topic1 446.859755 say 643.700289 1.5759 -3.8389\n", "19 Topic1 145.014644 anything 227.330473 1.4913 -4.9643\n", "237 Topic1 522.341314 really 828.464707 1.4797 -3.6828\n", "288 Topic1 337.092842 thing 543.542192 1.4632 -4.1208\n", "187 Topic1 156.417072 mean 253.055665 1.4598 -4.8886\n", "289 Topic1 523.237842 think 850.034573 1.4557 -3.6811\n", "270 Topic1 229.367808 something 393.079793 1.4022 -4.5058\n", "7 Topic1 198.654126 actually 342.681974 1.3957 -4.6496\n", "79 Topic1 105.638211 else 183.898452 1.3866 -5.2811\n", "283 Topic1 159.592233 sure 301.191519 1.3058 -4.8685\n", "178 Topic1 276.957480 lot 523.368386 1.3045 -4.3173\n", "124 Topic1 197.261326 guy 379.271749 1.2872 -4.6566\n", "... ... ... ... ... ... ...\n", "99 Topic9 133.490184 family 488.645440 1.4374 -4.2530\n", "40 Topic9 88.167362 buy 323.259890 1.4358 -4.6678\n", "14 Topic9 135.841950 always 519.108391 1.3944 -4.2355\n", "18 Topic9 109.917708 anyone 428.757251 1.3739 -4.4473\n", "114 Topic9 106.632712 friend 447.365588 1.3011 -4.4776\n", "326 Topic9 153.541222 would 933.558639 0.9300 -4.1130\n", "313 Topic9 152.050154 watch 1292.108164 0.5952 -4.1228\n", "123 Topic9 149.202780 great 1383.292652 0.5081 -4.1417\n", "117 Topic10 1355.897103 funny 1355.897103 2.9068 -1.7630\n", "55 Topic10 980.942418 comedy 1055.312106 2.8337 -2.0867\n", "162 Topic10 765.627217 laugh 825.811834 2.8311 -2.3345\n", "146 Topic10 254.685511 humor 387.746145 2.4865 -3.4352\n", "116 Topic10 235.869802 fun 546.606942 2.0664 -3.5120\n", "192 Topic10 148.673313 moment 426.128098 1.8538 -3.9735\n", "124 Topic10 101.631419 guy 379.271749 1.5899 -4.3539\n", "172 Topic10 117.008950 line 524.329490 1.4069 -4.2130\n", "84 Topic10 55.037206 entertaining 267.859275 1.3244 -4.9672\n", "170 Topic10 43.447286 light 222.124282 1.2751 -5.2037\n", "275 Topic10 44.431247 stand 229.709880 1.2639 -5.1813\n", "52 Topic10 75.998552 classic 410.741910 1.2196 -4.6445\n", "136 Topic10 37.582918 hit 205.144038 1.2096 -5.3487\n", "261 Topic10 61.192069 short 366.090109 1.1179 -4.8612\n", "178 Topic10 86.999816 lot 523.368386 1.1124 -4.5093\n", "114 Topic10 63.780641 friend 447.365588 0.9589 -4.8198\n", "118 Topic10 93.961029 get 954.639358 0.5884 -4.4324\n", "207 Topic10 110.935070 not 1929.428395 0.0508 -4.2663\n", "181 Topic10 90.974581 make 1232.861491 0.3003 -4.4647\n", "293 Topic10 81.488121 time 1219.124446 0.2014 -4.5748\n", "263 Topic10 79.633141 show 1175.691166 0.2146 -4.5978\n", "211 Topic10 83.111443 one 1489.836716 0.0206 -4.5551\n", "\n", "[248 rows x 6 columns], token_table= Topic Freq Term\n", "term \n", "0 1 0.132356 10\n", "0 2 0.022529 10\n", "0 3 0.163333 10\n", "0 4 0.492814 10\n", "0 6 0.033793 10\n", "0 7 0.022529 10\n", "0 8 0.101379 10\n", "0 10 0.028161 10\n", "3 1 0.035409 acting\n", "3 3 0.143501 acting\n", "3 4 0.432368 acting\n", "3 6 0.240411 acting\n", "3 7 0.147229 acting\n", "5 4 0.144129 actor\n", "5 5 0.015244 actor\n", "5 6 0.823198 actor\n", "5 8 0.013859 actor\n", "5 9 0.002772 actor\n", "6 2 0.035622 actress\n", "6 4 0.060557 actress\n", "6 6 0.744495 actress\n", "6 9 0.156736 actress\n", "7 1 0.580713 actually\n", "7 2 0.087545 actually\n", "7 3 0.125481 actually\n", "7 4 0.023345 actually\n", "7 6 0.014591 actually\n", "7 7 0.011673 actually\n", "7 8 0.070036 actually\n", "7 10 0.090463 actually\n", "... ... ... ...\n", "321 3 0.094962 wonderful\n", "321 4 0.066294 wonderful\n", "321 5 0.035835 wonderful\n", "321 6 0.218591 wonderful\n", "321 7 0.060919 wonderful\n", "321 8 0.003583 wonderful\n", "321 9 0.519601 wonderful\n", "323 1 0.090976 work\n", "323 2 0.172205 work\n", "323 3 0.297297 work\n", "323 5 0.126717 work\n", "323 6 0.290799 work\n", "323 10 0.022744 work\n", "324 3 0.036764 world\n", "324 5 0.964523 world\n", "326 1 0.414543 would\n", "326 2 0.035349 would\n", "326 3 0.109259 would\n", "326 4 0.093192 would\n", "326 5 0.051416 would\n", "326 8 0.131754 would\n", "326 9 0.164960 would\n", "328 2 0.365776 year\n", "328 3 0.182888 year\n", "328 4 0.260651 year\n", "328 8 0.191528 year\n", "330 2 0.685172 young\n", "330 6 0.115952 young\n", "330 7 0.044273 young\n", "330 9 0.153900 young\n", "\n", "[1027 rows x 3 columns], R=15, lambda_step=0.01, plot_opts={'ylab': 'PC2', 'xlab': 'PC1'}, topic_order=[1, 5, 6, 2, 9, 4, 7, 3, 10, 8])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pyLDAvis.sklearn.prepare(pos_nmf, ptvf_features, ptvf, R=15)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Display and visualize topics for negative reviews" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Topic #1 without weights\n", "['get', 'go', 'kill', 'guy', 'scene', 'take', 'end', 'back', 'start', 'around', 'look', 'one', 'thing', 'come', 'first']\n", "\n", "Topic #2 without weights\n", "['bad', 'movie', 'ever', 'acting', 'see', 'terrible', 'one', 'plot', 'effect', 'awful', 'not', 'even', 'make', 'horrible', 'special']\n", "\n", "Topic #3 without weights\n", "['film', 'make', 'not', 'see', 'would', 'director', 'one', 'many', 'may', 'bad', 'however', 'horror', 'no', 'say', 'feel']\n", "\n", "Topic #4 without weights\n", "['character', 'story', 'book', 'plot', 'main', 'seem', 'no', 'interesting', 'not', 'movie', 'read', 'end', 'feel', 'nothing', 'original']\n", "\n", "Topic #5 without weights\n", "['movie', 'think', 'would', 'not', 'like', 'say', 'watch', 'could', 'see', 'really', 'people', 'good', 'know', 'want', 'make']\n", "\n", "Topic #6 without weights\n", "['funny', 'comedy', 'laugh', 'joke', 'try', 'not', 'stupid', 'suppose', 'moment', 'fun', 'even', 'black', 'guy', 'character', 'really']\n", "\n", "Topic #7 without weights\n", "['actor', 'play', 'good', 'cast', 'role', 'performance', 'script', 'much', 'great', 'star', 'act', 'look', 'well', 'give', 'director']\n", "\n", "Topic #8 without weights\n", "['man', 'woman', 'old', 'young', 'year', 'life', 'love', 'girl', 'child', 'play', 'sex', 'wife', 'family', 'boy', 'kid']\n", "\n", "Topic #9 without weights\n", "['show', 'tv', 'series', 'watch', 'not', 'original', 'people', 'like', 'every', 'new', 'kid', 'us', 'make', 'use', 'american']\n", "\n", "Topic #10 without weights\n", "['waste', 'time', 'money', 'watch', 'minute', 'hour', 'movie', 'spend', 'not', 'life', 'save', 'even', 'worth', 'back', 'crap']\n", "\n" ] } ], "source": [ "# build topic model on negative sentiment review features\n", "neg_nmf = NMF(n_components=10, \n", " random_state=42, alpha=0.1, l1_ratio=0.2)\n", "neg_nmf.fit(ntvf_features) \n", "# extract features and component weights\n", "neg_feature_names = ntvf.get_feature_names()\n", "neg_weights = neg_nmf.components_\n", "# extract and display topics and their components\n", "neg_topics = tmu.get_topics_terms_weights(neg_weights, neg_feature_names)\n", "tmu.print_topics_udf(topics=neg_topics,\n", " total_topics=total_topics,\n", " num_terms=15,\n", " display_weights=False) " ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "PreparedData(topic_coordinates= Freq cluster topics x y\n", "topic \n", "0 15.159817 1 1 0.076605 -0.021691\n", "4 13.708854 1 2 -0.060157 -0.026393\n", "2 11.157697 1 3 0.001907 -0.021786\n", "3 10.813738 1 4 0.076284 0.049930\n", "1 10.635604 1 5 -0.231589 0.020158\n", "6 10.582598 1 6 0.107173 0.039257\n", "7 8.862276 1 7 0.218109 -0.142682\n", "9 7.120793 1 8 -0.242084 -0.125559\n", "8 6.419647 1 9 0.053506 -0.048141\n", "5 5.538976 1 10 0.000245 0.276907, topic_info= Category Freq Term Total loglift logprob\n", "term \n", "107 Default 2236.000000 film 2236.000000 15.0000 15.0000\n", "260 Default 1356.000000 show 1356.000000 14.0000 14.0000\n", "117 Default 1240.000000 funny 1240.000000 13.0000 13.0000\n", "28 Default 1776.000000 bad 1776.000000 12.0000 12.0000\n", "310 Default 1111.000000 waste 1111.000000 11.0000 11.0000\n", "55 Default 893.000000 comedy 893.000000 10.0000 10.0000\n", "289 Default 1399.000000 time 1399.000000 9.0000 9.0000\n", "189 Default 2357.000000 movie 2357.000000 8.0000 8.0000\n", "157 Default 799.000000 laugh 799.000000 7.0000 7.0000\n", "49 Default 1084.000000 character 1084.000000 6.0000 6.0000\n", "176 Default 791.000000 man 791.000000 5.0000 5.0000\n", "275 Default 930.000000 story 930.000000 4.0000 4.0000\n", "148 Default 642.000000 joke 642.000000 3.0000 3.0000\n", "187 Default 667.000000 money 667.000000 2.0000 2.0000\n", "317 Default 639.000000 woman 639.000000 1.0000 1.0000\n", "151 Topic1 296.182894 kill 318.826975 1.8129 -4.3284\n", "130 Topic1 162.230095 head 221.157364 1.5767 -4.9303\n", "67 Topic1 144.792485 dead 201.323472 1.5569 -5.0441\n", "140 Topic1 139.108803 house 200.283826 1.5220 -5.0841\n", "243 Topic1 192.881583 run 279.680171 1.5150 -4.7573\n", "20 Topic1 218.199939 around 324.167125 1.4907 -4.6339\n", "257 Topic1 151.465034 shoot 228.512367 1.4753 -4.9990\n", "125 Topic1 295.348838 guy 483.584801 1.3935 -4.3312\n", "272 Topic1 224.884183 start 382.346878 1.3558 -4.6038\n", "65 Topic1 106.941458 cut 182.422547 1.3525 -5.3471\n", "72 Topic1 125.686888 die 222.527374 1.3153 -5.1856\n", "308 Topic1 102.504916 walk 188.514620 1.2773 -5.3894\n", "106 Topic1 111.576802 fight 209.403931 1.2570 -5.3046\n", "108 Topic1 87.375183 finally 165.255309 1.2492 -5.5491\n", "68 Topic1 114.935979 death 217.874857 1.2470 -5.2750\n", "... ... ... ... ... ... ...\n", "304 Topic9 59.923876 version 294.737118 1.1528 -5.0670\n", "303 Topic9 93.310733 use 480.903906 1.1061 -4.6241\n", "167 Topic9 63.154657 live 327.356586 1.1003 -5.0145\n", "325 Topic9 58.315564 writer 305.496884 1.0897 -5.0942\n", "211 Topic9 121.985745 people 743.555845 0.9383 -4.3562\n", "311 Topic9 154.980332 watch 1342.737374 0.5866 -4.1168\n", "164 Topic9 119.715163 like 1298.007227 0.3623 -4.3750\n", "201 Topic9 127.169026 not 2356.114616 -0.1734 -4.3146\n", "177 Topic9 82.434764 many 631.931797 0.7090 -4.7481\n", "175 Topic9 98.653405 make 1477.664778 0.0392 -4.5685\n", "117 Topic10 1240.046669 funny 1240.046669 2.8934 -1.8896\n", "55 Topic10 854.231692 comedy 893.748710 2.8481 -2.2623\n", "148 Topic10 575.233228 joke 642.217885 2.7832 -2.6577\n", "157 Topic10 647.703989 laugh 799.954468 2.6822 -2.5391\n", "278 Topic10 122.392615 suppose 364.124516 1.8031 -4.2053\n", "116 Topic10 102.499123 fun 323.877127 1.7429 -4.3827\n", "186 Topic10 107.874050 moment 348.136178 1.7217 -4.3316\n", "276 Topic10 151.737857 stupid 500.721533 1.6995 -3.9904\n", "36 Topic10 84.747293 black 308.246395 1.6021 -4.5728\n", "296 Topic10 178.486645 try 670.578374 1.5697 -3.8280\n", "99 Topic10 71.848210 fail 344.587750 1.3256 -4.7380\n", "129 Topic10 68.522321 hard 376.597156 1.1893 -4.7854\n", "100 Topic10 57.221689 fall 336.864447 1.1206 -4.9656\n", "35 Topic10 55.340713 bit 346.216947 1.0598 -4.9990\n", "125 Topic10 76.891073 guy 483.584801 1.0545 -4.6701\n", "201 Topic10 172.777890 not 2356.114616 0.2806 -3.8605\n", "6 Topic10 71.154300 actually 544.299031 0.8587 -4.7477\n", "88 Topic10 86.292079 even 1179.687172 0.2781 -4.5548\n", "49 Topic10 76.267272 character 1084.332502 0.2389 -4.6783\n", "232 Topic10 72.663809 really 924.396363 0.3501 -4.7267\n", "\n", "[239 rows x 6 columns], token_table= Topic Freq Term\n", "term \n", "0 1 0.058075 10\n", "0 2 0.194426 10\n", "0 3 0.080800 10\n", "0 5 0.257552 10\n", "0 6 0.002525 10\n", "0 7 0.047975 10\n", "0 8 0.303002 10\n", "0 9 0.020200 10\n", "0 10 0.037875 10\n", "2 1 0.010151 act\n", "2 2 0.084588 act\n", "2 3 0.060903 act\n", "2 5 0.263914 act\n", "2 6 0.422939 act\n", "2 7 0.059211 act\n", "2 8 0.069362 act\n", "2 9 0.027068 act\n", "3 3 0.076522 acting\n", "3 4 0.068020 acting\n", "3 5 0.796802 acting\n", "3 6 0.041298 acting\n", "3 8 0.017005 acting\n", "5 5 0.144704 actor\n", "5 6 0.763215 actor\n", "5 8 0.090920 actor\n", "6 1 0.196583 actually\n", "6 2 0.260886 actually\n", "6 3 0.132280 actually\n", "6 5 0.121257 actually\n", "6 6 0.045931 actually\n", "... ... ... ...\n", "323 2 0.597173 would\n", "323 3 0.214213 would\n", "323 6 0.023607 would\n", "323 7 0.065575 would\n", "323 8 0.023607 would\n", "323 9 0.064701 would\n", "325 1 0.075287 writer\n", "325 2 0.052374 writer\n", "325 3 0.003273 writer\n", "325 4 0.360069 writer\n", "325 6 0.193128 writer\n", "325 8 0.039280 writer\n", "325 9 0.189855 writer\n", "325 10 0.085107 writer\n", "326 1 0.226257 wrong\n", "326 2 0.497023 wrong\n", "326 3 0.063055 wrong\n", "326 4 0.018546 wrong\n", "326 5 0.018546 wrong\n", "326 6 0.051928 wrong\n", "326 9 0.066764 wrong\n", "326 10 0.063055 wrong\n", "327 2 0.074992 year\n", "327 3 0.091490 year\n", "327 5 0.080991 year\n", "327 6 0.011999 year\n", "327 7 0.643428 year\n", "327 9 0.095989 year\n", "330 6 0.042668 young\n", "330 7 0.959016 young\n", "\n", "[1030 rows x 3 columns], R=15, lambda_step=0.01, plot_opts={'ylab': 'PC2', 'xlab': 'PC1'}, topic_order=[1, 5, 3, 4, 2, 7, 8, 10, 9, 6])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pyLDAvis.sklearn.prepare(neg_nmf, ntvf_features, ntvf, R=15)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }