{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Final Model" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [], "source": [ "import pandas as pd\n", "from gensim import models,corpora\n", "import pyLDAvis.gensim\n", "from gensim.models.coherencemodel import CoherenceModel\n", "import warnings" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "pd.set_option('max_colwidth',400)\n", "pyLDAvis.enable_notebook()\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "Lda = models.LdaMulticore\n", "lda_final =Lda.load('lda_final')\n", "dictionary = corpora.Dictionary.load('dictionary')\n", "doc_term_matrix = corpora.MmCorpus('doc_term_matrix.mm')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "One annoying thing about LDA in gensim is that when it shows the top coherent topics, it shows each topic's word representation and coherence score, but it doesn't map them to the topic id. The next cell works around that issue and correctly maps each coherence score to the right topic id." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Topic Evaluation " ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Topic | \n", "words | \n", "cs | \n", "
---|---|---|---|
8 | \n", "Topic9 | \n", "{spark, python, r, tableau, technical, hive, pig, java, hadoop, sql} | \n", "-0.937897 | \n", "
9 | \n", "Topic10 | \n", "{python, css, mysql, c, html, java, javascript, sql, c++, php} | \n", "-1.086996 | \n", "
11 | \n", "Topic12 | \n", "{sas, python, r, excel, tableau, technical, matlab, java, sql, c++} | \n", "-1.159532 | \n", "
6 | \n", "Topic7 | \n", "{spark, hadoop, python, r, tableau, technical, pandas, scikit, sql, numpy} | \n", "-1.333507 | \n", "
1 | \n", "Topic2 | \n", "{ms, python, r, technical, c, data, java, sql, oracle, windows} | \n", "-1.502323 | \n", "
0 | \n", "Topic1 | \n", "{xml, technical, html, java, uml, sql, pl, windows, oracle, agile} | \n", "-1.657700 | \n", "
4 | \n", "Topic5 | \n", "{skills, computer, python, r, excel, technical, matlab, data, sql, windows} | \n", "-1.827899 | \n", "
5 | \n", "Topic6 | \n", "{means, key, excel, technical, k, access, teradata, sql, sql_server, oracle} | \n", "-2.382989 | \n", "
2 | \n", "Topic3 | \n", "{project, core, computer, analytics, analysis, r, team, data, areasof, c++} | \n", "-2.893935 | \n", "
3 | \n", "Topic4 | \n", "{project, skills, linkedin, powershell, salesforce, unix, technical, bullhorn, linux, taleo} | \n", "-6.654834 | \n", "
10 | \n", "Topic11 | \n", "{relevant, research, illustrator, french, data, spanish, native, english, mandarin, indesign} | \n", "-10.106876 | \n", "
7 | \n", "Topic8 | \n", "{s., d., core, software, m., skill, jobvite, j., r., taleo} | \n", "-11.682204 | \n", "
\n", " | Topic | \n", "words with Relevance | \n", "
---|---|---|
11 | \n", "Topic9 | \n", "{sqoop, kafka, cassandra, hdfs, hbase, hive, pig, impala, flume, oozie} | \n", "
1 | \n", "Topic10 | \n", "{jquery, xml, css, eclipse, html, c, ajax, django, javascript, php} | \n", "
3 | \n", "Topic12 | \n", "{sas, powerpoint, python, r, excel, matlab, spss, sql, word, stata} | \n", "
9 | \n", "Topic7 | \n", "{classification, svm, learn, k, scikit, pandas, regression, matplotlib, scipy, numpy} | \n", "
4 | \n", "Topic2 | \n", "{mssuite2012, tmux, spark2.0, databaseand, tableau_8, hive2.8, windows7/8/10, hadoop2, electronic, python2.7/3.3} | \n", "
0 | \n", "Topic1 | \n", "{jboss, weblogic, ant, rmi, struts, soap, jsf, uml, jms, cvs} | \n", "
7 | \n", "Topic5 | \n", "{linearandnon, hplc, mexico, gc, community, volunteer, tika, excelandword, pune, ontology} | \n", "
8 | \n", "Topic6 | \n", "{ggplot2and, gridsearchand, oncology, modeltuning/, ddl, stepwise, filter_methods, hiv, pigand, dml} | \n", "
5 | \n", "Topic3 | \n", "{magento, public_health, copy, campaign, hebrew, lucid, oracle_rdbms, ubuntuand, spatialdata, linearalgebra} | \n", "
6 | \n", "Topic4 | \n", "{dataquality, erecruit, d.c., brassring, google_earth, october, scorecards, bullhorn, icims, taleo} | \n", "
2 | \n", "Topic11 | \n", "{french, spanish, native, testing/, hootsuite, english, chinese, mandarin, cantonese, indesign} | \n", "
10 | \n", "Topic8 | \n", "{s., d., l., n., m., y., jobvite, g., j., p.} | \n", "