{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Final Model" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [], "source": [ "import pandas as pd\n", "from gensim import models,corpora\n", "import pyLDAvis.gensim\n", "from gensim.models.coherencemodel import CoherenceModel\n", "import warnings" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "pd.set_option('max_colwidth',400)\n", "pyLDAvis.enable_notebook()\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "Lda = models.LdaMulticore\n", "lda_final =Lda.load('lda_final')\n", "dictionary = corpora.Dictionary.load('dictionary')\n", "doc_term_matrix = corpora.MmCorpus('doc_term_matrix.mm')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "One stupid thing with LDA in gensim is when it shows the top coherent topics, it shows the word representation and the coherence score but it doesnt map with the topic id. The next cell is to overcome that issue and correctly map coherence score with the right topic id" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Topic Evaluation " ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Topicwordscs
8Topic9{spark, python, r, tableau, technical, hive, pig, java, hadoop, sql}-0.937897
9Topic10{python, css, mysql, c, html, java, javascript, sql, c++, php}-1.086996
11Topic12{sas, python, r, excel, tableau, technical, matlab, java, sql, c++}-1.159532
6Topic7{spark, hadoop, python, r, tableau, technical, pandas, scikit, sql, numpy}-1.333507
1Topic2{ms, python, r, technical, c, data, java, sql, oracle, windows}-1.502323
0Topic1{xml, technical, html, java, uml, sql, pl, windows, oracle, agile}-1.657700
4Topic5{skills, computer, python, r, excel, technical, matlab, data, sql, windows}-1.827899
5Topic6{means, key, excel, technical, k, access, teradata, sql, sql_server, oracle}-2.382989
2Topic3{project, core, computer, analytics, analysis, r, team, data, areasof, c++}-2.893935
3Topic4{project, skills, linkedin, powershell, salesforce, unix, technical, bullhorn, linux, taleo}-6.654834
10Topic11{relevant, research, illustrator, french, data, spanish, native, english, mandarin, indesign}-10.106876
7Topic8{s., d., core, software, m., skill, jobvite, j., r., taleo}-11.682204
\n", "
" ], "text/plain": [ " Topic \\\n", "8 Topic9 \n", "9 Topic10 \n", "11 Topic12 \n", "6 Topic7 \n", "1 Topic2 \n", "0 Topic1 \n", "4 Topic5 \n", "5 Topic6 \n", "2 Topic3 \n", "3 Topic4 \n", "10 Topic11 \n", "7 Topic8 \n", "\n", " words \\\n", "8 {spark, python, r, tableau, technical, hive, pig, java, hadoop, sql} \n", "9 {python, css, mysql, c, html, java, javascript, sql, c++, php} \n", "11 {sas, python, r, excel, tableau, technical, matlab, java, sql, c++} \n", "6 {spark, hadoop, python, r, tableau, technical, pandas, scikit, sql, numpy} \n", "1 {ms, python, r, technical, c, data, java, sql, oracle, windows} \n", "0 {xml, technical, html, java, uml, sql, pl, windows, oracle, agile} \n", "4 {skills, computer, python, r, excel, technical, matlab, data, sql, windows} \n", "5 {means, key, excel, technical, k, access, teradata, sql, sql_server, oracle} \n", "2 {project, core, computer, analytics, analysis, r, team, data, areasof, c++} \n", "3 {project, skills, linkedin, powershell, salesforce, unix, technical, bullhorn, linux, taleo} \n", "10 {relevant, research, illustrator, french, data, spanish, native, english, mandarin, indesign} \n", "7 {s., d., core, software, m., skill, jobvite, j., r., taleo} \n", "\n", " cs \n", "8 -0.937897 \n", "9 -1.086996 \n", "11 -1.159532 \n", "6 -1.333507 \n", "1 -1.502323 \n", "0 -1.657700 \n", "4 -1.827899 \n", "5 -2.382989 \n", "2 -2.893935 \n", "3 -6.654834 \n", "10 -10.106876 \n", "7 -11.682204 " ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a = lda_final.show_topics(num_topics=12,formatted=False,num_words=10)\n", "b = lda_final.top_topics(doc_term_matrix,dictionary=dictionary,topn=10) # This orders the topics in the decreasing order of coherence score\n", "\n", "topic2skillb = {}\n", "topic2csb = {}\n", "topic2skilla = {}\n", "topic2csa = {}\n", "num_topics =lda_final.num_topics\n", "cnt =1\n", "\n", "for ws in b:\n", " wset = set(w[1] for w in ws[0])\n", " topic2skillb[cnt] = wset\n", " topic2csb[cnt] = ws[1]\n", " cnt +=1\n", "\n", "for ws in a:\n", " wset = set(w[0]for w in ws[1])\n", " topic2skilla[ws[0]+1] = wset\n", " \n", "for i in range(1,num_topics+1):\n", " for j in range(1,num_topics+1): \n", " if topic2skilla[i].intersection(topic2skillb[j])==topic2skilla[i]:\n", " topic2csa[i] = topic2csb[j]\n", "\n", "finalData = pd.DataFrame([],columns=['Topic','words'])\n", "finalData['Topic']=topic2skilla.keys()\n", "finalData['Topic'] = finalData['Topic'].apply(lambda x: 'Topic'+str(x))\n", "finalData['words']=topic2skilla.values()\n", "finalData['cs'] = topic2csa.values()\n", "finalData.sort_values(by='cs',ascending=False,inplace=True)\n", "finalData.to_csv('CoherenceScore.csv')\n", "finalData" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualizing with pyLDAviz" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "PreparedData(topic_coordinates= Freq cluster topics x y\n", "topic \n", "0 7.238229 1 1 -0.028248 -0.179300\n", "1 3.746417 1 2 0.012753 -0.027564\n", "2 2.335267 1 3 0.122807 0.011622\n", "3 2.674761 1 4 0.151100 -0.037611\n", "4 3.370106 1 5 0.074784 0.057546\n", "5 2.191426 1 6 0.132711 0.022145\n", "6 20.330587 1 7 -0.204531 0.141215\n", "7 3.048626 1 8 0.158089 0.021893\n", "8 17.504662 1 9 -0.230710 -0.043406\n", "9 14.389081 1 10 -0.171886 -0.107890\n", "10 2.149308 1 11 0.161517 0.042300\n", "11 21.021530 1 12 -0.178386 0.099050, topic_info= Category Freq Term Total loglift logprob\n", "term \n", "81 Default 1530.000000 sas 1530.000000 30.0000 30.0000\n", "99 Default 701.000000 pandas 701.000000 29.0000 29.0000\n", "21 Default 1020.000000 javascript 1020.000000 28.0000 28.0000\n", "75 Default 1058.000000 hive 1058.000000 27.0000 27.0000\n", "2 Default 1114.000000 html 1114.000000 26.0000 26.0000\n", "0 Default 656.000000 css 656.000000 25.0000 25.0000\n", "150 Default 567.000000 numpy 567.000000 24.0000 24.0000\n", "100 Default 458.000000 scikit 458.000000 23.0000 23.0000\n", "204 Default 646.000000 pig 646.000000 22.0000 22.0000\n", "7 Default 3289.000000 r 3289.000000 21.0000 21.0000\n", "6 Default 3500.000000 python 3500.000000 20.0000 20.0000\n", "27 Default 1092.000000 excel 1092.000000 19.0000 19.0000\n", "31 Default 1462.000000 matlab 1462.000000 18.0000 18.0000\n", "20 Default 1851.000000 java 1851.000000 17.0000 17.0000\n", "139 Default 395.000000 php 395.000000 16.0000 16.0000\n", "39 Default 357.000000 hbase 357.000000 15.0000 15.0000\n", "252 Default 300.000000 matplotlib 300.000000 14.0000 14.0000\n", "82 Default 954.000000 spark 954.000000 13.0000 13.0000\n", "47 Default 515.000000 powerpoint 515.000000 12.0000 12.0000\n", "898 Default 286.000000 sqoop 286.000000 11.0000 11.0000\n", "13 Default 1582.000000 c 1582.000000 10.0000 10.0000\n", "259 Default 359.000000 scipy 359.000000 9.0000 9.0000\n", "195 Default 684.000000 spss 684.000000 8.0000 8.0000\n", "253 Default 302.000000 learn 302.000000 7.0000 7.0000\n", "216 Default 513.000000 xml 513.000000 6.0000 6.0000\n", "78 Default 337.000000 regression 337.000000 5.0000 5.0000\n", "269 Default 257.000000 jquery 257.000000 4.0000 4.0000\n", "38 Default 1346.000000 hadoop 1346.000000 3.0000 3.0000\n", "93 Default 370.000000 svm 370.000000 2.0000 2.0000\n", "107 Default 357.000000 clustering 357.000000 1.0000 1.0000\n", "... ... ... ... ... ... ...\n", "81 Topic12 1052.860840 sas 1530.139893 1.1858 -3.4139\n", "106 Topic12 306.949036 access 418.801880 1.2489 -4.6465\n", "59 Topic12 308.074188 computer 438.480194 1.2066 -4.6429\n", "742 Topic12 43.209030 maple 53.736332 1.3416 -6.6072\n", "567 Topic12 34.163467 spss_modeler 42.265919 1.3468 -6.8421\n", "31 Topic12 850.477356 matlab 1462.463989 1.0175 -3.6274\n", "170 Topic12 129.183517 latex 185.219879 1.1993 -5.5120\n", "7 Topic12 1540.567017 r 3289.287109 0.8011 -3.0333\n", "1445 Topic12 95.578224 mathematica 136.352020 1.2043 -5.8133\n", "8 Topic12 1368.419189 sql 3209.395996 0.7072 -3.1518\n", "6 Topic12 1386.641113 python 3500.394531 0.6336 -3.1386\n", "10 Topic12 691.729248 tableau 1596.443970 0.7233 -3.8340\n", "14 Topic12 684.805542 c++ 1672.185425 0.6669 -3.8441\n", "437 Topic12 99.844368 msoffice 152.984818 1.1329 -5.7696\n", "446 Topic12 77.332993 sap 113.154388 1.1790 -6.0251\n", "13 Topic12 476.179688 c 1582.063110 0.3589 -4.2074\n", "11 Topic12 502.025482 technical 1821.294922 0.2710 -4.1546\n", "20 Topic12 505.136475 java 1851.706787 0.2606 -4.1484\n", "184 Topic12 177.760452 skills 410.233459 0.7233 -5.1928\n", "1224 Topic12 124.022964 visio 258.190247 0.8264 -5.5527\n", "48 Topic12 237.284393 programming 844.456665 0.2902 -4.9040\n", "103 Topic12 249.680313 linux 1056.768921 0.1168 -4.8530\n", "38 Topic12 278.946930 hadoop 1346.532715 -0.0146 -4.7422\n", "45 Topic12 234.101151 mysql 1187.525879 -0.0643 -4.9175\n", "116 Topic12 186.481079 unix 744.110657 0.1758 -5.1449\n", "53 Topic12 168.867035 windows 685.000488 0.1593 -5.2441\n", "176 Topic12 162.268417 data 742.070618 0.0394 -5.2840\n", "180 Topic12 135.283112 ms 415.437500 0.4377 -5.4658\n", "2 Topic12 146.647583 html 1114.218872 -0.4683 -5.3852\n", "82 Topic12 143.118164 spark 954.700867 -0.3381 -5.4095\n", "\n", "[947 rows x 6 columns], token_table= Topic Freq Term\n", "term \n", "6006 2 0.690387 -9.2\n", "8822 9 0.958744 6.x\n", "541 12 0.929468 @risk\n", "480 1 0.101507 December\n", "480 5 0.507537 December\n", "480 8 0.101507 December\n", "480 10 0.101507 December\n", "480 11 0.101507 December\n", "480 12 0.101507 December\n", "128 1 0.195921 July\n", "128 2 0.587764 July\n", "128 6 0.195921 July\n", "208 1 0.014809 a\n", "208 2 0.034555 a\n", "208 4 0.004936 a\n", "208 6 0.024682 a\n", "208 7 0.646673 a\n", "208 8 0.009873 a\n", "208 9 0.034555 a\n", "208 10 0.024682 a\n", "208 11 0.019746 a\n", "208 12 0.177712 a\n", "8034 2 0.075146 a.\n", "8034 4 0.300583 a.\n", "8034 6 0.075146 a.\n", "8034 8 0.450875 a.\n", "8034 11 0.075146 a.\n", "390 2 0.163577 academic\n", "390 3 0.061341 academic\n", "390 4 0.040894 academic\n", "... ... ... ...\n", "216 12 0.052586 xml\n", "301 1 0.203402 xp\n", "301 2 0.203402 xp\n", "301 4 0.043586 xp\n", "301 5 0.014529 xp\n", "301 6 0.014529 xp\n", "301 7 0.014529 xp\n", "301 8 0.029057 xp\n", "301 9 0.290575 xp\n", "301 10 0.130759 xp\n", "301 11 0.014529 xp\n", "301 12 0.072644 xp\n", "8045 4 0.299898 y.\n", "8045 6 0.074974 y.\n", "8045 8 0.599795 y.\n", "8045 11 0.074974 y.\n", "1106 2 0.014644 yarn\n", "1106 7 0.058574 yarn\n", "1106 9 0.907903 yarn\n", "14167 5 0.459152 yii2\n", "4025 4 0.155626 z.\n", "4025 8 0.622505 z.\n", "9780 11 0.484692 zohooffice\n", "1897 1 0.070001 zookeeper\n", "1897 9 0.921682 zookeeper\n", "2828 1 0.129068 ◆\n", "2828 3 0.387203 ◆\n", "2828 7 0.258136 ◆\n", "2828 9 0.129068 ◆\n", "1332 2 0.688122 ♣\n", "\n", "[2782 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "num_topics =12\n", "vis = pyLDAvis.gensim.prepare(lda_final, doc_term_matrix, dictionary,sort_topics=False)\n", "pyLDAvis.save_html(vis,f'pyLDAvis_{num_topics}.html')\n", "vis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How to pick relevant words in each topic?" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "token_percent = vis.topic_coordinates.sort_values(by='topics').loc[:,['topics','Freq']]" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "scrolled": false }, "outputs": [], "source": [ "def get_relevant_words(vis,lam=0.3,topn=10):\n", " a = vis.topic_info\n", " a['finalscore'] = a['logprob']*lam+(1-lam)*a['loglift']\n", " a = a.loc[:,['Category','Term','finalscore']].groupby(['Category'])\\\n", " .apply(lambda x: x.sort_values(by='finalscore',ascending=False).head(topn))\n", " a = a.loc[:,'Term'].reset_index().loc[:,['Category','Term']]\n", " a = a[a['Category']!='Default']\n", " a = a.to_dict('split')['data']\n", " d ={}\n", " for k,v in a: \n", " if k not in d.keys():\n", " d[k] =set()\n", " d[k].add(v)\n", " else:\n", " d[k].add(v)\n", " finalData = pd.DataFrame([],columns=['Topic','words with Relevance'])\n", " finalData['Topic']=d.keys()\n", " finalData['words with Relevance']=d.values()\n", " return finalData" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Topicwords with Relevance
11Topic9{sqoop, kafka, cassandra, hdfs, hbase, hive, pig, impala, flume, oozie}
1Topic10{jquery, xml, css, eclipse, html, c, ajax, django, javascript, php}
3Topic12{sas, powerpoint, python, r, excel, matlab, spss, sql, word, stata}
9Topic7{classification, svm, learn, k, scikit, pandas, regression, matplotlib, scipy, numpy}
4Topic2{mssuite2012, tmux, spark2.0, databaseand, tableau_8, hive2.8, windows7/8/10, hadoop2, electronic, python2.7/3.3}
0Topic1{jboss, weblogic, ant, rmi, struts, soap, jsf, uml, jms, cvs}
7Topic5{linearandnon, hplc, mexico, gc, community, volunteer, tika, excelandword, pune, ontology}
8Topic6{ggplot2and, gridsearchand, oncology, modeltuning/, ddl, stepwise, filter_methods, hiv, pigand, dml}
5Topic3{magento, public_health, copy, campaign, hebrew, lucid, oracle_rdbms, ubuntuand, spatialdata, linearalgebra}
6Topic4{dataquality, erecruit, d.c., brassring, google_earth, october, scorecards, bullhorn, icims, taleo}
2Topic11{french, spanish, native, testing/, hootsuite, english, chinese, mandarin, cantonese, indesign}
10Topic8{s., d., l., n., m., y., jobvite, g., j., p.}
\n", "
" ], "text/plain": [ " Topic \\\n", "11 Topic9 \n", "1 Topic10 \n", "3 Topic12 \n", "9 Topic7 \n", "4 Topic2 \n", "0 Topic1 \n", "7 Topic5 \n", "8 Topic6 \n", "5 Topic3 \n", "6 Topic4 \n", "2 Topic11 \n", "10 Topic8 \n", "\n", " words with Relevance \n", "11 {sqoop, kafka, cassandra, hdfs, hbase, hive, pig, impala, flume, oozie} \n", "1 {jquery, xml, css, eclipse, html, c, ajax, django, javascript, php} \n", "3 {sas, powerpoint, python, r, excel, matlab, spss, sql, word, stata} \n", "9 {classification, svm, learn, k, scikit, pandas, regression, matplotlib, scipy, numpy} \n", "4 {mssuite2012, tmux, spark2.0, databaseand, tableau_8, hive2.8, windows7/8/10, hadoop2, electronic, python2.7/3.3} \n", "0 {jboss, weblogic, ant, rmi, struts, soap, jsf, uml, jms, cvs} \n", "7 {linearandnon, hplc, mexico, gc, community, volunteer, tika, excelandword, pune, ontology} \n", "8 {ggplot2and, gridsearchand, oncology, modeltuning/, ddl, stepwise, filter_methods, hiv, pigand, dml} \n", "5 {magento, public_health, copy, campaign, hebrew, lucid, oracle_rdbms, ubuntuand, spatialdata, linearalgebra} \n", "6 {dataquality, erecruit, d.c., brassring, google_earth, october, scorecards, bullhorn, icims, taleo} \n", "2 {french, spanish, native, testing/, hootsuite, english, chinese, mandarin, cantonese, indesign} \n", "10 {s., d., l., n., m., y., jobvite, g., j., p.} " ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_relevant_words(vis,0.3).merge(finalData,how='left',on ='Topic').sort_values(by='cs',ascending=False).iloc[:,[0,1]]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }