{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Gensim"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Gensim is a free Python library designed to automatically extract semantic topics from documents, as efficiently (computer-wise) and painlessly (human-wise) as possible.\n",
    "\n",
    "Gensim is designed to process raw, unstructured digital texts (“plain text”). The algorithms in gensim, such as Latent Semantic Analysis, Latent Dirichlet Allocation or Random Projections, discover the semantic structure of documents by examining statistical word co-occurrence patterns within a corpus of training documents. These algorithms are unsupervised, which means no human input is necessary – you only need a corpus of plain text documents.\n",
    "\n",
    "Once these statistical patterns are found, any plain text document can be succinctly expressed in the new, semantic representation, and queried for topical similarity against other documents.\n",
    "\n",
    "Library documentation: https://radimrehurek.com/gensim/index.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare the corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# all imports used by this notebook live in this one cell\n",
    "from collections import defaultdict\n",
    "from pprint import pprint\n",
    "\n",
    "from gensim import corpora, matutils, models, similarities\n",
    "\n",
    "# toy corpus: nine short document titles\n",
    "documents = [\n",
    "    \"Human machine interface for lab abc computer applications\",\n",
    "    \"A survey of user opinion of computer system response time\",\n",
    "    \"The EPS user interface management system\",\n",
    "    \"System and human system engineering testing of EPS\",\n",
    "    \"Relation of user perceived response time to error measurement\",\n",
    "    \"The generation of random binary unordered trees\",\n",
    "    \"The intersection graph of paths in trees\",\n",
    "    \"Graph minors IV Widths of trees and well quasi ordering\",\n",
    "    \"Graph minors A survey\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# lowercase each document, split it on whitespace and drop common words\n",
    "stoplist = set('for a of the and to in'.split())\n",
    "texts = [[word for word in document.lower().split() if word not in stoplist]\n",
    "         for document in documents]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# count how often each token occurs across all documents ...\n",
    "frequency = defaultdict(int)\n",
    "for text in texts:\n",
    "    for token in text:\n",
    "        frequency[token] += 1\n",
    "\n",
    "# ... and remove words that appear only once\n",
    "texts = [[token for token in text if frequency[token] > 1]\n",
    "         for text in texts]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['human', 'interface', 'computer'],\n",
      " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n",
      " ['eps', 'user', 'interface', 'system'],\n",
      " ['system', 'human', 'system', 'eps'],\n",
      " ['user', 'response', 'time'],\n",
      " ['trees'],\n",
      " ['graph', 'trees'],\n",
      " ['graph', 'minors', 'trees'],\n",
      " ['graph', 'minors', 'survey']]\n"
     ]
    }
   ],
   "source": [
    "# one token list per document, in the same order as `documents`\n",
    "pprint(texts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dictionary and bag-of-words vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)\n"
     ]
    }
   ],
   "source": [
    "# create a dictionary mapping between ids and unique words\n",
    "dictionary = corpora.Dictionary(texts)\n",
    "print(dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{u'minors': 11, u'graph': 10, u'system': 5, u'trees': 9, u'eps': 8, u'computer': 0, u'survey': 4, u'user': 7, u'human': 1, u'time': 6, u'interface': 2, u'response': 3}\n"
     ]
    }
   ],
   "source": [
    "# mapping from each word to its integer id\n",
    "print(dictionary.token2id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[(0, 1), (1, 1), (2, 1)],\n",
      " [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],\n",
      " [(2, 1), (5, 1), (7, 1), (8, 1)],\n",
      " [(1, 1), (5, 2), (8, 1)],\n",
      " [(3, 1), (6, 1), (7, 1)],\n",
      " [(9, 1)],\n",
      " [(9, 1), (10, 1)],\n",
      " [(9, 1), (10, 1), (11, 1)],\n",
      " [(4, 1), (10, 1), (11, 1)]]\n"
     ]
    }
   ],
   "source": [
    "# convert the text to a bag-of-words corpus: one list of (token id, count) pairs per document\n",
    "corpus = [dictionary.doc2bow(text) for text in texts]\n",
    "pprint(corpus)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Converting to and from NumPy / SciPy matrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 1. 1. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [ 1. 0. 0. 1. 0. 0. 0. 0. 0.]\n",
      " [ 1. 0. 1. 0. 0. 0. 0. 0. 0.]\n",
      " [ 0. 1. 0. 0. 1. 0. 0. 0. 0.]\n",
      " [ 0. 1. 0. 0. 0. 0. 0. 0. 1.]\n",
      " [ 0. 1. 1. 2. 0. 0. 0. 0. 0.]\n",
      " [ 0. 1. 0. 0. 1. 0. 0. 0. 0.]\n",
      " [ 0. 1. 1. 0. 1. 0. 0. 0. 0.]\n",
      " [ 0. 0. 1. 1. 0. 0. 0. 0. 0.]\n",
      " [ 0. 0. 0. 0. 0. 1. 1. 1. 0.]\n",
      " [ 0. 0. 0. 0. 0. 0. 1. 1. 1.]\n",
      " [ 0. 0. 0. 0. 0. 0. 0. 1. 1.]]\n"
     ]
    }
   ],
   "source": [
    "# can convert to numpy/scipy matrices and back\n",
    "# num_terms is taken from the dictionary instead of being hard-coded;\n",
    "# the dense result has one row per term and one column per document\n",
    "numpy_matrix = matutils.corpus2dense(corpus, num_terms=len(dictionary))\n",
    "print(numpy_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# corpus -> scipy sparse (CSC) matrix\n",
    "scipy_csc_matrix = matutils.corpus2csc(corpus)\n",
    "\n",
    "# wrap the dense and sparse matrices back into corpus objects\n",
    "numpy_corpus = matutils.Dense2Corpus(numpy_matrix)\n",
    "scipy_corpus = matutils.Sparse2Corpus(scipy_csc_matrix)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transformations: TF-IDF, LSI, random projections and LDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# initialize a TF-IDF transformation from the bag-of-words corpus\n",
    "tfidf = models.TfidfModel(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]\n",
      "[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]\n",
      "[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]\n",
      "[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]\n",
      "[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]\n",
      "[(9, 1.0)]\n",
      "[(9, 0.7071067811865475), (10, 0.7071067811865475)]\n",
      "[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]\n",
      "[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]\n"
     ]
    }
   ],
   "source": [
    "# apply it to the whole corpus\n",
    "corpus_tfidf = tfidf[corpus]\n",
    "for doc_tfidf in corpus_tfidf:\n",
    "    print(doc_tfidf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'0.703*\"trees\" + 0.538*\"graph\" + 0.402*\"minors\" + 0.187*\"survey\" + 0.061*\"system\" + 0.060*\"response\" + 0.060*\"time\" + 0.058*\"user\" + 0.049*\"computer\" + 0.035*\"interface\"',\n",
       " u'-0.460*\"system\" + -0.373*\"user\" + -0.332*\"eps\" + -0.328*\"interface\" + -0.320*\"time\" + -0.320*\"response\" + -0.293*\"computer\" + -0.280*\"human\" + -0.171*\"survey\" + 0.161*\"trees\"']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# initialize an LSI transformation with two topics on top of the TF-IDF corpus\n",
    "lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)\n",
    "lsi.print_topics(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.066007833960902734), (1, -0.52007033063618424)]\n",
      "[(0, 0.19667592859142366), (1, -0.76095631677000475)]\n",
      "[(0, 0.089926399724463812), (1, -0.72418606267525032)]\n",
      "[(0, 0.075858476521781015), (1, -0.63205515860034267)]\n",
      "[(0, 0.10150299184980033), (1, -0.57373084830029586)]\n",
      "[(0, 0.70321089393783154), (1, 0.16115180214025748)]\n",
      "[(0, 0.87747876731198349), (1, 0.16758906864659354)]\n",
      "[(0, 0.90986246868185783), (1, 0.14086553628718948)]\n",
      "[(0, 0.61658253505692784), (1, -0.053929075663894252)]\n"
     ]
    }
   ],
   "source": [
    "# create a double wrapper over the original corpus: bow->tfidf->lsi\n",
    "corpus_lsi = lsi[corpus_tfidf]\n",
    "for doc_lsi in corpus_lsi:\n",
    "    print(doc_lsi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# random projection model\n",
    "rp = models.RpModel(corpus_tfidf, num_topics=500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n"
     ]
    }
   ],
   "source": [
    "# latent dirichlet allocation model\n",
    "# NOTE: gensim warns (see output) that there are too few updates for training\n",
    "# to converge; increase the number of passes or iterations for real use\n",
    "lda = models.LdaModel(corpus, id2word=dictionary, num_topics=100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Similarity queries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.079104751174447263), (1, -0.5732835243079395)]\n"
     ]
    }
   ],
   "source": [
    "# convert a phrase into the LSI model space\n",
    "query = \"Human computer interaction\"\n",
    "vec_bow = dictionary.doc2bow(query.lower().split())\n",
    "vec_lsi = lsi[vec_bow]  # convert the query to LSI space\n",
    "print(vec_lsi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# index the transformed corpus from earlier\n",
    "# num_features is given explicitly (one feature per LSI topic) so gensim does not\n",
    "# have to scan the corpus to determine it\n",
    "index = similarities.MatrixSimilarity(corpus_lsi, num_features=lsi.num_topics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.99994081), (1, 0.99330217), (2, 0.99990785), (3, 0.99984384), (4, 0.9992786), (5, -0.08804217), (6, -0.0515742), (7, -0.016480923), (8, 0.22248439)]\n"
     ]
    }
   ],
   "source": [
    "# perform a similarity query against the corpus using cosine similarity\n",
    "sims = index[vec_lsi]\n",
    "print(list(enumerate(sims)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.99994081),\n",
      " (2, 0.99990785),\n",
      " (3, 0.99984384),\n",
      " (4, 0.9992786),\n",
      " (1, 0.99330217),\n",
      " (8, 0.22248439),\n",
      " (7, -0.016480923),\n",
      " (6, -0.0515742),\n",
      " (5, -0.08804217)]\n"
     ]
    }
   ],
   "source": [
    "# display as (document index, similarity) pairs, most similar first\n",
    "# a new name is used so `sims` is left untouched and this cell can be re-run safely\n",
    "sims_sorted = sorted(enumerate(sims), key=lambda item: -item[1])\n",
    "pprint(sims_sorted)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}