{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "- [gensim doc2vec imdb full notebook](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb)\n", "- [gensim doc2vec tests](https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/test/test_doc2vec.py)\n", "- [doc2vec tutorial](http://rare-technologies.com/doc2vec-tutorial/)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# encoding=utf8 \n", "\n", "from __future__ import division\n", "import sys \n", "\n", "reload(sys) \n", "sys.setdefaultencoding('utf8')\n", "import locale\n", "import glob\n", "import os.path\n", "import requests\n", "import tarfile\n", "import io\n", "\n", "dirname = 'aclImdb'\n", "filename = 'aclImdb_v1.tar.gz'\n", "# locale.setlocale(locale.LC_ALL, 'C')\n", "\n", "\n", "# Convert text to lower-case and strip punctuation/symbols from words\n", "def normalize_text(text):\n", " norm_text = text.lower()\n", "\n", " # Replace breaks with spaces\n", " norm_text = norm_text.replace('
', ' ')\n", "\n", " # Pad punctuation with spaces on both sides\n", " for char in ['.', '\"', ',', '(', ')', '!', '?', ';', ':']:\n", " norm_text = norm_text.replace(char, ' ' + char + ' ')\n", "\n", " return norm_text\n", "\n", "\n", "if not os.path.isfile('aclImdb/alldata-id.txt'):\n", " if not os.path.isdir(dirname):\n", " if not os.path.isfile(filename):\n", " # Download IMDB archive\n", " url = 'http://ai.stanford.edu/~amaas/data/sentiment/' + filename\n", " r = requests.get(url)\n", " with open(filename, 'wb') as f:\n", " f.write(r.content)\n", "\n", " tar = tarfile.open(filename, mode='r')\n", " tar.extractall()\n", " tar.close()\n", "\n", " # Concat and normalize test/train data\n", " folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']\n", " alldata = u''\n", "\n", " for fol in folders:\n", " temp = u''\n", " output = fol.replace('/', '-') + '.txt'\n", "\n", " # Is there a better pattern to use?\n", " txt_files = glob.glob('/'.join([dirname, fol, '*.txt']))\n", "\n", " for txt in txt_files:\n", " with io.open(txt, 'r', encoding='utf-8') as t:\n", " control_chars = [chr(0x85)]\n", " t_clean = t.read()\n", "\n", " for c in control_chars:\n", " t_clean = t_clean.replace(unicode(c,errors='ignore') , ' ')\n", "\n", " temp += t_clean\n", "\n", " temp += \"\\n\"\n", "\n", " temp_norm = normalize_text(temp)\n", " with io.open('/'.join([dirname, output]), 'w', encoding='utf-8') as n:\n", " n.write(temp_norm)\n", "\n", " alldata += temp_norm\n", "\n", " with io.open('/'.join([dirname, 'alldata-id.txt']), 'w', encoding='utf-8') as f:\n", " for idx, line in enumerate(alldata.splitlines()):\n", " num_line = \"_*{0} {1}\\n\".format(idx, line)\n", " f.write(num_line)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os.path\n", "assert os.path.isfile(\"aclImdb/alldata-id.txt\"), \"alldata-id.txt unavailable\"" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import gensim\n", "from gensim.models.doc2vec import TaggedDocument\n", "from collections import namedtuple\n", "\n", "SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')\n", "\n", "alldocs = [] # will hold all docs in original order\n", "with io.open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:\n", " for line_no, line in enumerate(alldata):\n", " print(line_no)\n", " tokens = gensim.utils.to_unicode(line).split()\n", " words = tokens[1:]\n", " tags = [line_no] # `tags = [tokens[0]]` would also work at extra memory cost\n", " try:\n", " split = ['train','test','extra','extra'][line_no//25000] # 25k train, 25k test, 25k extra\n", " sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown\n", " alldocs.append(SentimentDocument(words, tags, split, sentiment))\n", " except IndexError:\n", " print('error:'+str(line_no))\n", " pass\n", " \n", "\n", "\n", "train_docs = [doc for doc in alldocs if doc.split == 'train']\n", "test_docs = [doc for doc in alldocs if doc.split == 'test']\n", "doc_list = alldocs[:] # for reshuffling per pass\n", "\n", "print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from gensim.models import Doc2Vec\n", "import gensim.models.doc2vec\n", "from collections import OrderedDict\n", "import multiprocessing\n", "\n", "cores = multiprocessing.cpu_count()\n", "assert gensim.models.doc2vec.FAST_VERSION > -1, \"this will be painfully slow otherwise\"\n", "\n", "simple_models = [\n", " # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size\n", " Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),\n", " # PV-DBOW \n", " Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),\n", " # PV-DM w/average\n", " Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),\n", "]\n", "\n", "# speed setup by sharing results of 1st model's vocabulary scan\n", "simple_models[0].build_vocab(alldocs) # PV-DM/concat requires one special NULL word so it serves as template\n", "print(simple_models[0])\n", "for model in simple_models[1:]:\n", " model.reset_from(simple_models[0])\n", " print(model)\n", "\n", "models_by_name = OrderedDict((str(model), model) for model in simple_models)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from gensim.test.test_doc2vec import ConcatenatedDoc2Vec\n", "models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])\n", "models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "import statsmodels.api as sm\n", "from random import sample\n", "\n", "# for timing\n", "from contextlib import contextmanager\n", "from timeit import default_timer\n", "import time \n", "\n", "@contextmanager\n", "def elapsed_timer():\n", " start = default_timer()\n", " elapser = lambda: default_timer() - start\n", " yield lambda: elapser()\n", " end = default_timer()\n", " elapser = lambda: end-start\n", " \n", "def logistic_predictor_from_data(train_targets, train_regressors):\n", " logit = sm.Logit(train_targets, train_regressors)\n", " predictor = logit.fit(disp=0)\n", " #print(predictor.summary())\n", " return predictor\n", "\n", "def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):\n", " \"\"\"Report error rate on test_doc sentiments, using supplied model and train_docs\"\"\"\n", "\n", " train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])\n", " train_regressors = sm.add_constant(train_regressors)\n", " predictor = logistic_predictor_from_data(train_targets, train_regressors)\n", "\n", " test_data = test_set\n", " if infer:\n", " if infer_subsample < 1.0:\n", " test_data = sample(test_data, int(infer_subsample * len(test_data)))\n", " test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]\n", " else:\n", " test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]\n", " test_regressors = sm.add_constant(test_regressors)\n", " \n", " # predict & evaluate\n", " test_predictions = predictor.predict(test_regressors)\n", " corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])\n", " errors = len(test_predictions) - corrects\n", " error_rate = float(errors) / len(test_predictions)\n", " return (error_rate, errors, len(test_predictions), predictor)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from collections import defaultdict\n", "best_error = defaultdict(lambda :1.0) # to selectively-print only best errors achieved" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[0mtrain_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0malpha\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmin_alpha\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0malpha\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0malpha\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0melapsed_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0melapsed\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0mtrain_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 18\u001b[0m \u001b[0mduration\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'%.1f'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0melapsed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/gensim/models/word2vec.pyc\u001b[0m in \u001b[0;36mtrain\u001b[1;34m(self, sentences, total_words, word_count, total_examples, queue_factor, report_delay)\u001b[0m\n\u001b[0;32m 813\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 814\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[0munfinished_worker_count\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 815\u001b[1;33m \u001b[0mreport\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mprogress_queue\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# blocks if workers too slow\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 816\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mreport\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# a thread reporting that it finished\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 817\u001b[0m \u001b[0munfinished_worker_count\u001b[0m \u001b[1;33m-=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/usr/lib/python2.7/Queue.pyc\u001b[0m in \u001b[0;36mget\u001b[1;34m(self, block, timeout)\u001b[0m\n\u001b[0;32m 166\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 167\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_qsize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 168\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnot_empty\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 169\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;33m<\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 170\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"'timeout' must be a non-negative number\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/usr/lib/python2.7/threading.pyc\u001b[0m in \u001b[0;36mwait\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 337\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 338\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 339\u001b[1;33m \u001b[0mwaiter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 340\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0m__debug__\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_note\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"%s.wait(): got it\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "from random import shuffle\n", "import datetime\n", "\n", "alpha, min_alpha, passes = (0.025, 0.001, 20)\n", "alpha_delta = (alpha - min_alpha) / passes\n", "\n", "print(\"START %s\" % datetime.datetime.now())\n", "\n", "for epoch in range(passes):\n", " shuffle(doc_list) # shuffling gets best results\n", " \n", " for name, train_model in models_by_name.items():\n", " # train\n", " duration = 'na'\n", " train_model.alpha, train_model.min_alpha = alpha, alpha\n", " with elapsed_timer() as elapsed:\n", " train_model.train(doc_list)\n", " duration = '%.1f' % elapsed()\n", " \n", " # evaluate\n", " eval_duration = ''\n", " with elapsed_timer() as eval_elapsed:\n", " err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)\n", " eval_duration = '%.1f' % eval_elapsed()\n", " best_indicator = ' '\n", " if err <= best_error[name]:\n", " best_error[name] = err\n", " best_indicator = '*' \n", " print(\"%s%f : %i passes : %s %ss %ss\" % (best_indicator, err, epoch + 1, name, duration, eval_duration))\n", "\n", " if ((epoch + 1) % 5) == 0 or epoch == 0:\n", " eval_duration = ''\n", " with elapsed_timer() as eval_elapsed:\n", " infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)\n", " eval_duration = '%.1f' % eval_elapsed()\n", " best_indicator = ' '\n", " if infer_err < best_error[name + '_inferred']:\n", " best_error[name + '_inferred'] = infer_err\n", " best_indicator = '*'\n", " print(\"%s%f : %i passes : %s %ss %ss\" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))\n", "\n", " print('completed pass %i at alpha %f' % (epoch + 1, alpha))\n", " alpha -= alpha_delta\n", " \n", "print(\"END %s\" % str(datetime.datetime.now()))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# print best error rates achieved\n", "for rate, name in sorted((rate, name) for name, rate in best_error.items()):\n", " print(\"%f %s\" % (rate, name))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "doc_id = np.random.randint(simple_models[0].docvecs.count) # pick random doc; re-run cell for more examples\n", "print('for doc %d...' % doc_id)\n", "for model in simple_models:\n", " inferred_docvec = model.infer_vector(alldocs[doc_id].words)\n", " print('%s:\\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import random\n", "\n", "doc_id = np.random.randint(simple_models[0].docvecs.count) # pick random doc, re-run cell for more examples\n", "model = random.choice(simple_models) # and a random model\n", "sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count) # get *all* similar documents\n", "print(u'TARGET (%d): «%s»\\n' % (doc_id, ' '.join(alldocs[doc_id].words)))\n", "print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\\n' % model)\n", "for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:\n", " print(u'%s %s: «%s»\\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "word_models = simple_models[:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\n", "import random\n", "from IPython.display import HTML\n", "# pick a random word with a suitable number of occurences\n", "while True:\n", " word = random.choice(word_models[0].index2word)\n", " if word_models[0].vocab[word].count > 10:\n", " break\n", "# or uncomment below line, to just pick a word from the relevant domain:\n", "#word = 'comedy/drama'\n", "similars_per_model = [str(model.most_similar(word, topn=20)).replace('), ','),
\\n') for model in word_models]\n", "similar_table = (\"
\" +\n", " \"\".join([str(model) for model in word_models]) + \n", " \"
\" +\n", " \"\".join(similars_per_model) +\n", " \"
\")\n", "print(\"most similar words for '%s' (%d occurences)\" % (word, simple_models[0].vocab[word].count))\n", "HTML(similar_table)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# assuming something like\n", "# https://word2vec.googlecode.com/svn/trunk/questions-words.txt \n", "# is in local directory\n", "# note: this takes many minutes\n", "for model in word_models:\n", " sections = model.accuracy('questions-words.txt')\n", " correct, incorrect = len(sections[-1]['correct']), len(sections[-1]['incorrect'])\n", " print('%s: %0.2f%% correct (%d of %d)' % (model, float(correct*100)/(correct+incorrect), correct, correct+incorrect))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }