{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- [gensim doc2vec imdb full notebook](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb)\n",
    "- [gensim doc2vec tests](https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/test/test_doc2vec.py)\n",
    "- [doc2vec tutorial](http://rare-technologies.com/doc2vec-tutorial/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# encoding=utf8  \n",
    "\n",
    "from __future__ import division\n",
    "import sys  \n",
    "\n",
    "reload(sys)  \n",
    "sys.setdefaultencoding('utf8')\n",
    "import locale\n",
    "import glob\n",
    "import os.path\n",
    "import requests\n",
    "import tarfile\n",
    "import io\n",
    "\n",
    "dirname = 'aclImdb'\n",
    "filename = 'aclImdb_v1.tar.gz'\n",
    "# locale.setlocale(locale.LC_ALL, 'C')\n",
    "\n",
    "\n",
    "# Convert text to lower-case and strip punctuation/symbols from words\n",
    "def normalize_text(text):\n",
    "    norm_text = text.lower()\n",
    "\n",
    "    # Replace breaks with spaces\n",
    "    norm_text = norm_text.replace('<br />', ' ')\n",
    "\n",
    "    # Pad punctuation with spaces on both sides\n",
    "    for char in ['.', '\"', ',', '(', ')', '!', '?', ';', ':']:\n",
    "        norm_text = norm_text.replace(char, ' ' + char + ' ')\n",
    "\n",
    "    return norm_text\n",
    "\n",
    "\n",
    "if not os.path.isfile('aclImdb/alldata-id.txt'):\n",
    "    if not os.path.isdir(dirname):\n",
    "        if not os.path.isfile(filename):\n",
    "            # Download IMDB archive\n",
    "            url = 'http://ai.stanford.edu/~amaas/data/sentiment/' + filename\n",
    "            r = requests.get(url)\n",
    "            with open(filename, 'wb') as f:\n",
    "                f.write(r.content)\n",
    "\n",
    "        tar = tarfile.open(filename, mode='r')\n",
    "        tar.extractall()\n",
    "        tar.close()\n",
    "\n",
    "    # Concat and normalize test/train data\n",
    "    folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']\n",
    "    alldata = u''\n",
    "\n",
    "    for fol in folders:\n",
    "        temp = u''\n",
    "        output = fol.replace('/', '-') + '.txt'\n",
    "\n",
    "        # Is there a better pattern to use?\n",
    "        txt_files = glob.glob('/'.join([dirname, fol, '*.txt']))\n",
    "\n",
    "        for txt in txt_files:\n",
    "            with io.open(txt, 'r', encoding='utf-8') as t:\n",
    "                control_chars = [chr(0x85)]\n",
    "                t_clean = t.read()\n",
    "\n",
    "                for c in control_chars:\n",
    "                    t_clean = t_clean.replace(unicode(c,errors='ignore') , ' ')\n",
    "\n",
    "                temp += t_clean\n",
    "\n",
    "            temp += \"\\n\"\n",
    "\n",
    "        temp_norm = normalize_text(temp)\n",
    "        with io.open('/'.join([dirname, output]), 'w', encoding='utf-8') as n:\n",
    "            n.write(temp_norm)\n",
    "\n",
    "        alldata += temp_norm\n",
    "\n",
    "    with io.open('/'.join([dirname, 'alldata-id.txt']), 'w', encoding='utf-8') as f:\n",
    "        for idx, line in enumerate(alldata.splitlines()):\n",
    "            num_line = \"_*{0} {1}\\n\".format(idx, line)\n",
    "            f.write(num_line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os.path\n",
    "assert os.path.isfile(\"aclImdb/alldata-id.txt\"), \"alldata-id.txt unavailable\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import gensim\n",
    "from gensim.models.doc2vec import TaggedDocument\n",
    "from collections import namedtuple\n",
    "\n",
    "SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')\n",
    "\n",
    "alldocs = []  # will hold all docs in original order\n",
    "with io.open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:\n",
    "    for line_no, line in enumerate(alldata):\n",
    "        print(line_no)\n",
    "        tokens = gensim.utils.to_unicode(line).split()\n",
    "        words = tokens[1:]\n",
    "        tags = [line_no] # `tags = [tokens[0]]` would also work at extra memory cost\n",
    "        try:\n",
    "            split = ['train','test','extra','extra'][line_no//25000]  # 25k train, 25k test, 25k extra\n",
    "            sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown\n",
    "            alldocs.append(SentimentDocument(words, tags, split, sentiment))\n",
    "        except IndexError:\n",
    "            print('error:'+str(line_no))\n",
    "            pass\n",
    "        \n",
    "\n",
    "\n",
    "train_docs = [doc for doc in alldocs if doc.split == 'train']\n",
    "test_docs = [doc for doc in alldocs if doc.split == 'test']\n",
    "doc_list = alldocs[:]  # for reshuffling per pass\n",
    "\n",
    "print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from gensim.models import Doc2Vec\n",
    "import gensim.models.doc2vec\n",
    "from collections import OrderedDict\n",
    "import multiprocessing\n",
    "\n",
    "cores = multiprocessing.cpu_count()\n",
    "assert gensim.models.doc2vec.FAST_VERSION > -1, \"this will be painfully slow otherwise\"\n",
    "\n",
    "simple_models = [\n",
    "    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size\n",
    "    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),\n",
    "    # PV-DBOW \n",
    "    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),\n",
    "    # PV-DM w/average\n",
    "    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),\n",
    "]\n",
    "\n",
    "# speed setup by sharing results of 1st model's vocabulary scan\n",
    "simple_models[0].build_vocab(alldocs)  # PV-DM/concat requires one special NULL word so it serves as template\n",
    "print(simple_models[0])\n",
    "for model in simple_models[1:]:\n",
    "    model.reset_from(simple_models[0])\n",
    "    print(model)\n",
    "\n",
    "models_by_name = OrderedDict((str(model), model) for model in simple_models)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from gensim.test.test_doc2vec import ConcatenatedDoc2Vec\n",
    "models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])\n",
    "models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import statsmodels.api as sm\n",
    "from random import sample\n",
    "\n",
    "# for timing\n",
    "from contextlib import contextmanager\n",
    "from timeit import default_timer\n",
    "import time \n",
    "\n",
    "@contextmanager\n",
    "def elapsed_timer():\n",
    "    start = default_timer()\n",
    "    elapser = lambda: default_timer() - start\n",
    "    yield lambda: elapser()\n",
    "    end = default_timer()\n",
    "    elapser = lambda: end-start\n",
    "    \n",
    "def logistic_predictor_from_data(train_targets, train_regressors):\n",
    "    logit = sm.Logit(train_targets, train_regressors)\n",
    "    predictor = logit.fit(disp=0)\n",
    "    #print(predictor.summary())\n",
    "    return predictor\n",
    "\n",
    "def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):\n",
    "    \"\"\"Report error rate on test_doc sentiments, using supplied model and train_docs\"\"\"\n",
    "\n",
    "    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])\n",
    "    train_regressors = sm.add_constant(train_regressors)\n",
    "    predictor = logistic_predictor_from_data(train_targets, train_regressors)\n",
    "\n",
    "    test_data = test_set\n",
    "    if infer:\n",
    "        if infer_subsample < 1.0:\n",
    "            test_data = sample(test_data, int(infer_subsample * len(test_data)))\n",
    "        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]\n",
    "    else:\n",
    "        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]\n",
    "    test_regressors = sm.add_constant(test_regressors)\n",
    "    \n",
    "    # predict & evaluate\n",
    "    test_predictions = predictor.predict(test_regressors)\n",
    "    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])\n",
    "    errors = len(test_predictions) - corrects\n",
    "    error_rate = float(errors) / len(test_predictions)\n",
    "    return (error_rate, errors, len(test_predictions), predictor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "best_error = defaultdict(lambda :1.0)  # to selectively-print only best errors achieved"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-19-ae19afedce59>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m     15\u001b[0m         \u001b[0mtrain_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0malpha\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmin_alpha\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0malpha\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0malpha\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     16\u001b[0m         \u001b[1;32mwith\u001b[0m \u001b[0melapsed_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0melapsed\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m             \u001b[0mtrain_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     18\u001b[0m             \u001b[0mduration\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'%.1f'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0melapsed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     19\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/gensim/models/word2vec.pyc\u001b[0m in \u001b[0;36mtrain\u001b[1;34m(self, sentences, total_words, word_count, total_examples, queue_factor, report_delay)\u001b[0m\n\u001b[0;32m    813\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    814\u001b[0m         \u001b[1;32mwhile\u001b[0m \u001b[0munfinished_worker_count\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 815\u001b[1;33m             \u001b[0mreport\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mprogress_queue\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m  \u001b[1;31m# blocks if workers too slow\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    816\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mreport\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m  \u001b[1;31m# a thread reporting that it finished\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    817\u001b[0m                 \u001b[0munfinished_worker_count\u001b[0m \u001b[1;33m-=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/usr/lib/python2.7/Queue.pyc\u001b[0m in \u001b[0;36mget\u001b[1;34m(self, block, timeout)\u001b[0m\n\u001b[0;32m    166\u001b[0m             \u001b[1;32melif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    167\u001b[0m                 \u001b[1;32mwhile\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_qsize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 168\u001b[1;33m                     \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnot_empty\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    169\u001b[0m             \u001b[1;32melif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;33m<\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    170\u001b[0m                 \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"'timeout' must be a non-negative number\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/usr/lib/python2.7/threading.pyc\u001b[0m in \u001b[0;36mwait\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m    337\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m    \u001b[1;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    338\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 339\u001b[1;33m                 \u001b[0mwaiter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    340\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0m__debug__\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    341\u001b[0m                     \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_note\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"%s.wait(): got it\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "from random import shuffle\n",
    "import datetime\n",
    "\n",
    "alpha, min_alpha, passes = (0.025, 0.001, 20)\n",
    "alpha_delta = (alpha - min_alpha) / passes\n",
    "\n",
    "print(\"START %s\" % datetime.datetime.now())\n",
    "\n",
    "for epoch in range(passes):\n",
    "    shuffle(doc_list)  # shuffling gets best results\n",
    "    \n",
    "    for name, train_model in models_by_name.items():\n",
    "        # train\n",
    "        duration = 'na'\n",
    "        train_model.alpha, train_model.min_alpha = alpha, alpha\n",
    "        with elapsed_timer() as elapsed:\n",
    "            train_model.train(doc_list)\n",
    "            duration = '%.1f' % elapsed()\n",
    "            \n",
    "        # evaluate\n",
    "        eval_duration = ''\n",
    "        with elapsed_timer() as eval_elapsed:\n",
    "            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)\n",
    "        eval_duration = '%.1f' % eval_elapsed()\n",
    "        best_indicator = ' '\n",
    "        if err <= best_error[name]:\n",
    "            best_error[name] = err\n",
    "            best_indicator = '*' \n",
    "        print(\"%s%f : %i passes : %s %ss %ss\" % (best_indicator, err, epoch + 1, name, duration, eval_duration))\n",
    "\n",
    "        if ((epoch + 1) % 5) == 0 or epoch == 0:\n",
    "            eval_duration = ''\n",
    "            with elapsed_timer() as eval_elapsed:\n",
    "                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)\n",
    "            eval_duration = '%.1f' % eval_elapsed()\n",
    "            best_indicator = ' '\n",
    "            if infer_err < best_error[name + '_inferred']:\n",
    "                best_error[name + '_inferred'] = infer_err\n",
    "                best_indicator = '*'\n",
    "            print(\"%s%f : %i passes : %s %ss %ss\" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))\n",
    "\n",
    "    print('completed pass %i at alpha %f' % (epoch + 1, alpha))\n",
    "    alpha -= alpha_delta\n",
    "    \n",
    "print(\"END %s\" % str(datetime.datetime.now()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# print best error rates achieved\n",
    "for rate, name in sorted((rate, name) for name, rate in best_error.items()):\n",
    "    print(\"%f %s\" % (rate, name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc; re-run cell for more examples\n",
    "print('for doc %d...' % doc_id)\n",
    "for model in simple_models:\n",
    "    inferred_docvec = model.infer_vector(alldocs[doc_id].words)\n",
    "    print('%s:\\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc, re-run cell for more examples\n",
    "model = random.choice(simple_models)  # and a random model\n",
    "sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents\n",
    "print(u'TARGET (%d): «%s»\\n' % (doc_id, ' '.join(alldocs[doc_id].words)))\n",
    "print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\\n' % model)\n",
    "for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:\n",
    "    print(u'%s %s: «%s»\\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "word_models = simple_models[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "import random\n",
    "from IPython.display import HTML\n",
    "# pick a random word with a suitable number of occurences\n",
    "while True:\n",
    "    word = random.choice(word_models[0].index2word)\n",
    "    if word_models[0].vocab[word].count > 10:\n",
    "        break\n",
    "# or uncomment below line, to just pick a word from the relevant domain:\n",
    "#word = 'comedy/drama'\n",
    "similars_per_model = [str(model.most_similar(word, topn=20)).replace('), ','),<br>\\n') for model in word_models]\n",
    "similar_table = (\"<table><tr><th>\" +\n",
    "    \"</th><th>\".join([str(model) for model in word_models]) + \n",
    "    \"</th></tr><tr><td>\" +\n",
    "    \"</td><td>\".join(similars_per_model) +\n",
    "    \"</td></tr></table>\")\n",
    "print(\"most similar words for '%s' (%d occurences)\" % (word, simple_models[0].vocab[word].count))\n",
    "HTML(similar_table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# assuming something like\n",
    "# https://word2vec.googlecode.com/svn/trunk/questions-words.txt \n",
    "# is in local directory\n",
    "# note: this takes many minutes\n",
    "for model in word_models:\n",
    "    sections = model.accuracy('questions-words.txt')\n",
    "    correct, incorrect = len(sections[-1]['correct']), len(sections[-1]['incorrect'])\n",
    "    print('%s: %0.2f%% correct (%d of %d)' % (model, float(correct*100)/(correct+incorrect), correct, correct+incorrect))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}