{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## pyword2vec single thread experiment" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "from collections import Counter\n", "import mmap\n", "import re\n", "import networkx as nx\n", "import math" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "## EXPonetial memoization table\n", "\n", "MAX_EXP = 6\n", "EXP_TABLE_SIZE = 1000\n", "EXP_TABLE = np.arange(start = 0, stop = EXP_TABLE_SIZE, \n", " step = 1, dtype = np.float64)\n", "EXP_TABLE = np.exp((EXP_TABLE / EXP_TABLE_SIZE * 2. - 1.) * MAX_EXP)\n", "EXP_TABLE = EXP_TABLE / (EXP_TABLE + 1.)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "## HELPER FUNCTION\n", "def file_to_wordstream(file_path):\n", " with open(file_path) as fin:\n", " mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)\n", " for match in re.finditer(r'(.*?)\\s', mf):\n", " word = match.group(1)\n", " if word:\n", " yield word\n", " \n", "def inspect_vocab_tree(vocab):\n", " g = nx.DiGraph()\n", " vocab_size = len(vocab)\n", " edges = set()\n", " for vw in vocab:\n", " tree_path = [i + vocab_size for i in vw['path']]\n", " tree_path = [str(i) if i >= vocab_size \n", " else \"%d_%s(%d)\" % (i, vocab[i]['word'], vocab[i]['count']) \n", " for i in tree_path]\n", " edges.update(zip(tree_path[:-1], tree_path[1:]))\n", " g.add_edges_from(edges)\n", " figure(figsize=(16, 16))\n", " pos = nx.graphviz_layout(g, prog='dot')\n", " nx.draw(g, pos, with_labels=True, arrows = True, node_size=3000, font_size = 30)\n", " return g" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "## HashedVocab\n", "\n", "class HashedVocab(object):\n", " def __init__(self):\n", " self.HASH_SIZE = 30000000\n", " self.MIN_COUNT = 5\n", " self.vocab = []\n", " self.vocab_hash = np.empty(self.HASH_SIZE, dtype = np.int)\n", " self.vocab_hash.fill(-1)\n", " def fit(self, word_stream):\n", " counter = Counter(word_stream)\n", " self.vocab = map(lambda x: dict(zip(['word', 'count'], x)), \n", " filter(lambda x: x[1] > self.MIN_COUNT, \n", " counter.most_common(len(counter))))\n", " if len(self.vocab) > self.HASH_SIZE * 0.7:\n", " raise RuntimeError('Vocab size too large, increase MIN_COUNT or increase HASH_SIZE')\n", " self.build_hash()\n", " self.build_huffman_tree()\n", " def search_for(self, word):\n", " word_hash = self.get_word_hash(word)\n", " while True:\n", " word_index = self.vocab_hash[word_hash]\n", " if word_index == -1:\n", " return - 1\n", " elif word == self.vocab[word_index]['word']:\n", " return word_index\n", " else:\n", " word_hash = (word_hash + 1) % self.HASH_SIZE\n", " def __getitem__(self, word_index):\n", " return self.vocab[word_index]\n", " def __len__(self):\n", " return len(self.vocab)\n", " \n", " def build_hash(self):\n", " self.vocab_hash = np.empty(self.HASH_SIZE, dtype = np.int)\n", " self.vocab_hash.fill(-1)\n", " for word_index, vocab_word in enumerate(self.vocab):\n", " word = vocab_word['word']\n", " word_hash = self.get_word_hash(word)\n", " self.add_to_hash(word_hash, word_index)\n", " def get_word_hash(self, word):\n", " ## TOO SLOW\n", " #word_hash = sum([ord(c)*(257**i) \n", " # for i, c in zip(range(len(word))[::-1], word)])\n", " 
word_hash = 0\n", " for c in word:\n", " word_hash = word_hash * 257 + ord(c)\n", " word_hash %= self.HASH_SIZE\n", " return word_hash\n", " def add_to_hash(self, word_hash, word_index):\n", " while self.vocab_hash[word_hash] != -1:\n", " word_hash = (word_hash + 1) % self.HASH_SIZE\n", " self.vocab_hash[word_hash] = word_index\n", " def build_huffman_tree(self):\n", " \"\"\"\n", " build binary Huffman Tree by word counts,\n", " the structure will be embedded in the vocab_word\n", " dict(word, count, path, code) in the vocab\n", " \n", " vocab_word['code'] will be the binary representation of word\n", " based on frequency\n", " vocab_word['path'] will be the path from root to leaf \n", " \"\"\"\n", " ## for arbitary full binary, n-1 internal inodes at max \n", " ## given n leaves. But in the original C code, the count\n", " ## binary and parent_node size are n*2+1 instead of n*2-1\n", " vocab_size = len(self.vocab)\n", " print \"DEBUG:\", vocab_size\n", " ## workhorse data structure for tree construction\n", " count = np.empty(vocab_size*2-1, dtype = np.int64)\n", " count.fill(1e15)\n", " count[:vocab_size] = [vw['count'] for vw in self.vocab]\n", " ## boolean values of each node\n", " binary = np.zeros(vocab_size*2-1, dtype = np.int8)\n", " ## parent node to store the path\n", " parent_node = np.empty(vocab_size*2-1, dtype = np.int64)\n", " ## construct the tree\n", " pos1, pos2 = vocab_size-1, vocab_size\n", " for a in xrange(vocab_size-1):\n", " ## min1i\n", " if pos1 >= 0:\n", " if count[pos1] < count[pos2]:\n", " min1i, pos1 = pos1, pos1-1\n", " else:\n", " min1i, pos2 = pos2, pos2+1\n", " else:\n", " min1i, pos2 = pos2, pos2+1\n", " ## min2i\n", " if pos1 >= 0:\n", " if count[pos1] < count[pos2]:\n", " min2i, pos1 = pos1, pos1-1\n", " else:\n", " min2i, pos2 = pos2, pos2+1\n", " else:\n", " min2i, pos2 = pos2, pos2+1\n", " count[vocab_size + a] = count[min1i] + count[min2i]\n", " parent_node[min1i] = vocab_size + a\n", " parent_node[min2i] = vocab_size + a\n", " binary[min2i] = 1\n", " for a in xrange(vocab_size):\n", " b, i = a, 0\n", " code, path = [], []\n", " while True:\n", " code.append(binary[b])\n", " path.append(b)\n", " i += 1\n", " b = parent_node[b]\n", " if b == vocab_size * 2 - 2: break\n", " self.vocab[a]['path'] = [vocab_size - 2] + [p -vocab_size for p in path[::-1]]\n", " self.vocab[a]['code'] = code[::-1]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "## Unigram Table\n", "class UnigramTable(object):\n", " def __init__(self, hashed_vocab):\n", " self.TABLE_SIZE = int(1e8)\n", " self.table = np.empty(self.TABLE_SIZE, np.int64)\n", " self.power = 0.75\n", " vocab = hashed_vocab.vocab\n", " vocab_size = len(vocab)\n", " train_words_pow = sum(math.pow(w['count'], self.power) \n", " for w in vocab)\n", " i = 0\n", " d1 = math.pow(vocab[i]['count'], self.power) / train_words_pow\n", " for a in xrange(self.TABLE_SIZE):\n", " self.table[a] = i\n", " if a * 1. 
/ self.TABLE_SIZE > d1:\n", " i += 1\n", " d1 += math.pow(vocab[i]['count'], self.power) / train_words_pow\n", " if i >= vocab_size:\n", " i = vocab_size - 1\n", " def __getitem__(self, index):\n", " return self.table[index]\n", " \n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "## Learning Models\n", "class Word2Vec(object):\n", " \"\"\"\n", " Net Architecture: cbow / skip_gram\n", " Learning: hs / negative_sampling\n", " \"\"\"\n", " def __init__(self, hashed_vocab, unigram_table, layer1_size,\n", " net_type, learn_hs, learn_negative):\n", " \"\"\"\n", " hashed_vocab: vocab to build on\n", " layer1_size: the dimensionality of feature space\n", " net_type: {'cbow', 'skip_gram'}\n", " learn_hs : hierarchical softmax\n", " learn_negative: negative sampling\n", " \"\"\"\n", " self.vocab = hashed_vocab\n", " self.table = unigram_table\n", " self.layer1_size = layer1_size\n", " self.net_type = net_type\n", " self.learn_hs = learn_hs\n", " self.learn_negative = learn_negative\n", " \n", " self.starting_alpha = 0.025\n", " self.sentence_len = 1000\n", " self.sampling = 1e-4\n", " self.window = 5 # max skip length between words\n", " self.negative = 5 # 5 - 10\n", " \n", " self.REAL_TYPE = np.float64\n", " self.syn0 = np.array([], dtype = self.REAL_TYPE)\n", " self.syn1 = np.array([], dtype = self.REAL_TYPE)\n", " self.syn1neg = np.array([], dtype = self.REAL_TYPE)\n", " \n", " def init_net(self):\n", " \"\"\"\n", " syn0 - len(vocab) * layer1_size\n", " syn1 - len(vocab) * layer1_size\n", " syn1neg - len(vocab) * layer1_size\n", " \"\"\"\n", " vocab_size = len(self.vocab)\n", " self.syn0 = np.random.uniform(low = -.5 / self.layer1_size, \n", " high = .5 / self.layer1_size, \n", " size = (vocab_size, self.layer1_size)).astype(self.REAL_TYPE)\n", " if self.learn_hs: \n", " self.syn1 = np.zeros((vocab_size, self.layer1_size), dtype = self.REAL_TYPE)\n", " if self.learn_negative:\n", " self.syn1neg = np.zeros((vocab_size, self.layer1_size), dtype = self.REAL_TYPE)\n", " def fit(self, words):\n", " \"\"\"\n", " word_stream: stream of words\n", " ntotal: total number of words in word_stream\n", " \"\"\"\n", " self.init_net()\n", " \n", " ntotal = len(words)\n", " \n", " alpha = self.starting_alpha\n", " next_random = 0\n", " \n", " nprocessed = 0\n", " while nprocessed < ntotal:\n", " ## adjust learning rate alpha\n", " alpha = max(self.starting_alpha * (1 - nprocessed / (ntotal + 1.)), \n", " self.starting_alpha * 0.0001)\n", " ## refill the sentence\n", " sentence_index = []\n", " while nprocessed < ntotal and len(sentence_index) < self.sentence_len:\n", " ## sampling down the infrequent words\n", " if nprocessed % 10000 == 0: \n", " print 'progress:', nprocessed * 100. 
/ ntotal, '%'\n", " word = words[nprocessed]\n", " word_index = self.vocab.search_for(word)\n", " word_count = self.vocab[word_index]['count']\n", " nprocessed += 1\n", " if word_index == -1: continue\n", " if self.sampling > 0:\n", " ran = ( (math.sqrt(word_count / (self.sampling * ntotal)) + 1) \n", " * (self.sampling * ntotal) / word_count);\n", " next_random = next_random * 25214903917 + 11\n", " if ran < (next_random & 0xFFFF) / 65536.: continue\n", " sentence_index.append(word_index)\n", " ## for each word in sentence\n", " ## pivot is the current word index in sentence_index\n", " for ipivot, pivot in enumerate(sentence_index):\n", " next_random = next_random * 25214903917 + 11\n", " b = next_random % self.window\n", " neu1 = np.zeros(self.layer1_size, dtype = self.REAL_TYPE)\n", " neu1e = np.zeros(self.layer1_size, dtype = self.REAL_TYPE)\n", " if self.net_type == 'cbow':\n", " ## accumulate neighbors into neu1\n", " ## neighbors [ipivot-(window-b), ipivot+window-b]\n", " left = max(0, ipivot-(self.window-b))\n", " right = min(self.sentence_len-1, ipivot+self.window-b)\n", " ## NEU1 -- ACCUMULATION IN SENTENCE\n", " ## IN -> HIDDEN\n", " neu1 = np.sum(self.syn0[left:right+1, :], axis = 0)\n", " if self.learn_hs:\n", " ## parent node in Huffman tree\n", " ## the last one is root\n", " for parent_index, parent_code in zip(self.vocab[pivot]['path'][:-1], \n", " self.vocab[pivot]['code'][:-1]):\n", " ## F IS COS-DIFF OF NEIGHBORS'SYN0 AND PARENTS (class)' SYN1\n", " f = np.dot(neu1, self.syn1[parent_index, :])\n", " if f <= -MAX_EXP or f >= MAX_EXP:\n", " continue\n", " else:\n", " f = EXP_TABLE[int((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]\n", " g = (1 - parent_code - f) * alpha\n", " neu1e += g * self.syn1[parent_index]\n", " self.syn1[parent_index] += g * neu1\n", " if self.learn_negative:\n", " for d in xrange(self.negative + 1):\n", " if d == 0:\n", " target = pivot\n", " label = 1\n", " else:\n", " next_random = next_random * 25214903917 + 11\n", " target = self.table[(next_random >> 16) % self.table.TABLE_SIZE]\n", " if (target == pivot): continue\n", " label = 0\n", " f = np.dot(neu1, self.syn1neg[target, :])\n", " if f > MAX_EXP: \n", " g = (label - 1) * alpha\n", " elif f < -MAX_EXP: \n", " g = (label - 0) * alpha\n", " else: \n", " g = alpha * (label - EXP_TABLE[int((f + MAX_EXP) \n", " * (EXP_TABLE_SIZE / MAX_EXP / 2))])\n", " neu1e += g * self.syn1neg[target]\n", " self.syn1neg[target] += g * neu1\n", " ## HIDDEN -> IN\n", " self.syn0[left:right+1, :] += neu1e\n", " elif self.net_type == 'skip_gram':\n", " pass" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 }, { "cell_type": "markdown", "metadata": {}, "source": [ "##TESTING" ] }, { "cell_type": "code", "collapsed": false, "input": [ "## TEST HashedVocab\n", "hashed_vocab = HashedVocab()\n", "#hashed_vocab.fit(file_to_wordstream('data/text_simple'))\n", "hashed_vocab.fit(file_to_wordstream('data/simple.txt'))\n", "unigram_table = UnigramTable(hashed_vocab)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "DEBUG: 1179\n" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "for i, vw in enumerate(hashed_vocab.vocab):\n", " word = vw['word']\n", " assert i == hashed_vocab.search_for(word)\n", " \n", "print hashed_vocab.search_for('alien')\n", "print len(hashed_vocab.vocab)\n", "#inspect_vocab_tree(hashed_vocab.vocab)\n", "print hashed_vocab[10]['word']\n", "for n in 
hashed_vocab[10]['path'][:-1]:\n", " print hashed_vocab[n]['word']" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "-1\n", "1179\n", "as\n", "respect\n", "practice\n", "issued\n", "texts\n", "outside\n", "private\n" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "train_words = list(file_to_wordstream('data/simple.txt'))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "word2vec = Word2Vec(hashed_vocab, unigram_table, 100, 'cbow', True, True)\n", "word2vec.fit(train_words)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "progress: 0.0 %\n", "progress:" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 20.000400008 %\n", "progress:" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 40.000800016 %\n", "progress:" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 60.001200024 %\n", "progress:" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 80.001600032 %\n" ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }