{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## pyword2vec single thread experiment" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "from collections import Counter\n", "import mmap\n", "import re\n", "import networkx as nx\n", "import math" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "## EXPonetial memoization table\n", "\n", "MAX_EXP = 6\n", "EXP_TABLE_SIZE = 1000\n", "EXP_TABLE = np.arange(start = 0, stop = EXP_TABLE_SIZE, \n", " step = 1, dtype = np.float64)\n", "EXP_TABLE = np.exp((EXP_TABLE / EXP_TABLE_SIZE * 2. - 1.) * MAX_EXP)\n", "EXP_TABLE = EXP_TABLE / (EXP_TABLE + 1.)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "## HELPER FUNCTION\n", "def file_to_wordstream(file_path):\n", " with open(file_path) as fin:\n", " mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)\n", " for match in re.finditer(r'(.*?)\\s', mf):\n", " word = match.group(1)\n", " if word:\n", " yield word\n", " \n", "def inspect_vocab_tree(vocab):\n", " g = nx.DiGraph()\n", " vocab_size = len(vocab)\n", " edges = set()\n", " for vw in vocab:\n", " tree_path = [i + vocab_size for i in vw['path']]\n", " tree_path = [str(i) if i >= vocab_size \n", " else \"%d_%s(%d)\" % (i, vocab[i]['word'], vocab[i]['count']) \n", " for i in tree_path]\n", " edges.update(zip(tree_path[:-1], tree_path[1:]))\n", " g.add_edges_from(edges)\n", " figure(figsize=(16, 16))\n", " pos = nx.graphviz_layout(g, prog='dot')\n", " nx.draw(g, pos, with_labels=True, arrows = True, node_size=3000, font_size = 30)\n", " return g" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "## HashedVocab\n", "\n", "class HashedVocab(object):\n", " def __init__(self):\n", " self.HASH_SIZE = 30000000\n", " self.MIN_COUNT = 5\n", " self.vocab = []\n", " self.vocab_hash = np.empty(self.HASH_SIZE, dtype = np.int)\n", " self.vocab_hash.fill(-1)\n", " def fit(self, word_stream):\n", " counter = Counter(word_stream)\n", " self.vocab = map(lambda x: dict(zip(['word', 'count'], x)), \n", " filter(lambda x: x[1] > self.MIN_COUNT, \n", " counter.most_common(len(counter))))\n", " if len(self.vocab) > self.HASH_SIZE * 0.7:\n", " raise RuntimeError('Vocab size too large, increase MIN_COUNT or increase HASH_SIZE')\n", " self.build_hash()\n", " self.build_huffman_tree()\n", " def search_for(self, word):\n", " word_hash = self.get_word_hash(word)\n", " while True:\n", " word_index = self.vocab_hash[word_hash]\n", " if word_index == -1:\n", " return - 1\n", " elif word == self.vocab[word_index]['word']:\n", " return word_index\n", " else:\n", " word_hash = (word_hash + 1) % self.HASH_SIZE\n", " def __getitem__(self, word_index):\n", " return self.vocab[word_index]\n", " def __len__(self):\n", " return len(self.vocab)\n", " \n", " def build_hash(self):\n", " self.vocab_hash = np.empty(self.HASH_SIZE, dtype = np.int)\n", " self.vocab_hash.fill(-1)\n", " for word_index, vocab_word in enumerate(self.vocab):\n", " word = vocab_word['word']\n", " word_hash = self.get_word_hash(word)\n", " self.add_to_hash(word_hash, word_index)\n", " def get_word_hash(self, word):\n", " ## TOO SLOW\n", " #word_hash = sum([ord(c)*(257**i) \n", " # for i, c in zip(range(len(word))[::-1], word)])\n", " 
word_hash = 0\n", " for c in word:\n", " word_hash = word_hash * 257 + ord(c)\n", " word_hash %= self.HASH_SIZE\n", " return word_hash\n", " def add_to_hash(self, word_hash, word_index):\n", " while self.vocab_hash[word_hash] != -1:\n", " word_hash = (word_hash + 1) % self.HASH_SIZE\n", " self.vocab_hash[word_hash] = word_index\n", " def build_huffman_tree(self):\n", " \"\"\"\n", " build binary Huffman Tree by word counts,\n", " the structure will be embedded in the vocab_word\n", " dict(word, count, path, code) in the vocab\n", " \n", " vocab_word['code'] will be the binary representation of word\n", " based on frequency\n", " vocab_word['path'] will be the path from root to leaf \n", " \"\"\"\n", " ## for arbitary full binary, n-1 internal inodes at max \n", " ## given n leaves. But in the original C code, the count\n", " ## binary and parent_node size are n*2+1 instead of n*2-1\n", " vocab_size = len(self.vocab)\n", " print \"DEBUG:\", vocab_size\n", " ## workhorse data structure for tree construction\n", " count = np.empty(vocab_size*2-1, dtype = np.int64)\n", " count.fill(1e15)\n", " count[:vocab_size] = [vw['count'] for vw in self.vocab]\n", " ## boolean values of each node\n", " binary = np.zeros(vocab_size*2-1, dtype = np.int8)\n", " ## parent node to store the path\n", " parent_node = np.empty(vocab_size*2-1, dtype = np.int64)\n", " ## construct the tree\n", " pos1, pos2 = vocab_size-1, vocab_size\n", " for a in xrange(vocab_size-1):\n", " ## min1i\n", " if pos1 >= 0:\n", " if count[pos1] < count[pos2]:\n", " min1i, pos1 = pos1, pos1-1\n", " else:\n", " min1i, pos2 = pos2, pos2+1\n", " else:\n", " min1i, pos2 = pos2, pos2+1\n", " ## min2i\n", " if pos1 >= 0:\n", " if count[pos1] < count[pos2]:\n", " min2i, pos1 = pos1, pos1-1\n", " else:\n", " min2i, pos2 = pos2, pos2+1\n", " else:\n", " min2i, pos2 = pos2, pos2+1\n", " count[vocab_size + a] = count[min1i] + count[min2i]\n", " parent_node[min1i] = vocab_size + a\n", " parent_node[min2i] = vocab_size + a\n", " binary[min2i] = 1\n", " for a in xrange(vocab_size):\n", " b, i = a, 0\n", " code, path = [], []\n", " while True:\n", " code.append(binary[b])\n", " path.append(b)\n", " i += 1\n", " b = parent_node[b]\n", " if b == vocab_size * 2 - 2: break\n", " self.vocab[a]['path'] = [vocab_size - 2] + [p -vocab_size for p in path[::-1]]\n", " self.vocab[a]['code'] = code[::-1]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "## Unigram Table\n", "class UnigramTable(object):\n", " def __init__(self, hashed_vocab):\n", " self.TABLE_SIZE = int(1e8)\n", " self.table = np.empty(self.TABLE_SIZE, np.int64)\n", " self.power = 0.75\n", " vocab = hashed_vocab.vocab\n", " vocab_size = len(vocab)\n", " train_words_pow = sum(math.pow(w['count'], self.power) \n", " for w in vocab)\n", " i = 0\n", " d1 = math.pow(vocab[i]['count'], self.power) / train_words_pow\n", " for a in xrange(self.TABLE_SIZE):\n", " self.table[a] = i\n", " if a * 1. 
/ self.TABLE_SIZE > d1:\n", " i += 1\n", " d1 += math.pow(vocab[i]['count'], self.power) / train_words_pow\n", " if i >= vocab_size:\n", " i = vocab_size - 1\n", " def __getitem__(self, index):\n", " return self.table[index]\n", " \n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "## Learning Models\n", "class Word2Vec(object):\n", " \"\"\"\n", " Net Architecture: cbow / skip_gram\n", " Learning: hs / negative_sampling\n", " \"\"\"\n", " def __init__(self, hashed_vocab, unigram_table, layer1_size,\n", " net_type, learn_hs, learn_negative):\n", " \"\"\"\n", " hashed_vocab: vocab to build on\n", " layer1_size: the dimensionality of feature space\n", " net_type: {'cbow', 'skip_gram'}\n", " learn_hs : hierarchical softmax\n", " learn_negative: negative sampling\n", " \"\"\"\n", " self.vocab = hashed_vocab\n", " self.table = unigram_table\n", " self.layer1_size = layer1_size\n", " self.net_type = net_type\n", " self.learn_hs = learn_hs\n", " self.learn_negative = learn_negative\n", " \n", " self.starting_alpha = 0.025\n", " self.sentence_len = 1000\n", " self.sampling = 1e-4\n", " self.window = 5 # max skip length between words\n", " self.negative = 5 # 5 - 10\n", " \n", " self.REAL_TYPE = np.float64\n", " self.syn0 = np.array([], dtype = self.REAL_TYPE)\n", " self.syn1 = np.array([], dtype = self.REAL_TYPE)\n", " self.syn1neg = np.array([], dtype = self.REAL_TYPE)\n", " \n", " def init_net(self):\n", " \"\"\"\n", " syn0 - len(vocab) * layer1_size\n", " syn1 - len(vocab) * layer1_size\n", " syn1neg - len(vocab) * layer1_size\n", " \"\"\"\n", " vocab_size = len(self.vocab)\n", " self.syn0 = np.random.uniform(low = -.5 / self.layer1_size, \n", " high = .5 / self.layer1_size, \n", " size = (vocab_size, self.layer1_size)).astype(self.REAL_TYPE)\n", " if self.learn_hs: \n", " self.syn1 = np.zeros((vocab_size, self.layer1_size), dtype = self.REAL_TYPE)\n", " if self.learn_negative:\n", " self.syn1neg = np.zeros((vocab_size, self.layer1_size), dtype = self.REAL_TYPE)\n", " def fit(self, words):\n", " \"\"\"\n", " word_stream: stream of words\n", " ntotal: total number of words in word_stream\n", " \"\"\"\n", " self.init_net()\n", " \n", " ntotal = len(words)\n", " \n", " alpha = self.starting_alpha\n", " next_random = 0\n", " \n", " nprocessed = 0\n", " while nprocessed < ntotal:\n", " ## adjust learning rate alpha\n", " alpha = max(self.starting_alpha * (1 - nprocessed / (ntotal + 1.)), \n", " self.starting_alpha * 0.0001)\n", " ## refill the sentence\n", " sentence_index = []\n", " while nprocessed < ntotal and len(sentence_index) < self.sentence_len:\n", " ## sampling down the infrequent words\n", " if nprocessed % 10000 == 0: \n", " print 'progress:', nprocessed * 100. 
/ ntotal, '%'\n", " word = words[nprocessed]\n", " word_index = self.vocab.search_for(word)\n", " word_count = self.vocab[word_index]['count']\n", " nprocessed += 1\n", " if word_index == -1: continue\n", " if self.sampling > 0:\n", " ran = ( (math.sqrt(word_count / (self.sampling * ntotal)) + 1) \n", " * (self.sampling * ntotal) / word_count);\n", " next_random = next_random * 25214903917 + 11\n", " if ran < (next_random & 0xFFFF) / 65536.: continue\n", " sentence_index.append(word_index)\n", " ## for each word in sentence\n", " ## pivot is the current word index in sentence_index\n", " for ipivot, pivot in enumerate(sentence_index):\n", " next_random = next_random * 25214903917 + 11\n", " b = next_random % self.window\n", " neu1 = np.zeros(self.layer1_size, dtype = self.REAL_TYPE)\n", " neu1e = np.zeros(self.layer1_size, dtype = self.REAL_TYPE)\n", " if self.net_type == 'cbow':\n", " ## accumulate neighbors into neu1\n", " ## neighbors [ipivot-(window-b), ipivot+window-b]\n", " left = max(0, ipivot-(self.window-b))\n", " right = min(self.sentence_len-1, ipivot+self.window-b)\n", " ## NEU1 -- ACCUMULATION IN SENTENCE\n", " ## IN -> HIDDEN\n", " neu1 = np.sum(self.syn0[left:right+1, :], axis = 0)\n", " if self.learn_hs:\n", " ## parent node in Huffman tree\n", " ## the last one is root\n", " for parent_index, parent_code in zip(self.vocab[pivot]['path'][:-1], \n", " self.vocab[pivot]['code'][:-1]):\n", " ## F IS COS-DIFF OF NEIGHBORS'SYN0 AND PARENTS (class)' SYN1\n", " f = np.dot(neu1, self.syn1[parent_index, :])\n", " if f <= -MAX_EXP or f >= MAX_EXP:\n", " continue\n", " else:\n", " f = EXP_TABLE[int((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]\n", " g = (1 - parent_code - f) * alpha\n", " neu1e += g * self.syn1[parent_index]\n", " self.syn1[parent_index] += g * neu1\n", " if self.learn_negative:\n", " for d in xrange(self.negative + 1):\n", " if d == 0:\n", " target = pivot\n", " label = 1\n", " else:\n", " next_random = next_random * 25214903917 + 11\n", " target = self.table[(next_random >> 16) % self.table.TABLE_SIZE]\n", " if (target == pivot): continue\n", " label = 0\n", " f = np.dot(neu1, self.syn1neg[target, :])\n", " if f > MAX_EXP: \n", " g = (label - 1) * alpha\n", " elif f < -MAX_EXP: \n", " g = (label - 0) * alpha\n", " else: \n", " g = alpha * (label - EXP_TABLE[int((f + MAX_EXP) \n", " * (EXP_TABLE_SIZE / MAX_EXP / 2))])\n", " neu1e += g * self.syn1neg[target]\n", " self.syn1neg[target] += g * neu1\n", " ## HIDDEN -> IN\n", " self.syn0[left:right+1, :] += neu1e\n", " elif self.net_type == 'skip_gram':\n", " pass" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 }, { "cell_type": "markdown", "metadata": {}, "source": [ "##TESTING" ] }, { "cell_type": "code", "collapsed": false, "input": [ "## TEST HashedVocab\n", "hashed_vocab = HashedVocab()\n", "#hashed_vocab.fit(file_to_wordstream('data/text_simple'))\n", "hashed_vocab.fit(file_to_wordstream('data/simple.txt'))\n", "unigram_table = UnigramTable(hashed_vocab)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "DEBUG: 1179\n" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "for i, vw in enumerate(hashed_vocab.vocab):\n", " word = vw['word']\n", " assert i == hashed_vocab.search_for(word)\n", " \n", "print hashed_vocab.search_for('alien')\n", "print len(hashed_vocab.vocab)\n", "#inspect_vocab_tree(hashed_vocab.vocab)\n", "print hashed_vocab[10]['word']\n", "for n in 
hashed_vocab[10]['path'][:-1]:\n", " print hashed_vocab[n]['word']" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "-1\n", "1179\n", "as\n", "respect\n", "practice\n", "issued\n", "texts\n", "outside\n", "private\n" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "train_words = list(file_to_wordstream('data/simple.txt'))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "word2vec = Word2Vec(hashed_vocab, unigram_table, 100, 'cbow', True, True)\n", "word2vec.fit(train_words)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "progress: 0.0 %\n", "progress:" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 20.000400008 %\n", "progress:" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 40.000800016 %\n", "progress:" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 60.001200024 %\n", "progress:" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 80.001600032 %\n" ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }