# %% [markdown]
# # 3. GloVe: Global Vectors for Word Representation
#
# I recommend you take a look at these materials first.
#
# * http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture3.pdf
# * https://nlp.stanford.edu/pubs/glove.pdf

# %%
import random
from collections import Counter

import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


def flatten(l):
    """Flatten one level of nesting: a list of lists becomes a single list."""
    return [item for sublist in l for item in sublist]


SEED = 1024
random.seed(SEED)
# Seed torch as well; otherwise the embedding initialisation differs on every run.
torch.manual_seed(SEED)

# %%
print(torch.__version__)
print(nltk.__version__)

# %%
USE_CUDA = torch.cuda.is_available()
gpus = [0]
if USE_CUDA:
    # Only select a device when CUDA exists; calling this on a CPU-only host fails.
    torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor


# %%
def getBatch(batch_size, train_data):
    """Shuffle ``train_data`` in place and yield it as consecutive mini-batches.

    Args:
        batch_size: number of examples per batch; must be >= 1.
        train_data: list of training examples. It is shuffled in place.

    Yields:
        Slices of ``train_data`` holding ``batch_size`` examples each; the last
        slice may be shorter. Nothing is yielded when ``train_data`` is empty.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised when iteration
            starts, because this is a generator).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))
    random.shuffle(train_data)
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start + batch_size]


# %%
# The empty string is used both as the window padding marker and as the
# fallback key for out-of-vocabulary words.
# NOTE(review): consider two distinct, non-empty markers (e.g. "<DUMMY>" and
# "<UNK>") if real tokens could ever be empty strings -- confirm intent.
PAD_TOKEN = ""
UNK_TOKEN = ""


def _word_index(word, word2index, unk_token):
    """Return the index of ``word``, falling back to the index of ``unk_token``."""
    index = word2index.get(word)
    if index is None:
        index = word2index.get(unk_token)
    if index is None:
        raise KeyError(
            "%r is not in the vocabulary and the fallback token %r is not registered"
            % (word, unk_token)
        )
    return index


def prepare_sequence(seq, word2index, unk_token=UNK_TOKEN):
    """Convert a sequence of words into a 1-D ``LongTensor`` Variable of indices.

    Args:
        seq: iterable of words.
        word2index: mapping from word to integer index.
        unk_token: key looked up for words that are missing from ``word2index``.

    Raises:
        KeyError: if a word and ``unk_token`` are both missing from ``word2index``.
    """
    return Variable(LongTensor([_word_index(w, word2index, unk_token) for w in seq]))


def prepare_word(word, word2index, unk_token=UNK_TOKEN):
    """Convert a single word into a one-element ``LongTensor`` Variable.

    Unknown words map to the index of ``unk_token``; a ``KeyError`` is raised
    when that token is not registered in ``word2index`` either.
    """
    return Variable(LongTensor([_word_index(word, word2index, unk_token)]))


# %% [markdown]
# ## Data load and Preprocessing

# %%
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:500]
corpus = [[word.lower() for word in sent] for sent in corpus]

# %% [markdown]
# ### Build vocab

# %%
# Sort the vocabulary: the iteration order of a set of strings changes between
# interpreter runs, which would make the word indices irreproducible.
vocab = sorted(set(flatten(corpus)))

# %%
word2index = {word: index for index, word in enumerate(vocab)}
# Register the fallback token so prepare_word / prepare_sequence can resolve
# out-of-vocabulary words instead of failing on the fallback lookup itself.
word2index.setdefault(UNK_TOKEN, len(word2index))

index2word = {v: k for k, v in word2index.items()}

# %%
WINDOW_SIZE = 5
# Pad every sentence on both sides so each real token is the centre of one window.
windows = flatten([
    list(nltk.ngrams([PAD_TOKEN] * WINDOW_SIZE + c + [PAD_TOKEN] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1))
    for c in corpus
])

window_data = []  # (center word, context word) pairs

for window in windows:
    center = window[WINDOW_SIZE]
    for i, context in enumerate(window):
        if i == WINDOW_SIZE or context == PAD_TOKEN:
            continue
        window_data.append((center, context))
"markdown", "metadata": {}, "source": [ "\n", "
# %% [markdown]
# The weighting function is $f(x) = (x / x_{max})^{\alpha}$ for $x < x_{max}$ and
# $1$ otherwise, with $x_{max} = 100$ and $\alpha = 0.75$.
#
# borrowed image from https://nlp.stanford.edu/pubs/glove.pdf

# %%
def weighting(w_i, w_j, x_max=100, alpha=0.75):
    """Return the GloVe weight f(x_ij) for the word pair ``(w_i, w_j)``.

    The count x_ij is read from the module-level ``X_ik`` table; a pair that is
    missing from the table is treated as x_ij = 1.

    Args:
        w_i: first word of the pair.
        w_j: second word of the pair.
        x_max: count at which the weight saturates (100, "fixed in paper").
        alpha: exponent applied below ``x_max``.

    Returns:
        ``(x_ij / x_max) ** alpha`` when ``x_ij < x_max``, otherwise ``1``.
    """
    x_ij = X_ik.get((w_i, w_j), 1)
    if x_ij < x_max:
        return (x_ij / x_max) ** alpha
    return 1


# %% [markdown]
# ### Build Co-occurrence Matrix X
#
# Because of model complexity, it is important to determine whether a tighter
# bound can be placed on the number of nonzero elements of X.

# %%
X_i = Counter(flatten(corpus))  # X_i

# %%
X_ik_window_5 = Counter(window_data)  # Co-occurrence in window size 5

# %%
X_ik = {}
weighting_dic = {}

# %%
# No longer needed by the loop below; kept so that code relying on the name
# being imported keeps working.
from itertools import combinations_with_replacement

# %%
# Visit only the nonzero elements of X instead of every vocabulary pair: the
# tables then grow with the number of observed pairs rather than with |V|^2.
# weighting_dic consequently holds observed pairs only; for any other pair,
# weighting() still returns the default weight on demand.
for (w_i, w_k), co_occer in X_ik_window_5.items():
    smoothed = co_occer + 1  # log(Xik) -> log(Xik+1) to prevent divergence
    X_ik[(w_i, w_k)] = smoothed
    # Mirror the entry so that the table is symmetric in both directions.
    X_ik.setdefault((w_k, w_i), smoothed)

for pair in X_ik:
    weighting_dic[pair] = weighting(pair[0], pair[1])
# %%
# Sanity check: the co-occurrence table should be symmetric for a random pair.
test = random.choice(window_data)
print(test)
mirrored = (test[1], test[0])
# Explicit membership test instead of a bare except: only a missing pair is
# skipped, any other error is allowed to surface.
if test in X_ik and mirrored in X_ik:
    print(X_ik[test] == X_ik[mirrored])

# %% [markdown]
# ### Prepare train data

# %%
u_p = []       # center word index
v_p = []       # context word index
co_p = []      # log(x_ij)
weight_p = []  # f(x_ij)

for pair in window_data:
    u_p.append(prepare_word(pair[0], word2index).view(1, -1))
    v_p.append(prepare_word(pair[1], word2index).view(1, -1))

    # Pairs missing from the table count as 1, so their log co-occurrence is 0.
    cooc = X_ik.get(pair, 1)

    co_p.append(torch.log(Variable(FloatTensor([cooc]))).view(1, -1))
    weight_p.append(Variable(FloatTensor([weighting_dic[pair]])).view(1, -1))

# Every element is reshaped to 1 x 1 so that a batch can be built with torch.cat.
train_data = list(zip(u_p, v_p, co_p, weight_p))
del u_p, v_p, co_p, weight_p
print(train_data[0])  # tuple (center index i, context index j, log(x_ij), weight f(x_ij))
# %% [markdown]
# The model minimises
# $J = \sum_{i,j} f(X_{ij}) (w_i^T \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij})^2$.
#
# borrowed image from https://nlp.stanford.edu/pubs/glove.pdf

# %%
class GloVe(nn.Module):
    """GloVe model: two embedding tables plus one scalar bias per word in each table.

    ``forward`` returns the weighted squared error, summed over the batch,
    between (dot product + both biases) and the supplied co-occurrence targets.
    """

    def __init__(self, vocab_size, projection_dim):
        """Create the embedding and bias tables and initialise them uniformly.

        Args:
            vocab_size: number of rows in every table.
            projection_dim: embedding dimensionality.
        """
        super().__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)  # center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)  # out embedding

        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)

        # Xavier-style range: sqrt(2 / (vocab_size + projection_dim)).
        bound = (2.0 / (vocab_size + projection_dim)) ** 0.5
        # Same order as the layers were created, so the random stream is consumed identically.
        for table in (self.embedding_v, self.embedding_u, self.v_bias, self.u_bias):
            table.weight.data.uniform_(-bound, bound)

    def forward(self, center_words, target_words, coocs, weights):
        """Return the summed weighted squared error for a batch.

        Args:
            center_words: B x 1 tensor of center word indices.
            target_words: B x 1 tensor of context word indices.
            coocs: B x 1 tensor of regression targets (log co-occurrence counts
                in this notebook).
            weights: B x 1 tensor of per-pair weights f(x_ij).
        """
        center_vecs = self.embedding_v(center_words)  # B x 1 x D
        target_vecs = self.embedding_u(target_words)  # B x 1 x D

        bias_c = self.v_bias(center_words).squeeze(1)  # B x 1
        bias_t = self.u_bias(target_words).squeeze(1)  # B x 1

        # Batched dot product between each context vector and its center vector.
        dot = torch.bmm(target_vecs, center_vecs.transpose(1, 2)).squeeze(2)  # B x 1

        residual = dot + bias_c + bias_t - coocs
        return (weights * residual.pow(2)).sum()

    def prediction(self, inputs):
        """Return the final embedding of ``inputs``: the sum of both tables' vectors."""
        return self.embedding_v(inputs) + self.embedding_u(inputs)


# %% [markdown]
# ## Train

# %%
EMBEDDING_SIZE = 50  # embedding dimensionality
BATCH_SIZE = 256     # examples per mini-batch
EPOCH = 50           # number of passes over the training data
# %%
losses = []
model = GloVe(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# %%
for epoch in range(EPOCH):
    for batch in getBatch(BATCH_SIZE, train_data):
        if not batch:
            # Nothing to unpack; skip an empty batch instead of failing in zip().
            continue

        inputs, targets, coocs, weights = zip(*batch)

        inputs = torch.cat(inputs)    # B x 1
        targets = torch.cat(targets)  # B x 1
        coocs = torch.cat(coocs)
        weights = torch.cat(weights)
        model.zero_grad()

        loss = model(inputs, targets, coocs, weights)

        loss.backward()
        optimizer.step()

        # view(-1) makes this work whether the summed loss is a one-element
        # tensor (PyTorch 0.3) or a 0-dim tensor (PyTorch >= 0.4).
        losses.append(loss.data.view(-1).tolist()[0])
    if epoch % 10 == 0:
        # `losses` is cleared only here, so the printed mean covers every batch
        # since the previous report, not just the current epoch.
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
        losses = []


# %% [markdown]
# ## Test

# %%
def word_similarity(target, vocab, top_k=10):
    """Rank the words of ``vocab`` by cosine similarity to ``target``.

    Uses the module-level ``model`` and ``word2index``. The ``target`` word
    itself is excluded from the ranking.

    Args:
        target: query word.
        vocab: iterable of candidate words.
        top_k: number of most similar words to return.

    Returns:
        A list of ``[word, cosine_similarity]`` pairs, most similar first,
        holding at most ``top_k`` entries.
    """
    target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for word in vocab:
        if word == target:
            continue

        vector = model.prediction(prepare_word(word, word2index))
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0]
        similarities.append([word, cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:top_k]
# %%
# Pick a random vocabulary word to query; the bare name displays it as the cell output.
candidates = list(vocab)
test = random.choice(candidates)
test

# %%
# Ten nearest neighbours of the sampled word by cosine similarity.
word_similarity(test, vocab)

# %% [markdown]
# ## TODO
#
# * Use sparse-matrix to build co-occurrence matrix for memory efficiency

# %% [markdown]
# ## Suggested Readings
#
# * Word embeddings in 2017: Trends and future directions