In [1]:
#!/usr/bin/env python
import import_ipynb

In [2]:
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Experiment setup: seed the RNGs, load the dataset and fix the hyperparameters.
# The statement order matters: each seed call must run before the code that
# follows it for results to be reproducible across runs.

# Seed Python's `random` module before the dataset object is constructed.
random.seed(314)
dataset = StanfordSentiment()
# `tokens` is indexed by word further down (tokens[word]) to obtain the row of
# that word in the word-vector matrix; its length is the vocabulary size.
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size, forwarded to word2vec_sgd_wrapper during training.
C = 5

# Re-seed both Python's `random` and NumPy's global RNG right before the
# word vectors are initialised and training starts.
random.seed(31415)
np.random.seed(9265)

# Train the word vectors with SGD and report the wall-clock time it took.
startTime = time.time()

# Initial parameters, stacked row-wise into a (2 * nWords, dimVectors) matrix:
#   * first nWords rows: uniform noise in [-0.5, 0.5) scaled by 1 / dimVectors
#   * last nWords rows:  all zeros
initNoiseRows = (np.random.rand(nWords, dimVectors) - 0.5) / dimVectors
initZeroRows = np.zeros((nWords, dimVectors))
wordVectors = np.concatenate((initNoiseRows, initZeroRows), axis=0)


def skipgramNegSamplingObjective(vec):
    """Evaluate the skip-gram / negative-sampling objective at `vec`.

    Thin adapter so that `sgd` can call the objective with the parameter
    matrix as its only argument; everything else is taken from the
    enclosing notebook namespace.
    """
    return word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient)


# NOTE(review): the positional arguments after the start point are presumably
# step size, iteration count, post-processing hook and a "resume from saved
# parameters" flag -- confirm against the sgd definition in q3_sgd.
wordVectors = sgd(
    skipgramNegSamplingObjective,
    wordVectors, 0.3, 40000, None, True, PRINT_EVERY=100)
# No normalization is applied during training (the post-processing argument is
# None). This is deliberate, not a bug: normalizing during training would
# lose the notion of vector length.

print("sanity check: cost at convergence should be around or below 10")
elapsedSeconds = time.time() - startTime
print("training took %d seconds" % elapsedSeconds)

# Project a handful of word vectors onto their first two principal components
# and save the labelled scatter of words to 'q3_word_vectors.png'.

# Concatenate the input and output word vectors (the two halves of the
# trained parameter matrix).
# NOTE(review): with axis=0 this re-stacks the two halves in their original
# order, so the result has the same contents as the matrix returned by sgd.
# The lookup below selects rows by tokens[word]; if those indices are all
# below nWords (TODO confirm), only rows from the first half are visualized.
wordVectors = np.concatenate(
    (wordVectors[:nWords, :], wordVectors[nWords:, :]),
    axis=0)

visualizeWords = [
    "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
    "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "enjoyable", "boring", "bad", "waste", "dumb",
    "annoying"]

# Row index of every word to plot, then the corresponding vectors.
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]

# PCA implemented through an SVD. Centering the data (zero mean along every
# dimension) is an essential step of PCA.
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
# The covariance matrix is symmetric positive semi-definite, so the left
# singular vectors returned by its SVD are its eigenvectors, ordered by
# decreasing singular value -- i.e. the principal components.
U, S, V = np.linalg.svd(covariance)
# Coordinates of each centered vector along the top two components.
coord = temp.dot(U[:, 0:2])

# Draw on an explicitly created figure instead of pyplot's implicit "current
# figure": re-running this cell then starts from a clean canvas rather than
# stacking a second set of labels on top of the previous run's figure.
fig, ax = plt.subplots()
for i, label in enumerate(visualizeWords):
    ax.text(coord[i, 0], coord[i, 1], label,
            bbox=dict(facecolor='green', alpha=0.1))

# Only text artists are drawn, so the axis limits have to be set by hand to
# the extent of the projected points.
ax.set_xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
ax.set_ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))

fig.savefig('q3_word_vectors.png')
# Release the figure once it is on disk so repeated runs do not accumulate
# open figures in the kernel.
plt.close(fig)

importing Jupyter notebook from q3_word2vec.ipynb
importing Jupyter notebook from q1_softmax.ipynb
importing Jupyter notebook from q2_gradcheck.ipynb
importing Jupyter notebook from q2_sigmoid.ipynb
importing Jupyter notebook from q3_sgd.ipynb
iter_ 100: 18.604024
iter_ 200: 18.779389
iter_ 300: 18.968864
iter_ 400: 19.209857
iter_ 500: 19.156670
iter_ 600: 19.243421
iter_ 700: 19.440181
iter_ 800: 19.520359
iter_ 900: 19.588859
iter_ 1000: 19.882518
iter_ 1100: 20.161624
iter_ 1200: 20.121685
iter_ 1300: 20.228471
iter_ 1400: 20.474703
iter_ 1500: 20.487733
iter_ 1600: 20.621488
iter_ 1700: 20.671889
iter_ 1800: 20.673450
iter_ 1900: 20.947727
iter_ 2000: 21.009787
iter_ 2100: 21.031895
iter_ 2200: 21.075417
iter_ 2300: 21.037911
iter_ 2400: 21.063337
iter_ 2500: 21.159010
iter_ 2600: 21.220136
iter_ 2700: 21.306704
iter_ 2800: 21.317022
iter_ 2900: 21.313567
iter_ 3000: 21.280537
iter_ 3100: 21.394261
iter_ 3200: 21.222326
iter_ 3300: 21.103933
iter_ 3400: 21.026450
iter_ 3500: 20.94

iter_ 35500: 9.610944
iter_ 35600: 9.663755
iter_ 35700: 9.722588
iter_ 35800: 9.722484
iter_ 35900: 9.677905
iter_ 36000: 9.681488
iter_ 36100: 9.697359
iter_ 36200: 9.675975
iter_ 36300: 9.632207
iter_ 36400: 9.572983
iter_ 36500: 9.546814
iter_ 36600: 9.563374
iter_ 36700: 9.569601
iter_ 36800: 9.610571
iter_ 36900: 9.583856
iter_ 37000: 9.579905
iter_ 37100: 9.555246
iter_ 37200: 9.574555
iter_ 37300: 9.529198
iter_ 37400: 9.501708
iter_ 37500: 9.510385
iter_ 37600: 9.576810
iter_ 37700: 9.520415
iter_ 37800: 9.561922
iter_ 37900: 9.574543
iter_ 38000: 9.605944
iter_ 38100: 9.620448
iter_ 38200: 9.662221
iter_ 38300: 9.625025
iter_ 38400: 9.581447
iter_ 38500: 9.615949
iter_ 38600: 9.606902
iter_ 38700: 9.663988
iter_ 38800: 9.608830
iter_ 38900: 9.631785
iter_ 39000: 9.638900
iter_ 39100: 9.590407
iter_ 39200: 9.596133
iter_ 39300: 9.532854
iter_ 39400: 9.522097
iter_ 39500: 9.497443
iter_ 39600: 9.458520
iter_ 39700: 9.447899
iter_ 39800: 9.428893
iter_ 39900: 9.406359
iter_ 4000