## Question 1

### Q1.1

$$\begin{align*}\text{buy} &= [1,0,0,0,0,0] \\ \text{bought} &= [0,1,0,0,0,0] \\ \text{girl} &= [0,0,1,0,0,0] \\ \text{woman} &= [0,0,0,1,0,0] \\ \text{word} &= [0,0,0,0,1,0] \\ \text{words} &= [0,0,0,0,0,1] \end{align*}$$

### Q1.2

1. The dimension of the embedding linearly increases with the vocabulary size.
2. It does not capture any semantic features of the words.

### Q1.3

Represent the words as {00, 01, 10, 11}.

## Question 2

### Q2.1

- bi-grams: 
    - "CS6493": "taking CS6493", "CS6493 this"
    - "NLP": "studying NLP", "NLP is"
- tri-grams:
    - "CS6493": "am taking CS6493", "taking CS6493 this", "CS6493 this semester"
    - "NLP": "and studying NLP", "studying NLP is", "NLP is really"

### Q2.2

1. Sparse feature space;
2. Only suitable for large training datasets;
3. Cannot interpret unseen words;
4. Sensitive to the hyper-parameter `N`.

### Question 2.3 4-Gram Model

In [132]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Build the 4-gram training data (3 context words -> next word) and the vocabulary
text = 'I am taking CS6493 this semester and studying NLP is really fascinating'
text_list = text.split()
text_list_length = len(text_list)
# Sort the unique words so that every word gets the same index on every run.
# Iterating a plain set of strings follows hash order, which Python randomises
# per process, so the word -> index mapping would change after a kernel restart.
vocab = sorted(set(text_list))
vocab_size = len(vocab)
word_to_ix = {word: i for i, word in enumerate(vocab)}

# Hyperparameters
embedding_dims = [32, 64, 128]
context_size = 3
hidden_size = 128
learning_rate = 0.01
epochs = 10

# Each sample is (space-joined context of `context_size` words, following word).
# The window length comes from `context_size` so the data and the model input
# width cannot drift apart.
training_data = []
for i in range(0, text_list_length - context_size):
    context = ' '.join(text_list[i:i + context_size])
    next_word = text_list[i + context_size]
    training_data.append((context, next_word))

# Create the language model
class LanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The embeddings of the context words are concatenated into a single row,
    passed through one ReLU hidden layer and projected to vocabulary-sized
    log-probabilities.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_size=128):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The hidden layer consumes all context embeddings laid side by side.
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary as a single-row tensor."""
        # Flatten every looked-up embedding into one row (a batch of size 1).
        flat_context = self.embeddings(inputs).reshape(1, -1)
        hidden = F.relu(self.linear1(flat_context))
        logits = self.linear2(hidden)
        return F.log_softmax(logits, dim=1)

# Convert the training pairs to index tensors once. They are the same for every
# epoch and every embedding size, so rebuilding them inside the loops is wasted work.
encoded_training_data = [
    (
        torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long),
        torch.tensor([word_to_ix[target]], dtype=torch.long),
    )
    for context, target in training_data
]
# Reverse mapping used to turn a predicted index back into its word.
ix_to_word = {idx: word for word, idx in word_to_ix.items()}

# Train one fresh model per embedding size and report its final summed loss.
for embedding_dim in embedding_dims:
    model = LanguageModeler(vocab_size, embedding_dim, context_size, hidden_size)

    # Loss and optimizer (the model already emits log-probabilities, hence NLLLoss)
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # Training loop: one SGD step per training sample
    for epoch in range(epochs):
        total_loss = 0  # sum of the per-sample losses over this epoch
        for context_idxs, target_idx in encoded_training_data:
            model.zero_grad()
            log_probs = model(context_idxs)
            loss = loss_function(log_probs, target_idx)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss}")

    print("Training complete!")
    print(f"The training loss for embedding_dim={embedding_dim} is {total_loss}")

    # Example usage to predict the next word
    context = "I am taking"
    context_idxs = torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long)
    # No gradients are needed for inference.
    with torch.no_grad():
        log_probs = model(context_idxs)
    predicted_idx = torch.argmax(log_probs).item()
    predicted_word = ix_to_word[predicted_idx]
    print(f"Next word prediction: {predicted_word}")

Epoch 1, Loss: 22.401702046394348
Epoch 2, Loss: 19.851872205734253
Epoch 3, Loss: 17.524168014526367
Epoch 4, Loss: 15.372764229774475
Epoch 5, Loss: 13.371510922908783
Epoch 6, Loss: 11.53460431098938
Epoch 7, Loss: 9.843500971794128
Epoch 8, Loss: 8.309678226709366
Epoch 9, Loss: 6.957409352064133
Epoch 10, Loss: 5.794081926345825
Training complete!
The training loss for embedding_dim=32 is 5.794081926345825
Next word prediction: CS6493
Epoch 1, Loss: 23.116318225860596
Epoch 2, Loss: 18.840309739112854
Epoch 3, Loss: 15.208776473999023
Epoch 4, Loss: 12.037311911582947
Epoch 5, Loss: 9.362350881099701
Epoch 6, Loss: 7.122990161180496
Epoch 7, Loss: 5.349436938762665
Epoch 8, Loss: 4.025747239589691
Epoch 9, Loss: 3.078266069293022
Epoch 10, Loss: 2.413187339901924
Training complete!
The training loss for embedding_dim=64 is 2.413187339901924
Next word prediction: CS6493
Epoch 1, Loss: 22.857290029525757
Epoch 2, Loss: 16.749339699745178
Epoch 3, Loss: 12.256677389144897
Epoch 4, Lo

### Question 3.1 CBOW Model

In [133]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Build the CBOW training data (surrounding words -> centre word) and the vocabulary
text = 'I am taking CS6493 this semester and studying NLP is really fascinating'
text_list = text.split()
text_list_length = len(text_list)
# Sort the unique words so that every word gets the same index on every run.
# Iterating a plain set of strings follows hash order, which Python randomises
# per process, so the word -> index mapping would change after a kernel restart.
vocab = sorted(set(text_list))
vocab_size = len(vocab)
word_to_ix = {word: i for i, word in enumerate(vocab)}

window_size = 2 # the context length is window_size * 2

# Each sample is (space-joined context words, centre word). Positions closer
# than `window_size` to either end of the sentence are skipped because they
# lack a full window.
training_data = []
for i in range(window_size, text_list_length-window_size):
    left_words = text_list[i-window_size:i]
    right_words = text_list[i+1:i+1+window_size]
    context = ' '.join(left_words + right_words)
    target_word = text_list[i]
    training_data.append((context, target_word))

# Hyperparameters
embedding_dim = 128
hidden_size = 128
learning_rate = 0.01
epochs = 10

# Create the language model
class LanguageModeler(nn.Module):
    """CBOW-style model that predicts a word from the sum of its context embeddings.

    The summed context embedding goes through one ReLU hidden layer and is
    projected to vocabulary-sized log-probabilities.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size=128):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for one context."""
        # Sum over the context words; keepdim leaves a leading axis of size 1
        # so the result is already a one-row batch.
        context_vector = self.embeddings(inputs).sum(dim=0, keepdim=True)
        hidden = F.relu(self.linear1(context_vector))
        logits = self.linear2(hidden)
        return F.log_softmax(logits, dim=1)

model = LanguageModeler(vocab_size, embedding_dim, hidden_size)

# Loss and optimizer (the model already emits log-probabilities, hence NLLLoss)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Convert the training pairs to index tensors once. They are the same for every
# epoch, so rebuilding them inside the training loop is wasted work.
encoded_training_data = [
    (
        torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long),
        torch.tensor([word_to_ix[target]], dtype=torch.long),
    )
    for context, target in training_data
]

# Training loop: one SGD step per training sample
for epoch in range(epochs):
    total_loss = 0  # sum of the per-sample losses over this epoch
    for context_idxs, target_idx in encoded_training_data:
        model.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, target_idx)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

print("Training complete!")
print(f"The training loss for embedding_dim={embedding_dim} is {total_loss}")

# Example usage: predict the centre word from its surrounding context words
context = "am taking this semester"
context_idxs = torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long)
# No gradients are needed for inference.
with torch.no_grad():
    log_probs = model(context_idxs)
predicted_idx = torch.argmax(log_probs).item()
# Reverse mapping used to turn the predicted index back into its word.
ix_to_word = {idx: word for word, idx in word_to_ix.items()}
predicted_word = ix_to_word[predicted_idx]
print(f"Target word prediction: {predicted_word}")

Epoch 1, Loss: 21.519867658615112
Epoch 2, Loss: 13.469220399856567
Epoch 3, Loss: 8.231013298034668
Epoch 4, Loss: 4.947874903678894
Epoch 5, Loss: 3.111188143491745
Epoch 6, Loss: 2.105540543794632
Epoch 7, Loss: 1.534349948167801
Epoch 8, Loss: 1.1862706989049911
Epoch 9, Loss: 0.9588550999760628
Epoch 10, Loss: 0.7983578592538834
Training complete!
The training loss for embedding_dim=128 is 0.7983578592538834
Target word prediction: CS6493


### 3.2 Skip-gram Model

In [135]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Build the skip-gram training data (centre word -> surrounding words) and the vocabulary
text = 'I am taking CS6493 this semester and studying NLP is really fascinating'
text_list = text.split()
text_list_length = len(text_list)
# Sort the unique words so that every word gets the same index on every run.
# Iterating a plain set of strings follows hash order, which Python randomises
# per process, so the word -> index mapping would change after a kernel restart.
vocab = sorted(set(text_list))
vocab_size = len(vocab)
word_to_ix = {word: i for i, word in enumerate(vocab)}

window_size = 2 # the context length is window_size * 2

# Each sample is (centre word, list of its surrounding words). Positions closer
# than `window_size` to either end of the sentence are skipped because they
# lack a full window.
training_data = []
for i in range(window_size, text_list_length-window_size):
    target_context = text_list[i-window_size:i] + text_list[i+1:i+1+window_size]
    central_word = text_list[i]
    training_data.append((central_word, target_context))

# Hyperparameters
embedding_dim = 128
hidden_size = 128
learning_rate = 0.01
epochs = 10

# Create the language model
class LanguageModeler(nn.Module):
    """Skip-gram-style model predicting one word per context position.

    The centre word's embedding goes through one ReLU hidden layer and is
    projected to `context_length * vocab_size` scores, which are reshaped so
    that each context position gets its own distribution over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, context_length):
        super(LanguageModeler, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.context_length = context_length

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_size)
        # One block of vocab_size scores per context position.
        self.linear2 = nn.Linear(hidden_size, context_length*vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of shape (context_length, vocab_size).

        `inputs` is expected to hold a single centre-word index, because the
        reshape below hard-codes a batch of one.
        """
        embeds = self.embeddings(inputs)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        out = out.view(1, self.context_length, -1)
        log_probs = F.log_softmax(out, dim=2)
        # Remove only the leading batch axis. A bare squeeze() would also drop
        # the context axis when context_length == 1 (or the vocab axis for a
        # one-word vocab), giving a 1-D tensor that NLLLoss rejects against a
        # 1-D target.
        log_probs = torch.squeeze(log_probs, dim=0)
        return log_probs

model = LanguageModeler(vocab_size, embedding_dim, hidden_size, window_size*2)

# Loss and optimizer (the model already emits log-probabilities, hence NLLLoss)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Convert the training pairs to index tensors once. They are the same for every
# epoch, so rebuilding them inside the training loop is wasted work.
encoded_training_data = [
    (
        torch.tensor([word_to_ix[central_word]], dtype=torch.long),
        torch.tensor([word_to_ix[word] for word in target_context], dtype=torch.long),
    )
    for central_word, target_context in training_data
]

# Training loop: one SGD step per centre word. NLLLoss averages over the
# context positions of that sample.
for epoch in range(epochs):
    total_loss = 0  # sum of the per-sample losses over this epoch
    for central_word_idx, target_context_idx in encoded_training_data:
        model.zero_grad()
        log_probs = model(central_word_idx)
        loss = loss_function(log_probs, target_context_idx)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

print("Training complete!")
print(f"The training loss for embedding_dim={embedding_dim} is {total_loss}")

# Example usage: predict the surrounding context words of a centre word
central_word = "CS6493"
central_word_idx = torch.tensor([word_to_ix[central_word]], dtype=torch.long)
# No gradients are needed for inference.
with torch.no_grad():
    log_probs = model(central_word_idx)
# Most likely word index for each context position.
predicted_idxs = torch.argmax(log_probs, dim=1)
# Reverse mapping used to turn predicted indices back into words.
ix_to_word = {idx: word for word, idx in word_to_ix.items()}
predicted_context = ' '.join(ix_to_word[idx] for idx in predicted_idxs.tolist())
print(f"Target context prediction: {predicted_context}")

Epoch 1, Loss: 20.37692356109619
Epoch 2, Loss: 19.625171661376953
Epoch 3, Loss: 18.91118288040161
Epoch 4, Loss: 18.229763507843018
Epoch 5, Loss: 17.573153018951416
Epoch 6, Loss: 16.935642957687378
Epoch 7, Loss: 16.3105411529541
Epoch 8, Loss: 15.688520431518555
Epoch 9, Loss: 15.058596849441528
Epoch 10, Loss: 14.423372864723206
Training complete!
The training loss for embedding_dim=128 is 14.423372864723206
Target context prediction: am taking this semester


### Wikipedia corpus
The logic of Question 3.3 is the same as Question 3.2.

### Question 4

### 4.1

1. Very large vocabulary size;
2. Cannot deal with out-of-vocabulary words;
3. Cannot capture the semantic relations between similar words.

### 4.2

In [137]:
# (word, frequency) pairs of the training corpus
words = [('old', 10), ('older', 5), ('oldest', 8), ('hug', 8), ('pug', 4), ('hugs', 5)]
# Base vocabulary: every distinct character, in order of first appearance
# (dict keys keep insertion order, which removes duplicates without reordering).
vocab = list(dict.fromkeys(char for token, _count in words for char in token))
print(vocab)

['o', 'l', 'd', 'e', 'r', 's', 't', 'h', 'u', 'g', 'p']


In [138]:
# Number of entries currently in `vocab`
len(vocab)

11

In [139]:
# Tally every contiguous substring of length >= 2, weighting each occurrence
# by the frequency of the word it comes from
freqs = {}
for token, count in words:
    for start in range(len(token)):
        for stop in range(start + 2, len(token) + 1):
            piece = token[start:stop]
            freqs[piece] = freqs.get(piece, 0) + count
print(freqs)

{'ol': 23, 'old': 23, 'ld': 23, 'olde': 13, 'older': 5, 'lde': 13, 'lder': 5, 'de': 13, 'der': 5, 'er': 5, 'oldes': 8, 'oldest': 8, 'ldes': 8, 'ldest': 8, 'des': 8, 'dest': 8, 'es': 8, 'est': 8, 'st': 8, 'hu': 13, 'hug': 13, 'ug': 17, 'pu': 4, 'pug': 4, 'hugs': 5, 'ugs': 5, 'gs': 5}


In [140]:
# Rank the substrings from most to least frequent; the sort is stable, so ties
# keep the order in which they were first counted
freq_tuples = [(piece, count) for piece, count in freqs.items()]
sorted_freq_tuples = list(freq_tuples)
sorted_freq_tuples.sort(key=lambda entry: entry[1], reverse=True)
print(sorted_freq_tuples)

[('ol', 23), ('old', 23), ('ld', 23), ('ug', 17), ('olde', 13), ('lde', 13), ('de', 13), ('hu', 13), ('hug', 13), ('oldes', 8), ('oldest', 8), ('ldes', 8), ('ldest', 8), ('des', 8), ('dest', 8), ('es', 8), ('est', 8), ('st', 8), ('older', 5), ('lder', 5), ('der', 5), ('er', 5), ('hugs', 5), ('ugs', 5), ('gs', 5), ('pu', 4), ('pug', 4)]


In [141]:
# Append the 5 highest-ranked subwords to the vocab.
# NOTE(review): these come from a ranking of all substrings, not from iterative
# pair merges as in standard BPE - confirm this is the intended algorithm.
for rank in range(5):
    subword, _count = sorted_freq_tuples[rank]
    vocab.append(subword)
print(vocab)

['o', 'l', 'd', 'e', 'r', 's', 't', 'h', 'u', 'g', 'p', 'ol', 'old', 'ld', 'ug', 'olde']


In [142]:
# Tokenize the given words by greedy longest-prefix matching against the vocab;
# a character that no vocab entry covers is emitted as '[unk]'
words = ['hold', 'oldest', 'older', 'pug', 'mug', 'huggingface']
tokenizations = []
for word in words:
    remaining = word
    pieces = []
    while remaining:
        # Try the longest prefix first and shrink until one is in the vocab.
        for end in range(len(remaining), 0, -1):
            if remaining[:end] in vocab:
                pieces.append(remaining[:end])
                remaining = remaining[end:]
                break
        else:
            # Not even the first character is known: mark it and move on.
            pieces.append('[unk]')
            remaining = remaining[1:]
    tokenizations.append(pieces)
print(tokenizations)

[['h', 'old'], ['olde', 's', 't'], ['olde', 'r'], ['p', 'ug'], ['[unk]', 'ug'], ['h', 'ug', 'g', '[unk]', '[unk]', 'g', '[unk]', '[unk]', '[unk]', 'e']]
