{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Word Window Classification and Neural Networks "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I recommend you take a look at these material first."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture4.pdf\n",
"* https://en.wikipedia.org/wiki/Named-entity_recognition"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"from torch.autograd import Variable\n",
"import torch.optim as optim\n",
"import torch.nn.functional as F\n",
"import nltk\n",
"import random\n",
"import numpy as np\n",
"from collections import Counter\n",
"flatten = lambda l: [item for sublist in l for item in sublist]\n",
"from sklearn_crfsuite import metrics\n",
"random.seed(1024)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You also need sklearn_crfsuite latest version for print confusion matrix"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.3.0.post4\n",
"3.2.4\n"
]
}
],
"source": [
"print(torch.__version__)\n",
"print(nltk.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"USE_CUDA = torch.cuda.is_available()\n",
"gpus = [0]\n",
"torch.cuda.set_device(gpus[0])\n",
"\n",
"FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor\n",
"LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor\n",
"ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def getBatch(batch_size, train_data):\n",
" random.shuffle(train_data)\n",
" sindex = 0\n",
" eindex = batch_size\n",
" while eindex < len(train_data):\n",
" batch = train_data[sindex: eindex]\n",
" temp = eindex\n",
" eindex = eindex + batch_size\n",
" sindex = temp\n",
" yield batch\n",
" \n",
" if eindex >= len(train_data):\n",
" batch = train_data[sindex:]\n",
" yield batch"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def prepare_sequence(seq, word2index):\n",
" idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index[\"\"], seq))\n",
" return Variable(LongTensor(idxs))\n",
"\n",
"def prepare_word(word, word2index):\n",
" return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index[\"\"]]))\n",
"\n",
"def prepare_tag(tag,tag2index):\n",
" return Variable(LongTensor([tag2index[tag]]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data load and Preprocessing "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"CoNLL-2002 Shared Task: Language-Independent Named Entity Recognition
\n",
"https://www.clips.uantwerpen.be/conll2002/ner/"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"corpus = nltk.corpus.conll2002.iob_sents()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data = []\n",
"for cor in corpus:\n",
" sent, _, tag = list(zip(*cor))\n",
" data.append([sent, tag])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"35651\n",
"[('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.'), ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]\n"
]
}
],
"source": [
"print(len(data))\n",
"print(data[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Build Vocab"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sents,tags = list(zip(*data))\n",
"vocab = list(set(flatten(sents)))\n",
"tagset = list(set(flatten(tags)))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"word2index={'' : 0, '' : 1} # dummy token is for start or end of sentence\n",
"for vo in vocab:\n",
" if word2index.get(vo) is None:\n",
" word2index[vo] = len(word2index)\n",
"index2word = {v:k for k, v in word2index.items()}\n",
"\n",
"tag2index = {}\n",
"for tag in tagset:\n",
" if tag2index.get(tag) is None:\n",
" tag2index[tag] = len(tag2index)\n",
"index2tag={v:k for k, v in tag2index.items()}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Example : Classify 'Paris' in the context of this sentence with window length 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"borrowed image from http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture4.pdf"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"WINDOW_SIZE = 2\n",
"windows = []"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for sample in data:\n",
" dummy = [''] * WINDOW_SIZE\n",
" window = list(nltk.ngrams(dummy + list(sample[0]) + dummy, WINDOW_SIZE * 2 + 1))\n",
" windows.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[['', '', 'Sao', 'Paulo', '('], 'B-LOC']"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"windows[0]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"678377"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(windows)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"random.shuffle(windows)\n",
"\n",
"train_data = windows[:int(len(windows) * 0.9)]\n",
"test_data = windows[int(len(windows) * 0.9):]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modeling "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"borrowed image from http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture4.pdf"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class WindowClassifier(nn.Module): \n",
" def __init__(self, vocab_size, embedding_size, window_size, hidden_size, output_size):\n",
"\n",
" super(WindowClassifier, self).__init__()\n",
" \n",
" self.embed = nn.Embedding(vocab_size, embedding_size)\n",
" self.h_layer1 = nn.Linear(embedding_size * (window_size * 2 + 1), hidden_size)\n",
" self.h_layer2 = nn.Linear(hidden_size, hidden_size)\n",
" self.o_layer = nn.Linear(hidden_size, output_size)\n",
" self.relu = nn.ReLU()\n",
" self.softmax = nn.LogSoftmax(dim=1)\n",
" self.dropout = nn.Dropout(0.3)\n",
" \n",
" def forward(self, inputs, is_training=False): \n",
" embeds = self.embed(inputs) # BxWxD\n",
" concated = embeds.view(-1, embeds.size(1)*embeds.size(2)) # Bx(W*D)\n",
" h0 = self.relu(self.h_layer1(concated))\n",
" if is_training:\n",
" h0 = self.dropout(h0)\n",
" h1 = self.relu(self.h_layer2(h0))\n",
" if is_training:\n",
" h1 = self.dropout(h1)\n",
" out = self.softmax(self.o_layer(h1))\n",
" return out"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"BATCH_SIZE = 128\n",
"EMBEDDING_SIZE = 50 # x (WINDOW_SIZE*2+1) = 250\n",
"HIDDEN_SIZE = 300\n",
"EPOCH = 3\n",
"LEARNING_RATE = 0.001"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It takes for a while if you use just cpu."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"model = WindowClassifier(len(word2index), EMBEDDING_SIZE, WINDOW_SIZE, HIDDEN_SIZE, len(tag2index))\n",
"if USE_CUDA:\n",
" model = model.cuda()\n",
"loss_function = nn.CrossEntropyLoss()\n",
"optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0/3] mean_loss : 2.25\n",
"[0/3] mean_loss : 0.47\n",
"[0/3] mean_loss : 0.36\n",
"[0/3] mean_loss : 0.31\n",
"[0/3] mean_loss : 0.28\n",
"[1/3] mean_loss : 0.22\n",
"[1/3] mean_loss : 0.21\n",
"[1/3] mean_loss : 0.21\n",
"[1/3] mean_loss : 0.19\n",
"[1/3] mean_loss : 0.19\n",
"[2/3] mean_loss : 0.12\n",
"[2/3] mean_loss : 0.15\n",
"[2/3] mean_loss : 0.15\n",
"[2/3] mean_loss : 0.14\n",
"[2/3] mean_loss : 0.14\n"
]
}
],
"source": [
"for epoch in range(EPOCH):\n",
" losses = []\n",
" for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):\n",
" x,y=list(zip(*batch))\n",
" inputs = torch.cat([prepare_sequence(sent, word2index).view(1, -1) for sent in x])\n",
" targets = torch.cat([prepare_tag(tag, tag2index) for tag in y])\n",
" model.zero_grad()\n",
" preds = model(inputs, is_training=True)\n",
" loss = loss_function(preds, targets)\n",
" losses.append(loss.data.tolist()[0])\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" if i % 1000 == 0:\n",
" print(\"[%d/%d] mean_loss : %0.2f\" %(epoch, EPOCH, np.mean(losses)))\n",
" losses = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test "
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for_f1_score = []"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"95.69120551903063\n"
]
}
],
"source": [
"accuracy = 0\n",
"for test in test_data:\n",
" x, y = test[0], test[1]\n",
" input_ = prepare_sequence(x, word2index).view(1, -1)\n",
"\n",
" i = model(input_).max(1)[1]\n",
" pred = index2tag[i.data.tolist()[0]]\n",
" for_f1_score.append([pred, y])\n",
" if pred == y:\n",
" accuracy += 1\n",
"\n",
"print(accuracy/len(test_data) * 100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This high score is because most of labels are 'O' tag. So we need to measure f1 score."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Print Confusion matrix "
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred, y_test = list(zip(*for_f1_score))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sorted_labels = sorted(\n",
" list(set(y_test) - {'O'}),\n",
" key=lambda name: (name[1:], name[0])\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_labels"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred = [[y] for y in y_pred] # this is because sklearn_crfsuite.metrics function flatten inputs\n",
"y_test = [[y] for y in y_test]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" B-LOC 0.802 0.636 0.710 1085\n",
" I-LOC 0.732 0.457 0.562 311\n",
" B-MISC 0.750 0.378 0.503 801\n",
" I-MISC 0.679 0.331 0.445 641\n",
" B-ORG 0.723 0.738 0.730 1430\n",
" I-ORG 0.710 0.700 0.705 969\n",
" B-PER 0.782 0.773 0.777 1268\n",
" I-PER 0.853 0.871 0.861 950\n",
"\n",
"avg / total 0.759 0.656 0.693 7455\n",
"\n"
]
}
],
"source": [
"print(metrics.flat_classification_report(\n",
" y_test, y_pred, labels = sorted_labels, digits=3\n",
"))"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* use max-margin objective function http://pytorch.org/docs/master/nn.html#multilabelmarginloss"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}