{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# News topic classification tasks" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## packages" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torchtext\n", "import os\n", "from keras.preprocessing.text import Tokenizer\n", "from keras_preprocessing import sequence\n", "import string\n", "import re\n", "import numpy as np\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "from torch.utils.data.dataset import random_split\n", "import time\n", "from torch.utils.data import DataLoader" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Download the data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We're going to use torchtext.datasets.AG_NEWS,there are four types of news inside." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "load_data_path = \"../data\"\n", " \n", "if not os.path.isdir(load_data_path):\n", " os.mkdir(load_data_path)\n", " \n", "train_dataset, test_dataset = torchtext.datasets.AG_NEWS(\n", " root='../data/', split=('train', 'test'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can have a look about the data." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(list(train_dataset)[:3])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The training set has 120,000 samples, and the label has four values: 1, 2, 3, and 4. The distribution of various types of labels in the training set and test set is relatively uniform." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Work with datasets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Function: \n", "1. Replace with space (i.e., split the words on both sides of it into two words), convert all letters to lowercase. \n", "2. Convert the label to [0,3]. \n", "3. Sentence length interception" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**The length of the sentences in the sample was analyzed: more than 90% of the sentences were not more than 50 in length, so 50 words were subsequently intercepted.**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "punct = str.maketrans('','',string.punctuation)\n", "def process_datasets_by_Tokenizer(train_dataset, test_dataset, seq_len=200):\n", " \"\"\"\n", " Parameter:\n", " Tran Dattaset: Training sample list Lister (Tapoel (Inter, Ster))\n", " Return:\n", " Tran Dattaset: Training set list Lister Petter (Tapul (Turner, Inter))\n", " \"\"\"\n", " tokenizer = Tokenizer()\n", " train_dataset_texts, train_dataset_labels = [], []\n", " test_dataset_texts, test_dataset_labels = [], []\n", " \n", " for label, text in train_dataset:\n", " # In the previous print, you can see that there is \"\\\\\" , which is replaced with a space, and all of them are lowercase letters\n", " train_dataset_texts.append(text.replace('\\\\',' ').translate(punct).lower())\n", " train_dataset_labels.append(label - 1) # Mapping Labels to [0,3]\n", " \n", " for label, text in test_dataset:\n", " test_dataset_texts.append(text.replace('\\\\',' ').translate(punct).lower())\n", " test_dataset_labels.append(label - 1)\n", " \n", " # Here's the trick and put the training set of tests together to build a vocabulary list, so that there are no unlogged words\n", " all_dataset_texts = train_dataset_texts + test_dataset_texts\n", " all_dataset_labels = train_dataset_labels + test_dataset_labels\n", " tokenizer.fit_on_texts(all_dataset_texts)\n", " \n", " #train_dataset_seqs is a list in which each element is a list that transforms a sentence from a literal representation into an index representation in a vocabulary\n", " train_dataset_seqs = tokenizer.texts_to_sequences(train_dataset_texts)\n", " test_datase_seqs = tokenizer.texts_to_sequences(test_dataset_texts)\n", " #print(type(train_dataset_seqs), type(train_dataset_seqs[0])) # \n", " #print(train_dataset_seqs)\n", " \n", " # Intercept the first seq_len, and make up 0 after the shortage\n", " # train_dataset_seqs is a tensor,size:(Number of samples, seq_len)\n", " train_dataset_seqs = torch.tensor(sequence.pad_sequences(\n", " train_dataset_seqs, seq_len, padding='post'), dtype=torch.int32)\n", " test_datase_seqs = torch.tensor(sequence.pad_sequences(\n", " test_datase_seqs, seq_len, padding='post'), dtype=torch.int32)\n", " #print(type(train_dataset_seqs), type(train_dataset_seqs[0])) # \n", " #print(train_dataset_seqs)\n", " \n", " train_dataset = list(zip(train_dataset_seqs, train_dataset_labels))\n", " test_dataset = list(zip(test_datase_seqs, test_dataset_labels))\n", " \n", " vocab_size = len(tokenizer.index_word.keys())\n", " num_class = len(set(all_dataset_labels))\n", " return train_dataset, test_dataset, vocab_size, num_class\n", " \n", " \n", "embed_dim = 16 # There are about 90,000 words, and the embedding dimension here is 16\n", "batch_size = 64\n", "seq_len = 50 # A sentence length of 50 can cover more than 90% of the sample\n", " \n", "train_dataset, test_dataset, vocab_size, num_class = process_datasets_by_Tokenizer(\n", " train_dataset, test_dataset, seq_len=seq_len)\n", " \n", "print(train_dataset[:2])\n", "print(\"vocab_size = {}, num_class = {}\".format(vocab_size, num_class))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "With the 4 print statements commented out open, let's test the code:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train = [(1, 'The moon is light'),\n", " (2, 'This is the last rose of summer')]\n", "test = train[:]\n", "train, test, sz, cls = process_datasets_by_Tokenizer(train, test, seq_len=5)\n", "train, test, sz, cls" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Build a model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The structure of the model is simple: embedding layer + average pooling layer + fully connected layer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class TextSentiment(nn.Module):\n", " \"\"\"Text classification model\"\"\"\n", " def __init__(self, vocab_size, embed_dim, num_class, seq_len):\n", " \"\"\"\n", " description: Initialization function of the class\n", " :param vocab_size: The total number of distinct words contained in the entire corpus\n", " :param embed_dim: Specifies the dimension in which the word is embedded\n", " :param num_class: The total number of categories for the text classification\n", " \"\"\" \n", " super(TextSentiment, self).__init__()\n", " \n", " self.seq_len = seq_len\n", " self.embed_dim = embed_dim\n", " \n", " # Instantiating the embedding layer, \n", " #sparse=True means that only part of the weights are updated each time the gradient is solved for that layer.\n", " self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)\n", " # Instantiate the linear layer, the parameters are embed_dim and num_class.\n", " self.fc = nn.Linear(embed_dim, num_class)\n", " # Initialize weights for each layer\n", " self.init_weights()\n", " \n", " def init_weights(self):\n", " \"\"\"Initialize the weight function\"\"\"\n", " # Specifies the number of value ranges for the initial weight\n", " initrange = 0.5\n", " # The weight parameters of each layer are initialized to a uniform distribution\n", " self.embedding.weight.data.uniform_(-initrange, initrange)\n", " self.fc.weight.data.uniform_(-initrange, initrange)\n", " # The bias is initialized to 0\n", " self.fc.bias.data.zero_()\n", " \n", " def forward(self, text):\n", " \"\"\"\n", " :param text: The result of the text numeric mapping\n", " :return: A tensor of the same size as the number of categories, which is used to determine the category of the text\n", " \"\"\"\n", " # [batch_size, seq_len, embed_dim]\n", " embedded = self.embedding(text) \n", " # [batch_size, embed_dim, seq_len],\n", " # Later, the dimension where the sentence is located is pooling, so put the dimension where the sentence is located at the end\n", " embedded = embedded.transpose(2, 1) # The dimension of the sentence changes from the second dimension to the third dimension\n", " # [batch_size, embed_dim, 1] \n", " embedded = F.avg_pool1d(embedded, kernel_size=self.seq_len)\n", " # [embed_dim, batch_size] \n", " embedded = embedded.squeeze(-1)\n", " # [batch_size, embed_dim]\n", " # I saw that torch.nn.CrossEntropyLoss() comes with softmax, so I don't have softmax here\n", " return self.fc(embedded)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Training" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### generate_batch" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "generate_batch: Construct the data in a batch and pass it in as a parameter of the DataLoader function" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def generate_batch(batch):\n", " \"\"\"[summary]\n", " Args:\n", " batch ([type]): [description] A batch_size-sized list of sample tensors and tuples of corresponding labels\n", " [(sample1, label1), (sample2, label2), ..., (samplen, labeln)]\n", " :return Sample tensors and labels are in their respective list forms(Tensor)\n", " \"\"\"\n", " text = [entry[0].reshape(1, -1) for entry in batch]\n", "# print(text)\n", " label = torch.tensor([entry[1] for entry in batch])\n", " text = torch.cat(text, dim=0)\n", " \n", " return torch.tensor(text), torch.tensor(label)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's test the effect of this paragraph:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "batch = [(torch.tensor([3, 23, 2, 8]), 1), (torch.tensor([3, 45, 21, 6]), 0)]\n", "res = generate_batch(batch)\n", "print(res, res[0].size())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Training & Validation Functions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "def run(data, batch_size, model, criterion, \n", " mode='train', optimizer=None, scheduler=None):\n", " total_loss, total_acc = 0., 0.\n", " \n", " shuffle = False\n", " if mode == 'train':\n", " shuffle = True\n", " data = DataLoader(data, batch_size=batch_size, shuffle=shuffle,\n", " collate_fn=generate_batch)\n", " \n", " for i, (text, label) in enumerate(data):\n", "# text = text.to(device) # gpu version\n", "# label = label.to(device)\n", " sz = text.size(0)\n", " if mode == 'train':\n", " optimizer.zero_grad()\n", " output = model(text)\n", " loss = criterion(output, label)\n", " # Cumulative batch average, referring to the reservoir sampling algorithm\n", " total_loss = i / (i + 1) * total_loss + loss.item() / sz / (i + 1)\n", " loss.backward()\n", " optimizer.step()\n", "# predict = F.softmax(output, dim=-1)\n", " correct_cnt = (output.argmax(1) == label).sum().item()\n", " total_acc = i / (i + 1) * total_acc + correct_cnt / sz / (i + 1)\n", " else:\n", " with torch.no_grad():\n", " output = model(text)\n", " loss = criterion(output, label)\n", " total_loss = i / (i + 1) * total_loss + loss.item() / sz / (i + 1)\n", "# predict = F.softmax(output, dim=-1)\n", " correct_cnt = (output.argmax(1) == label).sum().item()\n", " total_acc = i / (i + 1) * total_acc + correct_cnt / sz / (i + 1)\n", " \n", "# if i % 10 == 0:\n", "# print(\"i: {}, loss: {}\".format(i, total_loss))\n", " \n", " # Adjust the optimizer learning rate\n", " if (scheduler):\n", " scheduler.step()\n", "# print(total_loss, total_acc, total_loss / count, total_acc / count, count)\n", " return total_loss , total_acc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Main process" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = TextSentiment(vocab_size + 1, embed_dim, num_class, seq_len)\n", "# model = TextSentiment(vocab_size + 1, embed_dim, num_class, seq_len).to(device) # gpu version\n", " \n", "criterion = torch.nn.CrossEntropyLoss() # Comes with softmax\n", "optimizer = torch.optim.SGD(model.parameters(), lr=0.1)\n", "scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.99)\n", " \n", "train_len = int(len(train_dataset) * 0.95)\n", "sub_train_, sub_valid_ = random_split(train_dataset, \n", " [train_len, len(train_dataset) - train_len])\n", "n_epochs = 10\n", "for epoch in range(n_epochs):\n", " start_time = time.time()\n", " train_loss, train_acc = run(sub_train_, batch_size, model, criterion, \n", " mode='train', optimizer=optimizer, scheduler=scheduler)\n", " \n", " valid_loss, valid_acc = run(sub_train_, batch_size, model, criterion, mode='validation')\n", " \n", " secs = int(time.time() - start_time)\n", " mins = secs / 60\n", " secs = secs % 60\n", " \n", " print(\"Epoch: %d\" % (epoch + 1),\n", " \" | time in %d minutes, %d seconds\" % (mins, secs))\n", " print(\n", " f\"\\tLoss: {train_loss:.4f}(train)\\t|\\tAcc: {train_acc * 100:.1f}%(train)\"\n", " )\n", " print(\n", " f\"\\tLoss: {valid_loss:.4f}(valid)\\t|\\tAcc: {valid_acc * 100:.1f}%(valid)\"\n", " )" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 2 }