{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# News topic classification tasks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torchtext\n",
    "import os\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras_preprocessing import sequence\n",
    "import string\n",
    "import re\n",
    "import numpy as np\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "from torch.utils.data.dataset import random_split\n",
    "import time\n",
    "from torch.utils.data import DataLoader"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download the data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We're going to use torchtext.datasets.AG_NEWS,there are four types of news inside."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_data_path = \"../data\"\n",
    " \n",
    "if not os.path.isdir(load_data_path):\n",
    "    os.mkdir(load_data_path)\n",
    "    \n",
    "train_dataset, test_dataset = torchtext.datasets.AG_NEWS(\n",
    "    root='../data/', split=('train', 'test'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can have a look about the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(list(train_dataset)[:3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The training set has 120,000 samples, and the label has four values: 1, 2, 3, and 4. The distribution of various types of labels in the training set and test set is relatively uniform."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Work with datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Function: \n",
    "1. Replace  with space (i.e., split the words on both sides of it into two words), convert all letters to lowercase. \n",
    "2. Convert the label to [0,3]. \n",
    "3. Sentence length interception"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**The length of the sentences in the sample was analyzed: more than 90% of the sentences were not more than 50 in length, so 50 words were subsequently intercepted.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "punct  = str.maketrans('','',string.punctuation)\n",
    "def process_datasets_by_Tokenizer(train_dataset, test_dataset, seq_len=200):\n",
    "    \"\"\"\n",
    "    Parameter:\n",
    "    Tran Dattaset: Training sample list Lister (Tapoel (Inter, Ster))\n",
    "    Return:\n",
    "    Tran Dattaset: Training set list Lister Petter (Tapul (Turner, Inter))\n",
    "    \"\"\"\n",
    "    tokenizer = Tokenizer()\n",
    "    train_dataset_texts, train_dataset_labels = [], []\n",
    "    test_dataset_texts, test_dataset_labels = [], []\n",
    "    \n",
    "    for label, text in train_dataset:\n",
    "        # In the previous print, you can see that there is \"\\\\\" , which is replaced with a space, and all of them are lowercase letters\n",
    "        train_dataset_texts.append(text.replace('\\\\',' ').translate(punct).lower())\n",
    "        train_dataset_labels.append(label - 1)  # Mapping Labels to [0,3]\n",
    "        \n",
    "    for label, text in test_dataset:\n",
    "        test_dataset_texts.append(text.replace('\\\\',' ').translate(punct).lower())\n",
    "        test_dataset_labels.append(label - 1)\n",
    "    \n",
    "    # Here's the trick and put the training set of tests together to build a vocabulary list, so that there are no unlogged words\n",
    "    all_dataset_texts = train_dataset_texts + test_dataset_texts\n",
    "    all_dataset_labels = train_dataset_labels + test_dataset_labels\n",
    "    tokenizer.fit_on_texts(all_dataset_texts)\n",
    "    \n",
    "    #train_dataset_seqs is a list in which each element is a list that transforms a sentence from a literal representation into an index representation in a vocabulary\n",
    "    train_dataset_seqs = tokenizer.texts_to_sequences(train_dataset_texts)\n",
    "    test_datase_seqs = tokenizer.texts_to_sequences(test_dataset_texts)\n",
    "    #print(type(train_dataset_seqs), type(train_dataset_seqs[0]))  # <class 'list'> <class 'list'>\n",
    "    #print(train_dataset_seqs)\n",
    "    \n",
    "    # Intercept the first seq_len, and make up 0 after the shortage\n",
    "    # train_dataset_seqs is a tensor，size:(Number of samples, seq_len)\n",
    "    train_dataset_seqs = torch.tensor(sequence.pad_sequences(\n",
    "        train_dataset_seqs, seq_len, padding='post'), dtype=torch.int32)\n",
    "    test_datase_seqs = torch.tensor(sequence.pad_sequences(\n",
    "        test_datase_seqs, seq_len, padding='post'), dtype=torch.int32)\n",
    "    #print(type(train_dataset_seqs), type(train_dataset_seqs[0]))  # <class 'torch.Tensor'> <class 'torch.Tensor'>\n",
    "    #print(train_dataset_seqs)\n",
    "    \n",
    "    train_dataset = list(zip(train_dataset_seqs, train_dataset_labels))\n",
    "    test_dataset = list(zip(test_datase_seqs, test_dataset_labels))\n",
    "    \n",
    "    vocab_size = len(tokenizer.index_word.keys())\n",
    "    num_class = len(set(all_dataset_labels))\n",
    "    return train_dataset, test_dataset, vocab_size, num_class\n",
    " \n",
    " \n",
    "embed_dim = 16  # There are about 90,000 words, and the embedding dimension here is 16\n",
    "batch_size = 64\n",
    "seq_len = 50  #  A sentence length of 50 can cover more than 90% of the sample\n",
    " \n",
    "train_dataset, test_dataset, vocab_size, num_class = process_datasets_by_Tokenizer(\n",
    "    train_dataset, test_dataset, seq_len=seq_len)\n",
    " \n",
    "print(train_dataset[:2])\n",
    "print(\"vocab_size = {}, num_class = {}\".format(vocab_size, num_class))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With the 4 print statements commented out open, let's test the code:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = [(1, 'The moon is light'),\n",
    "        (2, 'This is the last rose of summer')]\n",
    "test = train[:]\n",
    "train, test, sz, cls = process_datasets_by_Tokenizer(train, test, seq_len=5)\n",
    "train, test, sz, cls"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build a model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The structure of the model is simple: embedding layer + average pooling layer + fully connected layer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TextSentiment(nn.Module):\n",
    "    \"\"\"Text classification model\"\"\"\n",
    "    def __init__(self, vocab_size, embed_dim, num_class, seq_len):\n",
    "        \"\"\"\n",
    "        description: Initialization function of the class\n",
    "        :param vocab_size: The total number of distinct words contained in the entire corpus\n",
    "        :param embed_dim: Specifies the dimension in which the word is embedded\n",
    "        :param num_class: The total number of categories for the text classification\n",
    "        \"\"\" \n",
    "        super(TextSentiment, self).__init__()\n",
    "        \n",
    "        self.seq_len = seq_len\n",
    "        self.embed_dim = embed_dim\n",
    "        \n",
    "        # Instantiating the embedding layer, \n",
    "        #sparse=True means that only part of the weights are updated each time the gradient is solved for that layer.\n",
    "        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)\n",
    "        # Instantiate the linear layer, the parameters are embed_dim and num_class.\n",
    "        self.fc = nn.Linear(embed_dim, num_class)\n",
    "        # Initialize weights for each layer\n",
    "        self.init_weights()\n",
    "        \n",
    "    def init_weights(self):\n",
    "        \"\"\"Initialize the weight function\"\"\"\n",
    "        # Specifies the number of value ranges for the initial weight\n",
    "        initrange = 0.5\n",
    "        # The weight parameters of each layer are initialized to a uniform distribution\n",
    "        self.embedding.weight.data.uniform_(-initrange, initrange)\n",
    "        self.fc.weight.data.uniform_(-initrange, initrange)\n",
    "        # The bias is initialized to 0\n",
    "        self.fc.bias.data.zero_()\n",
    "        \n",
    "    def forward(self, text):\n",
    "        \"\"\"\n",
    "        :param text: The result of the text numeric mapping\n",
    "        :return: A tensor of the same size as the number of categories, which is used to determine the category of the text\n",
    "        \"\"\"\n",
    "        # [batch_size, seq_len, embed_dim]\n",
    "        embedded = self.embedding(text)  \n",
    "        # [batch_size, embed_dim, seq_len]，\n",
    "        # Later, the dimension where the sentence is located is pooling, so put the dimension where the sentence is located at the end\n",
    "        embedded = embedded.transpose(2, 1) # The dimension of the sentence changes from the second dimension to the third dimension\n",
    "        # [batch_size, embed_dim, 1] \n",
    "        embedded = F.avg_pool1d(embedded, kernel_size=self.seq_len)\n",
    "        # [embed_dim, batch_size] \n",
    "        embedded = embedded.squeeze(-1)\n",
    "        # [batch_size, embed_dim]\n",
    "        # I saw that torch.nn.CrossEntropyLoss() comes with softmax, so I don't have softmax here\n",
    "        return self.fc(embedded)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### generate_batch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "generate_batch: Construct the data in a batch and pass it in as a parameter of the DataLoader function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_batch(batch):\n",
    "    \"\"\"[summary]\n",
    "    Args:\n",
    "        batch ([type]): [description] A batch_size-sized list of sample tensors and tuples of corresponding labels\n",
    "            [(sample1, label1), (sample2, label2), ..., (samplen, labeln)]\n",
    "    :return Sample tensors and labels are in their respective list forms(Tensor)\n",
    "    \"\"\"\n",
    "    text = [entry[0].reshape(1, -1) for entry in batch]\n",
    "#     print(text)\n",
    "    label = torch.tensor([entry[1] for entry in batch])\n",
    "    text = torch.cat(text, dim=0)\n",
    "    \n",
    "    return torch.tensor(text), torch.tensor(label)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's test the effect of this paragraph:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch = [(torch.tensor([3, 23, 2, 8]), 1), (torch.tensor([3, 45, 21, 6]), 0)]\n",
    "res = generate_batch(batch)\n",
    "print(res, res[0].size())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training & Validation Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "def run(data, batch_size, model, criterion, \n",
    "        mode='train', optimizer=None, scheduler=None):\n",
    "    total_loss, total_acc = 0., 0.\n",
    "    \n",
    "    shuffle = False\n",
    "    if mode == 'train':\n",
    "        shuffle = True\n",
    "    data = DataLoader(data, batch_size=batch_size, shuffle=shuffle,\n",
    "                     collate_fn=generate_batch)\n",
    " \n",
    "    for i, (text, label) in enumerate(data):\n",
    "#         text = text.to(device)  # gpu version\n",
    "#         label = label.to(device)\n",
    "        sz = text.size(0)\n",
    "        if mode == 'train':\n",
    "            optimizer.zero_grad()\n",
    "            output = model(text)\n",
    "            loss = criterion(output, label)\n",
    "            # Cumulative batch average, referring to the reservoir sampling algorithm\n",
    "            total_loss = i / (i + 1) * total_loss + loss.item() / sz / (i + 1)\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "#             predict = F.softmax(output, dim=-1)\n",
    "            correct_cnt = (output.argmax(1) == label).sum().item()\n",
    "            total_acc = i / (i + 1) * total_acc + correct_cnt / sz / (i + 1)\n",
    "        else:\n",
    "            with torch.no_grad():\n",
    "                output = model(text)\n",
    "                loss = criterion(output, label)\n",
    "                total_loss = i / (i + 1) * total_loss + loss.item() / sz / (i + 1)\n",
    "#                 predict = F.softmax(output, dim=-1)\n",
    "                correct_cnt = (output.argmax(1) == label).sum().item()\n",
    "                total_acc = i / (i + 1) * total_acc + correct_cnt / sz / (i + 1)\n",
    "        \n",
    "#         if i % 10 == 0:\n",
    "#             print(\"i: {}, loss: {}\".format(i, total_loss))\n",
    "    \n",
    "     # Adjust the optimizer learning rate\n",
    "    if (scheduler):\n",
    "        scheduler.step()\n",
    "#     print(total_loss, total_acc, total_loss / count, total_acc / count, count)\n",
    "    return total_loss , total_acc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Main process"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TextSentiment(vocab_size + 1, embed_dim, num_class, seq_len)\n",
    "# model = TextSentiment(vocab_size + 1, embed_dim, num_class, seq_len).to(device) # gpu version\n",
    " \n",
    "criterion = torch.nn.CrossEntropyLoss()  # Comes with softmax\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=0.1)\n",
    "scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.99)\n",
    " \n",
    "train_len = int(len(train_dataset) * 0.95)\n",
    "sub_train_, sub_valid_ = random_split(train_dataset, \n",
    "                                      [train_len, len(train_dataset) - train_len])\n",
    "n_epochs = 10\n",
    "for epoch in range(n_epochs):\n",
    "    start_time = time.time()\n",
    "    train_loss, train_acc = run(sub_train_, batch_size, model, criterion, \n",
    "        mode='train', optimizer=optimizer, scheduler=scheduler)\n",
    "    \n",
    "    valid_loss, valid_acc = run(sub_train_, batch_size, model, criterion, mode='validation')\n",
    "    \n",
    "    secs = int(time.time() - start_time)\n",
    "    mins = secs / 60\n",
    "    secs = secs % 60\n",
    "    \n",
    "    print(\"Epoch: %d\" % (epoch + 1),\n",
    "          \" | time in %d minutes, %d seconds\" % (mins, secs))\n",
    "    print(\n",
    "        f\"\\tLoss: {train_loss:.4f}(train)\\t|\\tAcc: {train_acc * 100:.1f}%(train)\"\n",
    "    )\n",
    "    print(\n",
    "        f\"\\tLoss: {valid_loss:.4f}(valid)\\t|\\tAcc: {valid_acc * 100:.1f}%(valid)\"\n",
    "    )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}