{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "785045d4-dabd-4d1c-a2bc-d05cca1a117f",
   "metadata": {},
   "source": [
    "# DistilBert Multilabel"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4d0b9b1d-aa4f-49ae-aefe-915573c8ad4f",
   "metadata": {},
   "source": [
    "Multi-label DistilBert model trained (/fine-tuned) on toxicity dataset:\n",
    "    \n",
    "    \n",
    "- https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b09a9adf",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# !conda install watermark -c conda-forge --yes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bdc3f297",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T13:52:31.759757Z",
     "iopub.status.busy": "2022-10-15T13:52:31.759405Z",
     "iopub.status.idle": "2022-10-15T13:52:32.540284Z",
     "shell.execute_reply": "2022-10-15T13:52:32.539260Z",
     "shell.execute_reply.started": "2022-10-15T13:52:31.759720Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch       : 1.11.0\n",
      "transformers: 4.20.1\n",
      "pandas      : 1.3.5\n",
      "tqdm        : 4.64.0\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%load_ext watermark\n",
    "%watermark -p torch,transformers,pandas,tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "15688a18",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T13:52:32.542676Z",
     "iopub.status.busy": "2022-10-15T13:52:32.541816Z",
     "iopub.status.idle": "2022-10-15T13:52:33.718479Z",
     "shell.execute_reply": "2022-10-15T13:52:33.717348Z",
     "shell.execute_reply.started": "2022-10-15T13:52:32.542619Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import torch\n",
    "from tqdm import tqdm\n",
    "from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler\n",
    "from transformers import DistilBertTokenizer, DistilBertModel"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c41c8600",
   "metadata": {},
   "source": [
    "# Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5c4059d4",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T13:52:33.722912Z",
     "iopub.status.busy": "2022-10-15T13:52:33.721403Z",
     "iopub.status.idle": "2022-10-15T13:52:33.754111Z",
     "shell.execute_reply": "2022-10-15T13:52:33.752925Z",
     "shell.execute_reply.started": "2022-10-15T13:52:33.722865Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cuda:0\n"
     ]
    }
   ],
   "source": [
    "MAX_LEN = 512\n",
    "TRAIN_BATCH_SIZE = 16\n",
    "VALID_BATCH_SIZE = 16\n",
    "EPOCHS = 3\n",
    "LEARNING_RATE = 1e-05\n",
    "DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'\n",
    "print(DEVICE)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "282bd787",
   "metadata": {},
   "source": [
    "# Load and Prepare Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b19be326",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T13:52:33.755972Z",
     "iopub.status.busy": "2022-10-15T13:52:33.755623Z",
     "iopub.status.idle": "2022-10-15T13:52:37.816127Z",
     "shell.execute_reply": "2022-10-15T13:52:37.815166Z",
     "shell.execute_reply.started": "2022-10-15T13:52:33.755935Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>comment_text</th>\n",
       "      <th>labels</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Explanation\\nWhy the edits made under my usern...</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>D'aww! He matches this background colour I'm s...</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Hey man, I'm really not trying to edit war. It...</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>You, sir, are my hero. Any chance you remember...</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        comment_text              labels\n",
       "0  Explanation\\nWhy the edits made under my usern...  [0, 0, 0, 0, 0, 0]\n",
       "1  D'aww! He matches this background colour I'm s...  [0, 0, 0, 0, 0, 0]\n",
       "2  Hey man, I'm really not trying to edit war. It...  [0, 0, 0, 0, 0, 0]\n",
       "3  \"\\nMore\\nI can't make any real suggestions on ...  [0, 0, 0, 0, 0, 0]\n",
       "4  You, sir, are my hero. Any chance you remember...  [0, 0, 0, 0, 0, 0]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')\n",
    "\n",
    "label_columns = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n",
    "train_data['labels'] = train_data[label_columns].apply(lambda x: list(x), axis=1)\n",
    "\n",
    "train_data.drop(['id'], inplace=True, axis=1)\n",
    "train_data.drop(label_columns, inplace=True, axis=1)\n",
    "\n",
    "train_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "890c7fff",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T13:52:37.823176Z",
     "iopub.status.busy": "2022-10-15T13:52:37.820051Z",
     "iopub.status.idle": "2022-10-15T13:52:37.836971Z",
     "shell.execute_reply": "2022-10-15T13:52:37.835982Z",
     "shell.execute_reply.started": "2022-10-15T13:52:37.823136Z"
    }
   },
   "outputs": [],
   "source": [
    "class MultiLabelDataset(Dataset):\n",
    "\n",
    "    def __init__(self, dataframe, tokenizer, max_len, new_data=False):\n",
    "        self.tokenizer = tokenizer\n",
    "        self.data = dataframe\n",
    "        self.text = dataframe.comment_text\n",
    "        self.new_data = new_data\n",
    "        \n",
    "        if not new_data:\n",
    "            self.targets = self.data.labels\n",
    "        self.max_len = max_len\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.text)\n",
    "\n",
    "    def __getitem__(self, index):\n",
    "        text = str(self.text[index])\n",
    "        text = \" \".join(text.split())\n",
    "\n",
    "        inputs = self.tokenizer.encode_plus(\n",
    "            text,\n",
    "            None,\n",
    "            add_special_tokens=True,\n",
    "            max_length=self.max_len,\n",
    "            pad_to_max_length=True,\n",
    "            return_token_type_ids=True\n",
    "        )\n",
    "        ids = inputs['input_ids']\n",
    "        mask = inputs['attention_mask']\n",
    "        token_type_ids = inputs[\"token_type_ids\"]\n",
    "\n",
    "        out = {\n",
    "            'ids': torch.tensor(ids, dtype=torch.long),\n",
    "            'mask': torch.tensor(mask, dtype=torch.long),\n",
    "            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),\n",
    "        }\n",
    "        \n",
    "        if not self.new_data:\n",
    "            out['targets'] = torch.tensor(self.targets[index], dtype=torch.float)\n",
    "\n",
    "        return out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ed86ec0c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T13:52:37.844663Z",
     "iopub.status.busy": "2022-10-15T13:52:37.841436Z",
     "iopub.status.idle": "2022-10-15T13:52:40.989366Z",
     "shell.execute_reply": "2022-10-15T13:52:40.988419Z",
     "shell.execute_reply.started": "2022-10-15T13:52:37.844608Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Orig Dataset: (159571, 2)\n",
      "Training Dataset: (159571, 2)\n",
      "Validation Dataset: (0, 2)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2d4b50b831824e3e9edf79f2afab2623",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8276f1592c60430aa8a79f43ba71121f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0c00e8260c6d46d389f58b2224817d9d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "train_size = 1.0\n",
    "\n",
    "train_df = train_data.sample(frac=train_size, random_state=123)\n",
    "val_df = train_data.drop(train_df.index).reset_index(drop=True)\n",
    "train_df = train_df.reset_index(drop=True)\n",
    "\n",
    "\n",
    "print(\"Orig Dataset: {}\".format(train_data.shape))\n",
    "print(\"Training Dataset: {}\".format(train_df.shape))\n",
    "print(\"Validation Dataset: {}\".format(val_df.shape))\n",
    "\n",
    "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)\n",
    "training_set = MultiLabelDataset(train_df, tokenizer, MAX_LEN)\n",
    "val_set = MultiLabelDataset(val_df, tokenizer, MAX_LEN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fc015a3c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T13:52:40.991345Z",
     "iopub.status.busy": "2022-10-15T13:52:40.990967Z",
     "iopub.status.idle": "2022-10-15T13:52:40.998708Z",
     "shell.execute_reply": "2022-10-15T13:52:40.997062Z",
     "shell.execute_reply.started": "2022-10-15T13:52:40.991306Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py:490: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n",
      "  cpuset_checked))\n"
     ]
    }
   ],
   "source": [
    "train_params = {'batch_size': TRAIN_BATCH_SIZE,\n",
    "                'shuffle': True,\n",
    "                'num_workers': 8\n",
    "                }\n",
    "\n",
    "val_params = {'batch_size': VALID_BATCH_SIZE,\n",
    "               'shuffle': False,\n",
    "               'num_workers': 8\n",
    "                }\n",
    "\n",
    "training_loader = DataLoader(training_set, **train_params)\n",
    "#val_loader = DataLoader(val_set, **val_params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c1d8eba",
   "metadata": {},
   "source": [
    "# Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "385ae828",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T13:52:41.000712Z",
     "iopub.status.busy": "2022-10-15T13:52:41.000114Z",
     "iopub.status.idle": "2022-10-15T13:52:53.194175Z",
     "shell.execute_reply": "2022-10-15T13:52:53.193116Z",
     "shell.execute_reply.started": "2022-10-15T13:52:41.000673Z"
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dc44a6d23fc048538a0c20c33cd25921",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight']\n",
      "- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "class DistilBERTClass(torch.nn.Module):\n",
    "    def __init__(self):\n",
    "        super(DistilBERTClass, self).__init__()\n",
    "        \n",
    "        self.bert = DistilBertModel.from_pretrained(\"distilbert-base-uncased\")\n",
    "        self.classifier = torch.nn.Sequential(\n",
    "            torch.nn.Linear(768, 768),\n",
    "            torch.nn.ReLU(),\n",
    "            torch.nn.Dropout(0.1),\n",
    "            torch.nn.Linear(768, 6)\n",
    "        )\n",
    "\n",
    "    def forward(self, input_ids, attention_mask, token_type_ids):\n",
    "        output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n",
    "        hidden_state = output_1[0]\n",
    "        out = hidden_state[:, 0]\n",
    "        out = self.classifier(out)\n",
    "        return out\n",
    "\n",
    "model = DistilBERTClass()\n",
    "model.to(DEVICE);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a41f7fd5",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T13:52:53.198462Z",
     "iopub.status.busy": "2022-10-15T13:52:53.198158Z",
     "iopub.status.idle": "2022-10-15T13:52:53.204896Z",
     "shell.execute_reply": "2022-10-15T13:52:53.203922Z",
     "shell.execute_reply.started": "2022-10-15T13:52:53.198434Z"
    }
   },
   "outputs": [],
   "source": [
    "optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4239d00d",
   "metadata": {},
   "source": [
    "# Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "70a7406e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T13:52:53.207070Z",
     "iopub.status.busy": "2022-10-15T13:52:53.206343Z",
     "iopub.status.idle": "2022-10-15T13:52:53.216326Z",
     "shell.execute_reply": "2022-10-15T13:52:53.215357Z",
     "shell.execute_reply.started": "2022-10-15T13:52:53.207031Z"
    }
   },
   "outputs": [],
   "source": [
    "def train(epoch):\n",
    "    model.train()\n",
    "    \n",
    "    for _, data in tqdm(enumerate(training_loader, 0)):\n",
    "        ids = data['ids'].to(DEVICE, dtype=torch.long)\n",
    "        mask = data['mask'].to(DEVICE, dtype=torch.long)\n",
    "        token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)\n",
    "        targets = data['targets'].to(DEVICE, dtype=torch.float)\n",
    "\n",
    "        outputs = model(ids, mask, token_type_ids)\n",
    "\n",
    "        optimizer.zero_grad()\n",
    "        loss = torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)\n",
    "        \n",
    "        if _ % 5000 == 0:\n",
    "            print(f'Epoch: {epoch}, Loss:  {loss.item()}')\n",
    "        \n",
    "        loss.backward()\n",
    "        optimizer.step()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "0b0a39d1",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T13:52:53.218041Z",
     "iopub.status.busy": "2022-10-15T13:52:53.217549Z",
     "iopub.status.idle": "2022-10-15T17:21:21.971366Z",
     "shell.execute_reply": "2022-10-15T17:21:21.969444Z",
     "shell.execute_reply.started": "2022-10-15T13:52:53.218004Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch: 0, Loss:  0.7045817375183105\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "5000it [34:46,  2.41it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch: 0, Loss:  0.0034446530044078827\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "9974it [1:09:20,  2.40it/s]\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch: 1, Loss:  0.08794672787189484\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "5000it [34:50,  2.40it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch: 1, Loss:  0.01744117960333824\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "9974it [1:09:32,  2.39it/s]\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch: 2, Loss:  0.024290692061185837\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "5000it [34:53,  2.40it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch: 2, Loss:  0.008041102439165115\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "9974it [1:09:34,  2.39it/s]\n"
     ]
    }
   ],
   "source": [
    "for epoch in range(EPOCHS):\n",
    "    train(epoch)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec86c2cf",
   "metadata": {},
   "source": [
    "# Generate Test Submissions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "d7a067aa",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T17:32:32.949201Z",
     "iopub.status.busy": "2022-10-15T17:32:32.948751Z",
     "iopub.status.idle": "2022-10-15T17:32:34.754191Z",
     "shell.execute_reply": "2022-10-15T17:32:34.753050Z",
     "shell.execute_reply.started": "2022-10-15T17:32:32.949159Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>comment_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>00001cee341fdb12</td>\n",
       "      <td>Yo bitch Ja Rule is more succesful then you'll...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0000247867823ef7</td>\n",
       "      <td>== From RfC == \\n\\n The title is fine as it is...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>00013b17ad220c46</td>\n",
       "      <td>\" \\n\\n == Sources == \\n\\n * Zawe Ashton on Lap...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>00017563c3f7919a</td>\n",
       "      <td>:If you have a look back at the source, the in...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>00017695ad8997eb</td>\n",
       "      <td>I don't anonymously edit articles at all.</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id                                       comment_text\n",
       "0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...\n",
       "1  0000247867823ef7  == From RfC == \\n\\n The title is fine as it is...\n",
       "2  00013b17ad220c46  \" \\n\\n == Sources == \\n\\n * Zawe Ashton on Lap...\n",
       "3  00017563c3f7919a  :If you have a look back at the source, the in...\n",
       "4  00017695ad8997eb          I don't anonymously edit articles at all."
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')\n",
    "test_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b2643443",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T17:32:34.756605Z",
     "iopub.status.busy": "2022-10-15T17:32:34.756231Z",
     "iopub.status.idle": "2022-10-15T17:32:34.763296Z",
     "shell.execute_reply": "2022-10-15T17:32:34.762149Z",
     "shell.execute_reply.started": "2022-10-15T17:32:34.756567Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py:490: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n",
      "  cpuset_checked))\n"
     ]
    }
   ],
   "source": [
    "test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN, new_data=True)\n",
    "test_loader = DataLoader(test_set, **val_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "52d65222",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T17:32:37.165660Z",
     "iopub.status.busy": "2022-10-15T17:32:37.164342Z",
     "iopub.status.idle": "2022-10-15T17:53:57.945099Z",
     "shell.execute_reply": "2022-10-15T17:53:57.943866Z",
     "shell.execute_reply.started": "2022-10-15T17:32:37.165598Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "0it [00:00, ?it/s]/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "9573it [21:20,  7.48it/s]\n"
     ]
    }
   ],
   "source": [
    "all_test_pred = []\n",
    "\n",
    "def test(epoch):\n",
    "    model.eval()\n",
    "    \n",
    "    with torch.inference_mode():\n",
    "    \n",
    "        for _, data in tqdm(enumerate(test_loader, 0)):\n",
    "\n",
    "\n",
    "            ids = data['ids'].to(DEVICE, dtype=torch.long)\n",
    "            mask = data['mask'].to(DEVICE, dtype=torch.long)\n",
    "            token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)\n",
    "            outputs = model(ids, mask, token_type_ids)\n",
    "            probas = torch.sigmoid(outputs)\n",
    "\n",
    "            all_test_pred.append(probas)\n",
    "            \n",
    "            \n",
    "    return probas\n",
    "probas = test(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "6b484e66",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T17:53:57.948511Z",
     "iopub.status.busy": "2022-10-15T17:53:57.947874Z",
     "iopub.status.idle": "2022-10-15T17:53:57.970095Z",
     "shell.execute_reply": "2022-10-15T17:53:57.969162Z",
     "shell.execute_reply.started": "2022-10-15T17:53:57.948460Z"
    }
   },
   "outputs": [],
   "source": [
    "all_test_pred = torch.cat(all_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c5f590c3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T17:53:57.973188Z",
     "iopub.status.busy": "2022-10-15T17:53:57.972789Z",
     "iopub.status.idle": "2022-10-15T17:53:58.014544Z",
     "shell.execute_reply": "2022-10-15T17:53:58.013540Z",
     "shell.execute_reply.started": "2022-10-15T17:53:57.973160Z"
    }
   },
   "outputs": [],
   "source": [
    "submit_df = test_data.copy()\n",
    "submit_df.drop(\"comment_text\", inplace=True, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "fa28e0d7",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T17:53:58.017452Z",
     "iopub.status.busy": "2022-10-15T17:53:58.016993Z",
     "iopub.status.idle": "2022-10-15T17:53:58.024191Z",
     "shell.execute_reply": "2022-10-15T17:53:58.023285Z",
     "shell.execute_reply.started": "2022-10-15T17:53:58.017416Z"
    }
   },
   "outputs": [],
   "source": [
    "label_columns = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "44dab086",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T17:53:58.027310Z",
     "iopub.status.busy": "2022-10-15T17:53:58.026525Z",
     "iopub.status.idle": "2022-10-15T17:53:58.038844Z",
     "shell.execute_reply": "2022-10-15T17:53:58.037882Z",
     "shell.execute_reply.started": "2022-10-15T17:53:58.027274Z"
    }
   },
   "outputs": [],
   "source": [
    "for i,name in enumerate(label_columns):\n",
    "\n",
    "    submit_df[name] = all_test_pred[:, i].cpu()\n",
    "    submit_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "eb47a32b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-10-15T17:53:58.041083Z",
     "iopub.status.busy": "2022-10-15T17:53:58.040403Z",
     "iopub.status.idle": "2022-10-15T17:53:58.926274Z",
     "shell.execute_reply": "2022-10-15T17:53:58.925148Z",
     "shell.execute_reply.started": "2022-10-15T17:53:58.041047Z"
    }
   },
   "outputs": [],
   "source": [
    "submit_df.to_csv('submission.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6f0979f-c7a1-4296-b039-4612c6567903",
   "metadata": {},
   "source": [
    "# Scores"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e6e2d43-548b-4c46-995d-ded3cd20b565",
   "metadata": {},
   "source": [
    "- Public leaderboard: 0.98515\n",
    "- Private leaderboard: 0.98511"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19d4b51b-d0a3-498b-a4fb-aa31c820c25b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}