{ "cells": [ { "cell_type": "markdown", "id": "785045d4-dabd-4d1c-a2bc-d05cca1a117f", "metadata": {}, "source": [ "# DistilBERT Multilabel" ] }, { "cell_type": "markdown", "id": "4d0b9b1d-aa4f-49ae-aefe-915573c8ad4f", "metadata": {}, "source": [ "A multi-label DistilBERT model fine-tuned on the Jigsaw toxic comment classification dataset:\n", "\n", "\n", "- https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge" ] }, { "cell_type": "code", "execution_count": 1, "id": "b09a9adf", "metadata": { "tags": [] }, "outputs": [], "source": [ "# !conda install watermark -c conda-forge --yes" ] }, { "cell_type": "code", "execution_count": 2, "id": "bdc3f297", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T13:52:31.759757Z", "iopub.status.busy": "2022-10-15T13:52:31.759405Z", "iopub.status.idle": "2022-10-15T13:52:32.540284Z", "shell.execute_reply": "2022-10-15T13:52:32.539260Z", "shell.execute_reply.started": "2022-10-15T13:52:31.759720Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch : 1.11.0\n", "transformers: 4.20.1\n", "pandas : 1.3.5\n", "tqdm : 4.64.0\n", "\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark -p torch,transformers,pandas,tqdm" ] }, { "cell_type": "code", "execution_count": 3, "id": "15688a18", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T13:52:32.542676Z", "iopub.status.busy": "2022-10-15T13:52:32.541816Z", "iopub.status.idle": "2022-10-15T13:52:33.718479Z", "shell.execute_reply": "2022-10-15T13:52:33.717348Z", "shell.execute_reply.started": "2022-10-15T13:52:32.542619Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import torch\n", "from tqdm import tqdm\n", "from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler\n", "from transformers import DistilBertTokenizer, DistilBertModel" ] }, { "cell_type": "markdown", "id": "c41c8600", "metadata": {}, "source": [ "# Config" ] }, { "cell_type": "code", "execution_count": 4, "id": "5c4059d4", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T13:52:33.722912Z", "iopub.status.busy": "2022-10-15T13:52:33.721403Z", "iopub.status.idle": "2022-10-15T13:52:33.754111Z", "shell.execute_reply": "2022-10-15T13:52:33.752925Z", "shell.execute_reply.started": "2022-10-15T13:52:33.722865Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cuda:0\n" ] } ], "source": [ "MAX_LEN = 512\n", "TRAIN_BATCH_SIZE = 16\n", "VALID_BATCH_SIZE = 16\n", "EPOCHS = 3\n", "LEARNING_RATE = 1e-05\n", "DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'\n", "print(DEVICE)" ] }, { "cell_type": "markdown", "id": "282bd787", "metadata": {}, "source": [ "# Load and Prepare Dataset" ] },
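{ "cell_type": "markdown", "metadata": {}, "source": [ "Each comment carries six binary labels (`toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, `identity_hate`), and a single comment can trigger several of them at once, which makes this a multi-label rather than a multi-class problem. The next cell collapses the six columns into one multi-hot `labels` vector per row; for example, a comment flagged as both `toxic` and `insult` becomes `[1, 0, 0, 0, 1, 0]`." ] },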
{ "cell_type": "code", "execution_count": 5, "id": "b19be326", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T13:52:33.755972Z", "iopub.status.busy": "2022-10-15T13:52:33.755623Z", "iopub.status.idle": "2022-10-15T13:52:37.816127Z", "shell.execute_reply": "2022-10-15T13:52:37.815166Z", "shell.execute_reply.started": "2022-10-15T13:52:33.755935Z" } }, "outputs": [ { "data": { "text/plain": [ " comment_text labels\n", "0 Explanation\\nWhy the edits made under my usern... [0, 0, 0, 0, 0, 0]\n", "1 D'aww! He matches this background colour I'm s... [0, 0, 0, 0, 0, 0]\n", "2 Hey man, I'm really not trying to edit war. It... [0, 0, 0, 0, 0, 0]\n", "3 \"\\nMore\\nI can't make any real suggestions on ... [0, 0, 0, 0, 0, 0]\n", "4 You, sir, are my hero. Any chance you remember... [0, 0, 0, 0, 0, 0]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')\n", "\n", "label_columns = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", "train_data['labels'] = train_data[label_columns].apply(lambda x: list(x), axis=1)\n", "\n", "train_data.drop(['id'], inplace=True, axis=1)\n", "train_data.drop(label_columns, inplace=True, axis=1)\n", "\n", "train_data.head()" ] }, { "cell_type": "code", "execution_count": 6, "id": "890c7fff", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T13:52:37.823176Z", "iopub.status.busy": "2022-10-15T13:52:37.820051Z", "iopub.status.idle": "2022-10-15T13:52:37.836971Z", "shell.execute_reply": "2022-10-15T13:52:37.835982Z", "shell.execute_reply.started": "2022-10-15T13:52:37.823136Z" } }, "outputs": [], "source": [ "class MultiLabelDataset(Dataset):\n", "\n", "    def __init__(self, dataframe, tokenizer, max_len, new_data=False):\n", "        self.tokenizer = tokenizer\n", "        self.data = dataframe\n", "        self.text = dataframe.comment_text\n", "        self.new_data = new_data\n", "\n", "        if not new_data:\n", "            self.targets = self.data.labels\n", "        self.max_len = max_len\n", "\n", "    def __len__(self):\n", "        return len(self.text)\n", "\n", "    def __getitem__(self, index):\n", "        # collapse all runs of whitespace so the tokenizer sees clean text\n", "        text = str(self.text[index])\n", "        text = \" \".join(text.split())\n", "\n", "        inputs = self.tokenizer.encode_plus(\n", "            text,\n", "            None,\n", "            add_special_tokens=True,\n", "            max_length=self.max_len,\n", "            padding='max_length',\n", "            truncation=True,\n", "            return_token_type_ids=True\n", "        )\n", "        ids = inputs['input_ids']\n", "        mask = inputs['attention_mask']\n", "        token_type_ids = inputs[\"token_type_ids\"]\n", "\n", "        out = {\n", "            'ids': torch.tensor(ids, dtype=torch.long),\n", "            'mask': torch.tensor(mask, dtype=torch.long),\n", "            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),\n", "        }\n", "\n", "        # test-time data has no labels\n", "        if not self.new_data:\n", "            out['targets'] = torch.tensor(self.targets[index], dtype=torch.float)\n", "\n", "        return out" ] }, { "cell_type": "code", "execution_count": 7, "id": "ed86ec0c", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T13:52:37.844663Z", "iopub.status.busy": "2022-10-15T13:52:37.841436Z", "iopub.status.idle": "2022-10-15T13:52:40.989366Z", "shell.execute_reply": "2022-10-15T13:52:40.988419Z", "shell.execute_reply.started": "2022-10-15T13:52:37.844608Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Orig Dataset: (159571, 2)\n", "Training Dataset: (159571, 2)\n", "Validation Dataset: (0, 2)\n" ] } ], "source": [] },
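{ "cell_type": "markdown", "metadata": {}, "source": [ "The inference cells further down refer to a `tokenizer`, loader settings `val_params`, and a trained `model`; the shape printout above comes from this stage of the pipeline. The next cell is a minimal sketch of that stage under stated assumptions, not the exact original code: the split logic, the classifier head, and the optimizer are assumptions, while the printed shapes, the six-label output, and the names `tokenizer`, `val_params`, and `model` are fixed by the surrounding cells." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Minimal sketch (assumptions flagged above): tokenizer, datasets, loaders,\n", "# a DistilBERT classifier with a six-logit head, and a BCE training loop.\n", "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n", "\n", "# frac=1.0 reproduces the printed shapes: every row trains, none held out\n", "train_df = train_data.sample(frac=1.0, random_state=200).reset_index(drop=True)\n", "val_df = train_data.drop(train_df.index).reset_index(drop=True)\n", "print(f'Orig Dataset: {train_data.shape}')\n", "print(f'Training Dataset: {train_df.shape}')\n", "print(f'Validation Dataset: {val_df.shape}')\n", "\n", "training_set = MultiLabelDataset(train_df, tokenizer, MAX_LEN)\n", "validation_set = MultiLabelDataset(val_df, tokenizer, MAX_LEN)\n", "\n", "# num_workers=8 matches the DataLoader warning recorded below\n", "train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 8}\n", "val_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': False, 'num_workers': 8}\n", "\n", "training_loader = DataLoader(training_set, **train_params)\n", "validation_loader = DataLoader(validation_set, **val_params)\n", "\n", "\n", "class DistilBERTClass(torch.nn.Module):\n", "    def __init__(self):\n", "        super().__init__()\n", "        self.l1 = DistilBertModel.from_pretrained('distilbert-base-uncased')\n", "        self.pre_classifier = torch.nn.Linear(768, 768)\n", "        self.dropout = torch.nn.Dropout(0.1)\n", "        self.classifier = torch.nn.Linear(768, 6)  # one logit per toxicity label\n", "\n", "    def forward(self, input_ids, attention_mask, token_type_ids=None):\n", "        # DistilBERT has no segment embeddings; token_type_ids is accepted\n", "        # (and ignored) only so the loops below can keep their call signature\n", "        hidden_state = self.l1(input_ids=input_ids, attention_mask=attention_mask)[0]\n", "        pooled = torch.nn.functional.relu(self.pre_classifier(hidden_state[:, 0]))\n", "        return self.classifier(self.dropout(pooled))\n", "\n", "\n", "model = DistilBERTClass().to(DEVICE)\n", "optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)\n", "loss_fn = torch.nn.BCEWithLogitsLoss()  # sigmoid + binary cross-entropy per label\n", "\n", "\n", "def train():\n", "    model.train()\n", "    for _, data in tqdm(enumerate(training_loader, 0)):\n", "        ids = data['ids'].to(DEVICE, dtype=torch.long)\n", "        mask = data['mask'].to(DEVICE, dtype=torch.long)\n", "        token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)\n", "        targets = data['targets'].to(DEVICE, dtype=torch.float)\n", "\n", "        optimizer.zero_grad()\n", "        outputs = model(ids, mask, token_type_ids)\n", "        loss = loss_fn(outputs, targets)\n", "        loss.backward()\n", "        optimizer.step()\n", "\n", "\n", "for _ in range(EPOCHS):\n", "    train()" ] },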
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " id comment_text\n", "0 00001cee341fdb12 Yo bitch Ja Rule is more succesful then you'll...\n", "1 0000247867823ef7 == From RfC == \\n\\n The title is fine as it is...\n", "2 00013b17ad220c46 \" \\n\\n == Sources == \\n\\n * Zawe Ashton on Lap...\n", "3 00017563c3f7919a :If you have a look back at the source, the in...\n", "4 00017695ad8997eb I don't anonymously edit articles at all." ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')\n", "test_data.head()" ] }, { "cell_type": "code", "execution_count": 15, "id": "b2643443", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T17:32:34.756605Z", "iopub.status.busy": "2022-10-15T17:32:34.756231Z", "iopub.status.idle": "2022-10-15T17:32:34.763296Z", "shell.execute_reply": "2022-10-15T17:32:34.762149Z", "shell.execute_reply.started": "2022-10-15T17:32:34.756567Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py:490: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", " cpuset_checked))\n" ] } ], "source": [ "test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN, new_data=True)\n", "test_loader = DataLoader(test_set, **val_params)" ] }, { "cell_type": "code", "execution_count": 16, "id": "52d65222", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T17:32:37.165660Z", "iopub.status.busy": "2022-10-15T17:32:37.164342Z", "iopub.status.idle": "2022-10-15T17:53:57.945099Z", "shell.execute_reply": "2022-10-15T17:53:57.943866Z", "shell.execute_reply.started": "2022-10-15T17:32:37.165598Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2307: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", " FutureWarning,\n", "9573it [21:20, 7.48it/s]\n" ] } ], "source": [ "all_test_pred = []\n", "\n", "\n", "def test():\n", "    model.eval()\n", "\n", "    with torch.inference_mode():\n", "        for _, data in tqdm(enumerate(test_loader, 0)):\n", "            ids = data['ids'].to(DEVICE, dtype=torch.long)\n", "            mask = data['mask'].to(DEVICE, dtype=torch.long)\n", "            token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)\n", "\n", "            outputs = model(ids, mask, token_type_ids)\n", "            # sigmoid turns each logit into an independent per-label probability\n", "            probas = torch.sigmoid(outputs)\n", "            all_test_pred.append(probas)\n", "\n", "\n", "test()" ] },
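{ "cell_type": "markdown", "metadata": {}, "source": [ "The competition is scored with column-wise ROC AUC, so the raw sigmoid probabilities are what the submission file keeps. For a quick qualitative look, the hypothetical cell below thresholds them at 0.5; that cutoff is an assumption, not a tuned value." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hypothetical sanity check, not used for the submission: binarize the\n", "# per-label probabilities at an assumed 0.5 cutoff (all_test_pred is\n", "# still a list of per-batch tensors at this point, hence the cat)\n", "(torch.cat(all_test_pred) > 0.5).long()[:5]" ] },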
{ "cell_type": "code", "execution_count": 17, "id": "6b484e66", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T17:53:57.948511Z", "iopub.status.busy": "2022-10-15T17:53:57.947874Z", "iopub.status.idle": "2022-10-15T17:53:57.970095Z", "shell.execute_reply": "2022-10-15T17:53:57.969162Z", "shell.execute_reply.started": "2022-10-15T17:53:57.948460Z" } }, "outputs": [], "source": [ "all_test_pred = torch.cat(all_test_pred)" ] }, { "cell_type": "code", "execution_count": 18, "id": "c5f590c3", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T17:53:57.973188Z", "iopub.status.busy": "2022-10-15T17:53:57.972789Z", "iopub.status.idle": "2022-10-15T17:53:58.014544Z", "shell.execute_reply": "2022-10-15T17:53:58.013540Z", "shell.execute_reply.started": "2022-10-15T17:53:57.973160Z" } }, "outputs": [], "source": [ "submit_df = test_data.copy()\n", "submit_df.drop(\"comment_text\", inplace=True, axis=1)" ] }, { "cell_type": "code", "execution_count": 19, "id": "fa28e0d7", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T17:53:58.017452Z", "iopub.status.busy": "2022-10-15T17:53:58.016993Z", "iopub.status.idle": "2022-10-15T17:53:58.024191Z", "shell.execute_reply": "2022-10-15T17:53:58.023285Z", "shell.execute_reply.started": "2022-10-15T17:53:58.017416Z" } }, "outputs": [], "source": [ "label_columns = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]" ] }, { "cell_type": "code", "execution_count": 20, "id": "44dab086", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T17:53:58.027310Z", "iopub.status.busy": "2022-10-15T17:53:58.026525Z", "iopub.status.idle": "2022-10-15T17:53:58.038844Z", "shell.execute_reply": "2022-10-15T17:53:58.037882Z", "shell.execute_reply.started": "2022-10-15T17:53:58.027274Z" } }, "outputs": [], "source": [ "# one probability column per label, in the order the competition expects\n", "for i, name in enumerate(label_columns):\n", "    submit_df[name] = all_test_pred[:, i].cpu()" ] }, { "cell_type": "code", "execution_count": 21, "id": "eb47a32b", "metadata": { "execution": { "iopub.execute_input": "2022-10-15T17:53:58.041083Z", "iopub.status.busy": "2022-10-15T17:53:58.040403Z", "iopub.status.idle": "2022-10-15T17:53:58.926274Z", "shell.execute_reply": "2022-10-15T17:53:58.925148Z", "shell.execute_reply.started": "2022-10-15T17:53:58.041047Z" } }, "outputs": [], "source": [ "submit_df.to_csv('submission.csv', index=False)" ] }, { "cell_type": "markdown", "id": "e6f0979f-c7a1-4296-b039-4612c6567903", "metadata": {}, "source": [ "# Scores" ] }, { "cell_type": "markdown", "id": "7e6e2d43-548b-4c46-995d-ded3cd20b565", "metadata": {}, "source": [ "- Public leaderboard: 0.98515\n", "- Private leaderboard: 0.98511" ] }, { "cell_type": "code", "execution_count": null, "id": "19d4b51b-d0a3-498b-a4fb-aa31c820c25b", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name":
"Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }