{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# एक पूर्ण प्रशिक्षण" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]\n", "!pip install accelerate\n", "# To run the training on TPU, you will need to uncomment the following line:\n", "# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "from transformers import AutoTokenizer, DataCollatorWithPadding\n", "\n", "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n", "checkpoint = \"bert-base-uncased\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "\n", "\n", "def tokenize_function(example):\n", " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n", "\n", "\n", "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenized_datasets = tokenized_datasets.remove_columns([\"sentence1\", \"sentence2\", \"idx\"])\n", "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")\n", "tokenized_datasets.set_format(\"torch\")\n", "tokenized_datasets[\"train\"].column_names" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "[\"attention_mask\", \"input_ids\", \"labels\", \"token_type_ids\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import DataLoader\n", "\n", "train_dataloader = DataLoader(\n", " tokenized_datasets[\"train\"], shuffle=True, batch_size=8, collate_fn=data_collator\n", ")\n", "eval_dataloader = DataLoader(\n", " tokenized_datasets[\"validation\"], batch_size=8, collate_fn=data_collator\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'attention_mask': torch.Size([8, 65]),\n", " 'input_ids': torch.Size([8, 65]),\n", " 'labels': torch.Size([8]),\n", " 'token_type_ids': torch.Size([8, 65])}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for batch in train_dataloader:\n", " break\n", "{k: v.shape for k, v in batch.items()}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModelForSequenceClassification\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor(0.5441, grad_fn=) torch.Size([8, 2])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outputs = model(**batch)\n", "print(outputs.loss, outputs.logits.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AdamW\n", "\n", "optimizer = AdamW(model.parameters(), lr=5e-5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "1377" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import get_scheduler\n", "\n", "num_epochs = 3\n", "num_training_steps = num_epochs * len(train_dataloader)\n", "lr_scheduler = get_scheduler(\n", " \"linear\",\n", " optimizer=optimizer,\n", " num_warmup_steps=0,\n", " num_training_steps=num_training_steps,\n", ")\n", "print(num_training_steps)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "device(type='cuda')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "\n", "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", "model.to(device)\n", "device" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from tqdm.auto import tqdm\n", "\n", "progress_bar = tqdm(range(num_training_steps))\n", "\n", "model.train()\n", "for epoch in range(num_epochs):\n", " for batch in train_dataloader:\n", " batch = {k: v.to(device) for k, v in batch.items()}\n", " outputs = model(**batch)\n", " loss = outputs.loss\n", " loss.backward()\n", "\n", " optimizer.step()\n", " lr_scheduler.step()\n", " optimizer.zero_grad()\n", " progress_bar.update(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'accuracy': 0.8431372549019608, 'f1': 0.8907849829351535}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import evaluate\n", "\n", "metric = evaluate.load(\"glue\", \"mrpc\")\n", "model.eval()\n", "for batch in eval_dataloader:\n", " batch = {k: v.to(device) for k, v in batch.items()}\n", " with torch.no_grad():\n", " outputs = model(**batch)\n", "\n", " logits = outputs.logits\n", " predictions = torch.argmax(logits, dim=-1)\n", " metric.add_batch(predictions=predictions, references=batch[\"labels\"])\n", "\n", "metric.compute()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n", "optimizer = AdamW(model.parameters(), lr=3e-5)\n", "\n", "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", "model.to(device)\n", "\n", "num_epochs = 3\n", "num_training_steps = num_epochs * len(train_dataloader)\n", "lr_scheduler = get_scheduler(\n", " \"linear\",\n", " optimizer=optimizer,\n", " num_warmup_steps=0,\n", " num_training_steps=num_training_steps,\n", ")\n", "\n", "progress_bar = tqdm(range(num_training_steps))\n", "\n", "model.train()\n", "for epoch in range(num_epochs):\n", " for batch in train_dataloader:\n", " batch = {k: v.to(device) for k, v in batch.items()}\n", " outputs = model(**batch)\n", " loss = outputs.loss\n", " loss.backward()\n", "\n", " optimizer.step()\n", " lr_scheduler.step()\n", " optimizer.zero_grad()\n", " progress_bar.update(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from accelerate import Accelerator\n", "from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler\n", "\n", "accelerator = Accelerator()\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n", "optimizer = AdamW(model.parameters(), lr=3e-5)\n", 
"\n", "train_dl, eval_dl, model, optimizer = accelerator.prepare(\n", " train_dataloader, eval_dataloader, model, optimizer\n", ")\n", "\n", "num_epochs = 3\n", "num_training_steps = num_epochs * len(train_dl)\n", "lr_scheduler = get_scheduler(\n", " \"linear\",\n", " optimizer=optimizer,\n", " num_warmup_steps=0,\n", " num_training_steps=num_training_steps,\n", ")\n", "\n", "progress_bar = tqdm(range(num_training_steps))\n", "\n", "model.train()\n", "for epoch in range(num_epochs):\n", " for batch in train_dl:\n", " outputs = model(**batch)\n", " loss = outputs.loss\n", " accelerator.backward(loss)\n", "\n", " optimizer.step()\n", " lr_scheduler.step()\n", " optimizer.zero_grad()\n", " progress_bar.update(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from accelerate import notebook_launcher\n", "\n", "notebook_launcher(training_function)" ] } ], "metadata": { "colab": { "name": "एक पूर्ण प्रशिक्षण", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }