{ "cells": [ { "cell_type": "code", "execution_count": 3, "id": "aa95efb4-71eb-4f3d-8799-c12831385900", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import torch\n", "from torch import nn, optim\n", "from torch.utils.data import DataLoader, Dataset\n", "from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup\n", "from sklearn.metrics import accuracy_score, f1_score, classification_report\n", "from sklearn.model_selection import StratifiedKFold, train_test_split\n", "import random\n", "import numpy as np\n", "import statistics" ] }, { "cell_type": "code", "execution_count": 4, "id": "3e6b7a75-f6d2-4fd7-968c-b16f9832d1ca", "metadata": {}, "outputs": [], "source": [ "#Mudanças principais:\n", "#1400\n", "#Modelo Bertimbau Large: Alterado o model_name para 'neuralmind/bert-large-portuguese-cased'.\n", "\n", "#LR= 3e-5.\n", "\n", "#Descongelamento das camadas: Parametrizamos o número de camadas finais do BERT a descongelar, via unfreeze_layers. Por exemplo, se definirmos unfreeze_layers=8, descongelamos as últimas 8 camadas.\n", "\n", "#Outros otimizadores e LR Schedulers: Mantemos o AdamW como otimizador principal, mas agora adicionamos um scheduler (get_linear_schedule_with_warmup do transformers) para ajustar a taxa de aprendizado durante o treino. Caso queira testar outro otimizador, basta substituir a linha do optimizador. Também deixamos comentado outro exemplo (SGD) para referência.\n", "#Para testar diferentes taxas de aprendizado, basta alterar learning_rate no código.\n", "#Para testar diferentes números de camadas a descongelar, altere unfreeze_layers.\n", "\n", "#4\n", "#processo de treinamento e avaliação várias vezes (uma para cada fold).\n", "#diminuindo épocas ou early stopping, se necessário.\n", "#O early stopping agora é feito com base no conjunto de validação interno a cada fold.\n", "#Esse processo é mais demorado, pois treinaremos o modelo K vezes.\n", "#Ajuste parâmetros (como número de épocas, taxa de aprendizado, etc.) conforme necessário." 
] }, { "cell_type": "code", "execution_count": 5, "id": "e375e916-07a1-44f1-9675-8fb7eb8045f1", "metadata": {}, "outputs": [], "source": [ "# Semente para reprodutibilidade\n", "seed = 42\n", "random.seed(seed)\n", "np.random.seed(seed)\n", "torch.manual_seed(seed)\n", "if torch.cuda.is_available():\n", " torch.cuda.manual_seed_all(seed)" ] }, { "cell_type": "code", "execution_count": 6, "id": "1b511a6a-4c56-4335-8c15-660d3d146399", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Usando dispositivo: cuda\n" ] } ], "source": [ "# Configurações gerais\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "print(f'Usando dispositivo: {device}')\n", "\n", "model_name = 'neuralmind/bert-large-portuguese-cased' \n", "learning_rate = 3e-5\n", "unfreeze_layers = 4\n", "nclasses = 2\n", "nepochs = 5\n", "batch_size = 8\n", "batch_status = 32\n", "early_stop = 2\n", "max_length = 360\n", "write_path = 'model_cv'\n", "\n", "if not os.path.exists(write_path):\n", " os.makedirs(write_path)" ] }, { "cell_type": "code", "execution_count": 7, "id": "3a0a71c0-667a-421f-b324-fa857999aa6f", "metadata": {}, "outputs": [], "source": [ "# Carregar os dados\n", "data = pd.read_csv(\"DATAFRAME1400.csv\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "ff7f8d91-5750-45f3-9a75-1240e5401757", "metadata": {}, "outputs": [], "source": [ "# Dataset Customizado\n", "class CustomDataset(Dataset):\n", " def __init__(self, data, tokenizer, max_length):\n", " self.data = data.reset_index(drop=True)\n", " self.tokenizer = tokenizer\n", " self.max_length = max_length\n", "\n", " def __len__(self):\n", " return len(self.data)\n", "\n", " def __getitem__(self, idx):\n", " text = self.data.iloc[idx]['text']\n", " label = self.data.iloc[idx]['contra']\n", " inputs = self.tokenizer(text, return_tensors='pt',\n", " padding='max_length', truncation=True,\n", " max_length=self.max_length)\n", " return {key: val.squeeze(0) for key, val in inputs.items()}, torch.tensor(label)\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "ac08faef-e9c9-496f-b850-6295f892c4e6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "=== Fold 1/5 ===\n", "Epoch: 0 [32/135]\tLoss: 0.943423\n", "Epoch: 0 [64/135]\tLoss: 0.659408\n", "Epoch: 0 [96/135]\tLoss: 0.393854\n", "Epoch: 0 [128/135]\tLoss: 0.469751\n", "Epoch 0 - Val F1: 0.8000, Val Accuracy: 0.8000\n", "Novo melhor modelo salvo.\n", "Epoch: 1 [32/135]\tLoss: 0.371063\n", "Epoch: 1 [64/135]\tLoss: 0.466771\n", "Epoch: 1 [96/135]\tLoss: 0.271778\n", "Epoch: 1 [128/135]\tLoss: 0.209255\n", "Epoch 1 - Val F1: 0.8300, Val Accuracy: 0.8333\n", "Novo melhor modelo salvo.\n", "Epoch: 2 [32/135]\tLoss: 0.078391\n", "Epoch: 2 [64/135]\tLoss: 0.239022\n", "Epoch: 2 [96/135]\tLoss: 0.283219\n", "Epoch: 2 [128/135]\tLoss: 0.061773\n", "Epoch 2 - Val F1: 0.8483, Val Accuracy: 0.8500\n", "Novo melhor modelo salvo.\n", "Epoch: 3 [32/135]\tLoss: 0.081003\n", "Epoch: 3 [64/135]\tLoss: 0.289715\n", "Epoch: 3 [96/135]\tLoss: 0.047254\n", "Epoch: 3 [128/135]\tLoss: 0.025544\n", "Epoch 3 - Val F1: 0.8418, Val Accuracy: 0.8417\n", "Epoch: 4 [32/135]\tLoss: 0.096558\n", "Epoch: 4 [64/135]\tLoss: 0.023162\n", "Epoch: 4 [96/135]\tLoss: 0.028982\n", "Epoch: 4 [128/135]\tLoss: 0.015242\n", "Epoch 4 - Val F1: 0.8663, Val Accuracy: 0.8667\n", "Novo melhor modelo salvo.\n", "Desempenho no conjunto de teste desta dobra:\n", " precision recall f1-score support\n", "\n", " 0 0.90 0.93 0.91 157\n", " 1 
{ "cell_type": "code", "execution_count": 9, "id": "ac08faef-e9c9-496f-b850-6295f892c4e6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
"\n",
"=== Fold 1/5 ===\n",
"Epoch: 0 [32/135]\tLoss: 0.943423\n",
"Epoch: 0 [64/135]\tLoss: 0.659408\n",
"Epoch: 0 [96/135]\tLoss: 0.393854\n",
"Epoch: 0 [128/135]\tLoss: 0.469751\n",
"Epoch 0 - Val F1: 0.8000, Val Accuracy: 0.8000\n",
"New best model saved.\n",
"Epoch: 1 [32/135]\tLoss: 0.371063\n",
"Epoch: 1 [64/135]\tLoss: 0.466771\n",
"Epoch: 1 [96/135]\tLoss: 0.271778\n",
"Epoch: 1 [128/135]\tLoss: 0.209255\n",
"Epoch 1 - Val F1: 0.8300, Val Accuracy: 0.8333\n",
"New best model saved.\n",
"Epoch: 2 [32/135]\tLoss: 0.078391\n",
"Epoch: 2 [64/135]\tLoss: 0.239022\n",
"Epoch: 2 [96/135]\tLoss: 0.283219\n",
"Epoch: 2 [128/135]\tLoss: 0.061773\n",
"Epoch 2 - Val F1: 0.8483, Val Accuracy: 0.8500\n",
"New best model saved.\n",
"Epoch: 3 [32/135]\tLoss: 0.081003\n",
"Epoch: 3 [64/135]\tLoss: 0.289715\n",
"Epoch: 3 [96/135]\tLoss: 0.047254\n",
"Epoch: 3 [128/135]\tLoss: 0.025544\n",
"Epoch 3 - Val F1: 0.8418, Val Accuracy: 0.8417\n",
"Epoch: 4 [32/135]\tLoss: 0.096558\n",
"Epoch: 4 [64/135]\tLoss: 0.023162\n",
"Epoch: 4 [96/135]\tLoss: 0.028982\n",
"Epoch: 4 [128/135]\tLoss: 0.015242\n",
"Epoch 4 - Val F1: 0.8663, Val Accuracy: 0.8667\n",
"New best model saved.\n",
"Test-set performance for this fold:\n",
"              precision    recall  f1-score   support\n",
"\n",
"           0       0.90      0.93      0.91       157\n",
"           1       0.92      0.88      0.90       143\n",
"\n",
"    accuracy                           0.91       300\n",
"   macro avg       0.91      0.91      0.91       300\n",
"weighted avg       0.91      0.91      0.91       300\n",
"\n",
"F1 (test): 0.9065, Accuracy (test): 0.9067\n",
"\n",
"=== Fold 2/5 ===\n",
"Epoch: 0 [32/135]\tLoss: 0.750314\n",
"Epoch: 0 [64/135]\tLoss: 0.607893\n",
"Epoch: 0 [96/135]\tLoss: 0.476860\n",
"Epoch: 0 [128/135]\tLoss: 0.436566\n",
"Epoch 0 - Val F1: 0.8496, Val Accuracy: 0.8500\n",
"New best model saved.\n",
"Epoch: 1 [32/135]\tLoss: 0.266623\n",
"Epoch: 1 [64/135]\tLoss: 0.167054\n",
"Epoch: 1 [96/135]\tLoss: 0.574422\n",
"Epoch: 1 [128/135]\tLoss: 0.136387\n",
"Epoch 1 - Val F1: 0.8834, Val Accuracy: 0.8833\n",
"New best model saved.\n",
"Epoch: 2 [32/135]\tLoss: 0.054383\n",
"Epoch: 2 [64/135]\tLoss: 0.190985\n",
"Epoch: 2 [96/135]\tLoss: 0.210877\n",
"Epoch: 2 [128/135]\tLoss: 0.279167\n",
"Epoch 2 - Val F1: 0.8833, Val Accuracy: 0.8833\n",
"Epoch: 3 [32/135]\tLoss: 0.319686\n",
"Epoch: 3 [64/135]\tLoss: 0.331613\n",
"Epoch: 3 [96/135]\tLoss: 0.031250\n",
"Epoch: 3 [128/135]\tLoss: 0.026889\n",
"Epoch 3 - Val F1: 0.8917, Val Accuracy: 0.8917\n",
"New best model saved.\n",
"Epoch: 4 [32/135]\tLoss: 0.051201\n",
"Epoch: 4 [64/135]\tLoss: 0.115116\n",
"Epoch: 4 [96/135]\tLoss: 0.014849\n",
"Epoch: 4 [128/135]\tLoss: 0.025389\n",
"Epoch 4 - Val F1: 0.8830, Val Accuracy: 0.8833\n",
"Test-set performance for this fold:\n",
"              precision    recall  f1-score   support\n",
"\n",
"           0       0.90      0.88      0.89       156\n",
"           1       0.87      0.90      0.88       143\n",
"\n",
"    accuracy                           0.89       299\n",
"   macro avg       0.89      0.89      0.89       299\n",
"weighted avg       0.89      0.89      0.89       299\n",
"\n",
"F1 (test): 0.8863, Accuracy (test): 0.8863\n",
"\n",
"=== Fold 3/5 ===\n",
"Epoch: 0 [32/135]\tLoss: 0.830914\n",
"Epoch: 0 [64/135]\tLoss: 0.683001\n",
"Epoch: 0 [96/135]\tLoss: 0.577303\n",
"Epoch: 0 [128/135]\tLoss: 0.676515\n",
"Epoch 0 - Val F1: 0.8501, Val Accuracy: 0.8500\n",
"New best model saved.\n",
"Epoch: 1 [32/135]\tLoss: 0.210992\n",
"Epoch: 1 [64/135]\tLoss: 0.217102\n",
"Epoch: 1 [96/135]\tLoss: 0.643290\n",
"Epoch: 1 [128/135]\tLoss: 0.511327\n",
"Epoch 1 - Val F1: 0.8748, Val Accuracy: 0.8750\n",
"New best model saved.\n",
"Epoch: 2 [32/135]\tLoss: 0.392438\n",
"Epoch: 2 [64/135]\tLoss: 0.138488\n",
"Epoch: 2 [96/135]\tLoss: 0.379189\n",
"Epoch: 2 [128/135]\tLoss: 0.057960\n",
"Epoch 2 - Val F1: 0.9250, Val Accuracy: 0.9250\n",
"New best model saved.\n",
"Epoch: 3 [32/135]\tLoss: 0.094107\n",
"Epoch: 3 [64/135]\tLoss: 0.095579\n",
"Epoch: 3 [96/135]\tLoss: 0.070371\n",
"Epoch: 3 [128/135]\tLoss: 0.038925\n",
"Epoch 3 - Val F1: 0.9250, Val Accuracy: 0.9250\n",
"Epoch: 4 [32/135]\tLoss: 0.054453\n",
"Epoch: 4 [64/135]\tLoss: 0.027761\n",
"Epoch: 4 [96/135]\tLoss: 0.014768\n",
"Epoch: 4 [128/135]\tLoss: 0.069529\n",
"Epoch 4 - Val F1: 0.9333, Val Accuracy: 0.9333\n",
"New best model saved.\n",
"Test-set performance for this fold:\n",
"              precision    recall  f1-score   support\n",
"\n",
"           0       0.91      0.86      0.89       157\n",
"           1       0.85      0.91      0.88       142\n",
"\n",
"    accuracy                           0.88       299\n",
"   macro avg       0.88      0.88      0.88       299\n",
"weighted avg       0.88      0.88      0.88       299\n",
"\n",
"F1 (test): 0.8830, Accuracy (test): 0.8829\n",
"\n",
"=== Fold 4/5 ===\n",
"Epoch: 0 [32/135]\tLoss: 0.743773\n",
"Epoch: 0 [64/135]\tLoss: 0.690046\n",
"Epoch: 0 [96/135]\tLoss: 0.623647\n",
"Epoch: 0 [128/135]\tLoss: 0.497060\n",
"Epoch 0 - Val F1: 0.8583, Val Accuracy: 0.8583\n",
"New best model saved.\n",
"Epoch: 1 [32/135]\tLoss: 0.449734\n",
"Epoch: 1 [64/135]\tLoss: 0.170243\n",
"Epoch: 1 [96/135]\tLoss: 0.075471\n",
"Epoch: 1 [128/135]\tLoss: 0.299446\n",
"Epoch 1 - Val F1: 0.8416, Val Accuracy: 0.8417\n",
"Epoch: 2 [32/135]\tLoss: 0.488727\n",
"Epoch: 2 [64/135]\tLoss: 0.095850\n",
"Epoch: 2 [96/135]\tLoss: 0.157705\n",
"Epoch: 2 [128/135]\tLoss: 0.141444\n",
"Epoch 2 - Val F1: 0.8416, Val Accuracy: 0.8417\n",
"Early stopping triggered.\n",
"Test-set performance for this fold:\n",
"              precision    recall  f1-score   support\n",
"\n",
"           0       0.81      0.91      0.86       157\n",
"           1       0.89      0.77      0.82       142\n",
"\n",
"    accuracy                           0.84       299\n",
"   macro avg       0.85      0.84      0.84       299\n",
"weighted avg       0.85      0.84      0.84       299\n",
"\n",
"F1 (test): 0.8417, Accuracy (test): 0.8428\n",
"\n",
"=== Fold 5/5 ===\n",
"Epoch: 0 [32/135]\tLoss: 0.598468\n",
"Epoch: 0 [64/135]\tLoss: 0.593075\n",
"Epoch: 0 [96/135]\tLoss: 0.494841\n",
"Epoch: 0 [128/135]\tLoss: 0.455896\n",
"Epoch 0 - Val F1: 0.8500, Val Accuracy: 0.8500\n",
"New best model saved.\n",
"Epoch: 1 [32/135]\tLoss: 0.359274\n",
"Epoch: 1 [64/135]\tLoss: 0.157859\n",
"Epoch: 1 [96/135]\tLoss: 0.085530\n",
"Epoch: 1 [128/135]\tLoss: 0.029607\n",
"Epoch 1 - Val F1: 0.8748, Val Accuracy: 0.8750\n",
"New best model saved.\n",
"Epoch: 2 [32/135]\tLoss: 0.119052\n",
"Epoch: 2 [64/135]\tLoss: 0.519205\n",
"Epoch: 2 [96/135]\tLoss: 0.403141\n",
"Epoch: 2 [128/135]\tLoss: 0.155865\n",
"Epoch 2 - Val F1: 0.9000, Val Accuracy: 0.9000\n",
"New best model saved.\n",
"Epoch: 3 [32/135]\tLoss: 0.095529\n",
"Epoch: 3 [64/135]\tLoss: 0.024375\n",
"Epoch: 3 [96/135]\tLoss: 0.047616\n",
"Epoch: 3 [128/135]\tLoss: 0.028527\n",
"Epoch 3 - Val F1: 0.9250, Val Accuracy: 0.9250\n",
"New best model saved.\n",
"Epoch: 4 [32/135]\tLoss: 0.019617\n",
"Epoch: 4 [64/135]\tLoss: 0.032093\n",
"Epoch: 4 [96/135]\tLoss: 0.128916\n",
"Epoch: 4 [128/135]\tLoss: 0.042773\n",
"Epoch 4 - Val F1: 0.9084, Val Accuracy: 0.9083\n",
"Test-set performance for this fold:\n",
"              precision    recall  f1-score   support\n",
"\n",
"           0       0.88      0.87      0.88       157\n",
"           1       0.86      0.87      0.86       142\n",
"\n",
"    accuracy                           0.87       299\n",
"   macro avg       0.87      0.87      0.87       299\n",
"weighted avg       0.87      0.87      0.87       299\n",
"\n",
"F1 (test): 0.8696, Accuracy (test): 0.8696\n",
"\n",
"=== Cross-Validation Average Results ===\n",
"Mean F1: 0.8774 (+/- 0.0214)\n",
"Mean accuracy: 0.8777 (+/- 0.0211)\n"
] } ], "source": [
"# Model: BERT encoder with dropout and a linear classification head\n",
"class CustomBERTModel(nn.Module):\n",
"    def __init__(self, model_name, nclasses, unfreeze_layers):\n",
"        super(CustomBERTModel, self).__init__()\n",
"        self.bert = AutoModel.from_pretrained(model_name)\n",
"        self.dropout = nn.Dropout(0.3)\n",
"        self.classifier = nn.Linear(self.bert.config.hidden_size, nclasses)\n",
"\n",
"        # Freeze everything initially\n",
"        for param in self.bert.parameters():\n",
"            param.requires_grad = False\n",
"\n",
"        # Unfreeze the last unfreeze_layers layers\n",
"        if unfreeze_layers > 0:\n",
"            for param in self.bert.encoder.layer[-unfreeze_layers:].parameters():\n",
"                param.requires_grad = True\n",
"\n",
"    def forward(self, input_ids, attention_mask, token_type_ids=None):\n",
"        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)\n",
"        pooled_output = outputs.pooler_output\n",
"        dropped_out = self.dropout(pooled_output)\n",
"        logits = self.classifier(dropped_out)\n",
"        return logits\n",
"\n",
"def evaluate(model, dataloader):\n",
"    model.eval()\n",
"    y_real, y_pred = [], []\n",
"    with torch.no_grad():\n",
"        for inputs, labels in dataloader:\n",
"            inputs = {key: val.to(device) for key, val in inputs.items()}\n",
"            labels = labels.to(device)\n",
"            logits = model(**inputs)\n",
"            pred_labels = torch.argmax(logits, 1)\n",
"\n",
"            y_real.extend(labels.cpu().tolist())\n",
"            y_pred.extend(pred_labels.cpu().tolist())\n",
"\n",
"    f1 = f1_score(y_real, y_pred, average='weighted')\n",
"    acc = accuracy_score(y_real, y_pred)\n",
"    return f1, acc, (y_real, y_pred)\n",
"\n",
"\n",
"# Cross-validation\n",
"k = 5\n",
"skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)\n",
"X = data.index.values\n",
"y = data['contra'].values\n",
"\n",
"f1_scores = []\n",
"acc_scores = []\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)\n",
"\n",
"fold_num = 1\n",
"for train_val_idx, test_idx in skf.split(X, y):\n",
"    print(f\"\\n=== Fold {fold_num}/{k} ===\")\n",
"\n",
"    # Hold out the test fold\n",
"    test_data = data.iloc[test_idx]\n",
"\n",
"    # Split train_val_idx into train and validation sets\n",
"    train_val_data = data.iloc[train_val_idx]\n",
"    train_data, val_data = train_test_split(train_val_data,\n",
"                                            test_size=0.1,\n",
"                                            random_state=seed,\n",
"                                            stratify=train_val_data['contra'])\n",
"\n",
"    # Create datasets and dataloaders\n",
"    train_dataset = CustomDataset(train_data, tokenizer, max_length)\n",
"    val_dataset = CustomDataset(val_data, tokenizer, max_length)\n",
"    test_dataset = CustomDataset(test_data, tokenizer, max_length)\n",
"\n",
"    traindata = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n",
"    valdata = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)\n",
"    testdata = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)\n",
"\n",
"    model = CustomBERTModel(model_name, nclasses, unfreeze_layers).to(device)\n",
"    optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)\n",
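"    # To try a different optimizer, swap the line above. The SGD example mentioned\n",
"    # in the overview cell is left commented here for reference (the momentum value\n",
"    # is an illustrative assumption, not part of the tuned setup):\n",
"    # optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),\n",
"    #                       lr=learning_rate, momentum=0.9)\n",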
"\n",
"    loss_fn = nn.CrossEntropyLoss()\n",
"\n",
"    total_steps = len(traindata) * nepochs\n",
"    scheduler = get_linear_schedule_with_warmup(optimizer,\n",
"                                                num_warmup_steps=int(0.1 * total_steps),\n",
"                                                num_training_steps=total_steps)\n",
"\n",
"    max_f1, repeat = 0, 0\n",
"    best_model_path = os.path.join(write_path, f'best_model_fold{fold_num}.pth')\n",
"\n",
"    for epoch in range(nepochs):\n",
"        model.train()\n",
"        losses = []\n",
"        for batch_idx, (inputs, labels) in enumerate(traindata):\n",
"            inputs = {key: val.to(device) for key, val in inputs.items()}\n",
"            labels = labels.to(device)\n",
"\n",
"            logits = model(**inputs)\n",
"            loss = loss_fn(logits, labels)\n",
"            losses.append(float(loss))\n",
"\n",
"            # Backprop\n",
"            loss.backward()\n",
"            optimizer.step()\n",
"            scheduler.step()\n",
"            optimizer.zero_grad()\n",
"\n",
"            if (batch_idx + 1) % batch_status == 0:\n",
"                print(f'Epoch: {epoch} [{batch_idx + 1}/{len(traindata)}]\\tLoss: {loss:.6f}')\n",
"\n",
"        f1_val, acc_val, _ = evaluate(model, valdata)\n",
"        print(f'Epoch {epoch} - Val F1: {f1_val:.4f}, Val Accuracy: {acc_val:.4f}')\n",
"\n",
"        if f1_val > max_f1:\n",
"            torch.save(model.state_dict(), best_model_path)\n",
"            max_f1 = f1_val\n",
"            repeat = 0\n",
"            print('New best model saved.')\n",
"        else:\n",
"            repeat += 1\n",
"\n",
"        if repeat == early_stop:\n",
"            print('Early stopping triggered.')\n",
"            break\n",
"\n",
"    # Evaluate on the test fold with the best checkpoint\n",
"    state_dict = torch.load(best_model_path, weights_only=True)\n",
"    model.load_state_dict(state_dict)\n",
"    f1_test, acc_test, (y_real, y_pred) = evaluate(model, testdata)\n",
"\n",
"    print(\"Test-set performance for this fold:\")\n",
"    print(classification_report(y_real, y_pred, target_names=['0', '1']))\n",
"    print(f\"F1 (test): {f1_test:.4f}, Accuracy (test): {acc_test:.4f}\")\n",
"\n",
"    f1_scores.append(f1_test)\n",
"    acc_scores.append(acc_test)\n",
"\n",
"    fold_num += 1\n",
"\n",
"# Cross-validation average results\n",
"print(\"\\n=== Cross-Validation Average Results ===\")\n",
"print(f\"Mean F1: {statistics.mean(f1_scores):.4f} (+/- {statistics.pstdev(f1_scores):.4f})\")\n",
"print(f\"Mean accuracy: {statistics.mean(acc_scores):.4f} (+/- {statistics.pstdev(acc_scores):.4f})\")\n"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 5 }