In [None]:
# Install the notebook's third-party dependencies into the kernel's environment.
# transformers is pinned to 3.3.1; comet_ml is installed without a version pin.
%pip install comet_ml
%pip install transformers==3.3.1

In [None]:
import comet_ml

In [None]:
!wget https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/title_conference.csv

In [None]:
import os
import warnings
# Suppress every Python warning for the rest of the session.
# Note that this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
import transformers
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
import torch

In [None]:
# Checkpoint name handed to `from_pretrained` for both the tokenizer and the classifier.
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"

In [None]:
# Load the tokenizer associated with the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
import pandas as pd

# Read the dataset CSV from the working directory (the file fetched by the wget cell).
df = pd.read_csv("./title_conference.csv")

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

In [None]:
# Turn the conference names into a categorical column, then expose the
# integer category codes as the classification target.
df['Conference'] = df['Conference'].astype('category')
df['Target'] = df['Conference'].cat.codes

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation, stratified on the target so both
# splits keep the same class proportions.
# random_state pins the shuffle so the split is identical on every run
# (it matches the seed used for training further down).
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Target"],
    random_state=42,
)

In [None]:
# Pull the titles and the integer targets out of each split as plain Python lists.
train_texts = train_data['Title'].values.tolist()
train_labels = train_data['Target'].values.tolist()

test_texts = test_data['Title'].values.tolist()
test_labels = test_data['Target'].values.tolist()

In [None]:
def preprocess(texts, labels, max_length=64):
    """Tokenize `texts` and convert `labels` to a tensor.

    Args:
        texts: list of strings to encode with the notebook-level `tokenizer`.
        labels: sequence of labels accepted by `torch.tensor`, one per text.
        max_length: length every sequence is truncated or padded to
            (defaults to 64, the value previously hard-coded).

    Returns:
        A tuple `(encoded, labels_tensor)`: the tokenizer output with PyTorch
        tensors, and `torch.tensor(labels)`.
    """
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        # `padding="max_length"` is the supported spelling of the deprecated
        # `pad_to_max_length=True`: pad every sequence up to `max_length`.
        padding="max_length",
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded, torch.tensor(labels)

In [None]:
# Encode both splits. Note that `train_labels` / `test_labels` are rebound here:
# they go in as Python lists and come back as tensors.
train_encoded, train_labels = preprocess(train_texts, train_labels)
test_encoded, test_labels = preprocess(test_texts, test_labels)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name to an indexable batch of values
            (anything exposing `.items()`), one row per example.
        labels: indexable collection of labels, one per example.

    Indexing returns a dict with one tensor per encoding field plus a
    `labels` tensor for that example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value):
        """Return an independent tensor copy of `value`."""
        if isinstance(value, torch.Tensor):
            # `torch.tensor(existing_tensor)` is a discouraged copy-construct that
            # raises a UserWarning; clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {
            key: self._as_new_tensor(val[idx])
            for key, val in self.encodings.items()
        }
        item['labels'] = self._as_new_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
# Wrap the encoded splits so they can be passed to the Trainer as train/eval datasets.
train_dataset = Dataset(train_encoded, train_labels)
test_dataset = Dataset(test_encoded, test_labels)

In [None]:
# Imported locally so this cell stays self-contained.
from transformers import AutoModelForSequenceClassification

# PRE_TRAINED_MODEL_NAME is a DistilBERT checkpoint. Loading it through
# `BertForSequenceClassification` mismatches the architecture, so the pretrained
# weights do not line up with the model's parameters. The Auto class picks the
# architecture that matches the checkpoint's own configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    num_labels=df["Target"].nunique(),  # one output per distinct target code
    output_attentions=False,
    output_hidden_states=False,
)

# Run on the GPU when one is available; fall back to the CPU instead of crashing.
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute evaluation metrics and log a confusion matrix to Comet.

    Args:
        pred: prediction object exposing `label_ids` (the reference labels) and
            `predictions` (per-class scores); the predicted class is the argmax
            over the last axis of `predictions`.

    Returns:
        dict with `accuracy`, `f1`, `precision` and `recall`; the last three are
        macro-averaged over classes.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)

    # Only log when a Comet experiment is currently active.
    experiment = comet_ml.get_global_experiment()
    if experiment:
        # Pass by keyword: the first positional parameter is `y_true`, so calling
        # it as (preds, labels) swaps the roles and logs a transposed matrix.
        experiment.log_confusion_matrix(y_true=labels, y_predicted=preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Training Parameters
# Number of epochs each hyperparameter configuration is trained for.
EPOCHS = 5

In [None]:
import itertools

# Candidate values for each hyperparameter of the grid search.
decays = [0.0, 0.5, 0.99]
learning_rates = [5.0e-5, 3.0e-5, 2.0e-5, 1.0e-5]
batch_sizes = [32, 64, 128]

# Full Cartesian product, one dict per configuration. Weight decay varies
# slowest and batch size fastest.
parameters = [
    {"weight_decay": decay, "learning_rate": rate, "batch_size": size}
    for decay, rate, size in itertools.product(decays, learning_rates, batch_sizes)
]

In [None]:
from tqdm import tqdm

%env COMET_MODE=ONLINE
%env COMET_API_KEY=
%env COMET_PROJECT_NAME=transformers

for idx, p in tqdm(enumerate(parameters)):
 weight_decay = p["weight_decay"]
 learning_rate = p["learning_rate"]
 batch_size = p["batch_size"]

 training_args = TrainingArguments(
 seed=42,
 output_dir='./results', 
 overwrite_output_dir=True, 
 num_train_epochs=EPOCHS, 
 per_device_train_batch_size=batch_size, 
 per_device_eval_batch_size=batch_size,
 warmup_steps=500, 
 weight_decay=weight_decay, 
 learning_rate=learning_rate, 
 evaluation_strategy="epoch",
 do_train=True,
 do_eval=True 
 )
 trainer = Trainer(
 model=model, 
 args=training_args, 
 train_dataset=train_dataset, 
 eval_dataset=test_dataset,
 compute_metrics=compute_metrics, 
 )
 trainer.train()