# 因果言語モデルを一から学習 (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install the Python libraries used by this notebook (shell commands run via `!`).
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# NOTE(review): git-lfs is presumably required for pushing model files to the Hub — confirm.
!apt install git-lfs

You will need to set up Git; adapt your email and name in the following cell.

In [None]:
# Set the global git identity; replace the placeholder email and name with your own.
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
from huggingface_hub import notebook_login

# Prompt for Hugging Face Hub credentials inside the notebook.
notebook_login()

In [None]:
def any_keyword_in_string(string, keywords):
    """Tell whether at least one keyword occurs inside ``string``.

    Args:
        string: The text to search.
        keywords: An iterable of substrings to look for.

    Returns:
        True as soon as one keyword is found as a substring of ``string``,
        False when none is found (including when ``keywords`` is empty).
    """
    return any(keyword in string for keyword in keywords)

In [None]:
# Quick sanity check of the keyword filter: the first example mentions none of
# the libraries in `filters`, the second one mentions pandas.
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
example_1, example_2 = "import numpy as np", "import pandas as pd"

print(*(any_keyword_in_string(example, filters) for example in (example_1, example_2)))

False True

In [None]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset


def filter_streaming_dataset(dataset, filters):
    """Keep only the samples whose ``content`` field mentions one of ``filters``.

    Iterates over ``dataset`` once, collects every field of each matching sample
    column-wise, prints the fraction of samples that were kept and returns the
    kept samples as a ``Dataset``.

    Args:
        dataset: An iterable of dict-like samples, each with a ``"content"`` key.
        filters: Keywords passed to ``any_keyword_in_string``.

    Returns:
        The result of ``Dataset.from_dict`` on the collected columns.
    """
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(iter(dataset)):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)
    # Guard against an empty stream: the original division raised
    # ZeroDivisionError when no sample was iterated at all.
    kept = len(filtered_dict["content"])
    ratio = kept / total if total else 0.0
    print(f"{ratio:.2%} of data after filtering.")
    return Dataset.from_dict(filtered_dict)

In [None]:
# This cell will take a very long time to execute, so you should skip it and go to
# the next one!
from datasets import load_dataset

# `split` selects both the dataset repository suffix and the split that is loaded.
split = "train" # "valid"
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]

# Load the split in streaming mode and keep only the samples whose content
# mentions one of the keywords in `filters`.
data = load_dataset(f"transformersbook/codeparrot-{split}", split=split, streaming=True)
filtered_data = filter_streaming_dataset(data, filters)

3.26% of data after filtering.

In [None]:
from datasets import load_dataset, DatasetDict

# Load the train and validation datasets from the Hub.
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

# Group both splits under the keys "train" and "valid"; the trailing comments
# show how to subsample each split for a quicker run.
raw_datasets = DatasetDict(
 {
 "train": ds_train, # .shuffle().select(range(50000)),
 "valid": ds_valid, # .shuffle().select(range(500))
 }
)

raw_datasets

DatasetDict({
 train: Dataset({
 features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
 num_rows: 606720
 })
 valid: Dataset({
 features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
 num_rows: 3322
 })
})

In [None]:
# Show every field of the first training sample, truncated to 200 characters.
first_sample = raw_datasets["train"][0]
for key, value in first_sample.items():
    print(f"{key.upper()}: {value[:200]}")

'REPO_NAME: kmike/scikit-learn'
'PATH: sklearn/utils/__init__.py'
'COPIES: 3'
'SIZE: 10094'
'''CONTENT: """
The :mod:`sklearn.utils` module includes various utilites.
"""

from collections import Sequence

import numpy as np
from scipy.sparse import issparse

from .murmurhash import murm
LICENSE: bsd-3-clause'''

In [None]:
from transformers import AutoTokenizer

# Maximum number of tokens per chunk fed to the model.
context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

# Tokenize the first two training samples. With truncation and
# `return_overflowing_tokens=True`, each sample yields several chunks of at
# most `context_length` tokens (see the printed lengths and mapping).
outputs = tokenizer(
 raw_datasets["train"][:2]["content"],
 truncation=True,
 max_length=context_length,
 return_overflowing_tokens=True,
 return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 34
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 117, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 41]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [None]:
def tokenize(element):
    """Tokenize a batch of samples and keep only the full-length chunks.

    Args:
        element: A batch with a ``"content"`` entry holding the texts.

    Returns:
        A dict with a single ``"input_ids"`` key listing every chunk whose
        reported length equals ``context_length``; shorter chunks are dropped.
    """
    encoded = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        chunk_ids
        for n_tokens, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}


# Apply `tokenize` to both splits in batches. The original columns are removed
# so that only the `input_ids` returned by `tokenize` remain.
tokenized_datasets = raw_datasets.map(
 tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

DatasetDict({
 train: Dataset({
 features: ['input_ids'],
 num_rows: 16702061
 })
 valid: Dataset({
 features: ['input_ids'],
 num_rows: 93164
 })
})

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the "gpt2" configuration and override the vocabulary size and the
# special token ids so they match the code tokenizer loaded earlier.
# NOTE(review): whether `n_ctx` is still honoured depends on the installed
# transformers version — confirm, or check if `n_positions` should be set.
config = AutoConfig.from_pretrained(
 "gpt2",
 vocab_size=len(tokenizer),
 n_ctx=context_length,
 bos_token_id=tokenizer.bos_token_id,
 eos_token_id=tokenizer.eos_token_id,
)

In [None]:
# Build the model from `config` and report its parameter count in millions.
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.2M parameters

In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): `mlm=False` presumably selects causal (not masked) language
# modeling collation — confirm against the collator documentation.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Collate the first five tokenized training samples and print the shape of
# every entry the collator returns.
first_five = [tokenized_datasets["train"][idx] for idx in range(5)]
out = data_collator(first_five)
for key, tensor in out.items():
    print(f"{key} shape: {tensor.shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])

In [None]:
from huggingface_hub import notebook_login

# Same login call as in the earlier login cell; `push_to_hub=True` is used in
# the training arguments that follow.
notebook_login()

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration: evaluation, logging and checkpointing all happen
# every 5,000 steps; gradients are accumulated over 8 steps; the learning rate
# follows a cosine schedule after 1,000 warmup steps.
args = TrainingArguments(
 output_dir="codeparrot-ds",
 per_device_train_batch_size=32,
 per_device_eval_batch_size=32,
 evaluation_strategy="steps",
 eval_steps=5_000,
 logging_steps=5_000,
 gradient_accumulation_steps=8,
 num_train_epochs=1,
 weight_decay=0.1,
 warmup_steps=1_000,
 lr_scheduler_type="cosine",
 learning_rate=5e-4,
 save_steps=5_000,
 fp16=True,
 push_to_hub=True,
)

# Wire the model, tokenizer, collator and both tokenized splits into a Trainer.
trainer = Trainer(
 model=model,
 tokenizer=tokenizer,
 args=args,
 data_collator=data_collator,
 train_dataset=tokenized_datasets["train"],
 eval_dataset=tokenized_datasets["valid"],
)

In [None]:
# Launch training with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Push the trainer's results to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
import torch
from transformers import pipeline

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Text-generation pipeline backed by the "huggingface-course/codeparrot-ds" model.
pipe = pipeline(
 "text-generation", model="huggingface-course/codeparrot-ds", device=device
)

In [None]:
# Prompt the model to complete a scatter-plot snippet and print the single
# generated sequence.
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
plt.scatter(x, y)

# create scatter

In [None]:
# Prompt the model to build a DataFrame from two arrays and print the single
# generated sequence.
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
df = pd.DataFrame({'x': x, 'y': y})
df.insert(0,'x', x)
for

In [None]:
# Prompt the model to compute a per-profession mean income and print the
# single generated sequence.
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
profession = df.groupby(['profession']).mean()

# compute the

In [None]:
# Prompt the model to fit a random forest and print the single generated sequence.
# NOTE(review): unlike the other prompts, the opening quotes are not followed by
# a backslash, so this prompt starts with a newline — confirm this is intended.
txt = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
rf = RandomForestRegressor(n_estimators=300, random_state=random_state, max_depth=3)
rf.fit(X, y)
rf

In [None]:
# Candidate keywords, each listed with and without a leading space.
candidate_keywords = [
    "plt",
    "pd",
    "sk",
    "fit",
    "predict",
    " plt",
    " pd",
    " sk",
    " fit",
    " predict",
    "testtest",
]

# Keep the token id of every keyword the tokenizer encodes as exactly one
# token; report the keywords that are split into several tokens.
keytoken_ids = []
for keyword in candidate_keywords:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) != 1:
        print(f"Keyword has not single token: {keyword}")
        continue
    keytoken_ids.append(ids[0])

'Keyword has not single token: testtest'

In [None]:
from torch.nn import CrossEntropyLoss
import torch


def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    """Next-token cross-entropy loss that up-weights samples containing key tokens.

    Args:
        inputs: Integer tensor of token ids, indexed as (sample, position).
        logits: Model logits with one vocabulary-sized vector per input position.
        keytoken_ids: Token ids whose occurrences increase a sample's weight.
            May be empty, in which case every sample gets the same weight.
        alpha: Global scale applied to every sample weight.

    Returns:
        A scalar tensor: the mean over samples of
        ``alpha * (1 + number of key-token occurrences) * mean token loss``.
    """
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss (`reduction="none"` replaces the deprecated
    # `reduce=False` argument)
    loss_fct = CrossEntropyLoss(reduction="none")
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(dim=1)
    # Count key-token occurrences per sample; `torch.stack` rejects an empty
    # list, so fall back to zero occurrences when no key tokens are given
    if len(keytoken_ids) > 0:
        weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
            dim=[0, 2]
        )
    else:
        weights = torch.zeros_like(loss_per_sample)
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

In [None]:
from torch.utils.data.dataloader import DataLoader

# Make the tokenized datasets return PyTorch tensors, then build the loaders.
# (The variable defined earlier is `tokenized_datasets`; the previous spelling
# `tokenized_dataset` raised a NameError.)
tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=32)

In [None]:
weight_decay = 0.1


def get_grouped_params(model, no_decay=("bias", "LayerNorm.weight")):
    """Split the model parameters into weight-decay and no-weight-decay groups.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.
        no_decay: Substrings; a parameter whose name contains any of them is
            put in the group without weight decay. The default is an immutable
            tuple so it cannot be modified between calls.

    Returns:
        A list of two optimizer parameter groups: the first uses the
        module-level ``weight_decay``, the second uses ``0.0``.
    """
    # NOTE(review): GPT-2 in transformers appears to name its layer norms
    # `ln_1` / `ln_2` / `ln_f`, in which case "LayerNorm.weight" matches
    # nothing for this model — confirm and extend `no_decay` if needed.
    params_with_wd, params_without_wd = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            params_without_wd.append(param)
        else:
            params_with_wd.append(param)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [None]:
def evaluate():
    """Compute the mean loss and the perplexity over ``eval_dataloader``.

    Uses the module-level ``model``, ``eval_dataloader`` and ``accelerator``.
    The model is switched to evaluation mode and left in that mode.

    Returns:
        A ``(loss, perplexity)`` tuple of Python floats, where perplexity is
        ``exp(loss)`` (``inf`` if the exponential overflows).
    """
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])

        # `torch.cat` rejects zero-dimensional tensors, so give the scalar loss
        # one dimension before gathering it.
        losses.append(accelerator.gather(outputs.loss.reshape(1)))
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss).item()
    except OverflowError:
        # Keep the fallback a plain float, matching the normal return type
        # (previously `.item()` was called on this float).
        perplexity = float("inf")
    return loss.item(), perplexity

In [None]:
# Rebind `model` to a new instance built from `config` for the custom training loop.
model = GPT2LMHeadModel(config)

In [None]:
from torch.optim import AdamW

# AdamW over the two parameter groups returned by `get_grouped_params`.
optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [None]:
from accelerate import Accelerator

# Request fp16 mixed precision through `mixed_precision`; the boolean `fp16`
# argument is the older, deprecated spelling and is rejected by recent
# accelerate releases.
accelerator = Accelerator(mixed_precision="fp16")

# Let accelerate wrap the model, optimizer and both dataloaders.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 1
# One "update step" is counted per batch of the training dataloader.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule with 1,000 warmup steps.
# NOTE(review): `num_training_steps` counts batches, while the training loop
# calls `lr_scheduler.step()` only once every `gradient_accumulation_steps`
# batches — confirm the schedule length is intended.
lr_scheduler = get_scheduler(
 name="linear",
 optimizer=optimizer,
 num_warmup_steps=1_000,
 num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import Repository, get_full_repo_name

# Resolve the full Hub repository name for the model; per the recorded output
# it has the form "<user>/<model_name>".
model_name = "codeparrot-ds-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'sgugger/codeparrot-ds-accelerate'

In [None]:
# Local folder used for checkpoints, created as a clone of the Hub repository.
output_dir = "codeparrot-ds-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

In [None]:
# Evaluate once before training; displays the (loss, perplexity) tuple.
evaluate()

(10.934126853942871, 56057.14453125)

In [None]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 5_000

model.train()
completed_steps = 0
# Running count of training samples, used for logging only.
samples_seen = 0
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        # Assumes every process receives a batch of the same size — TODO confirm.
        samples_seen += batch["input_ids"].shape[0] * accelerator.num_processes
        if step % 100 == 0:
            accelerator.print(
                {
                    # Read the current learning rate from the optimizer (the
                    # `get_lr` helper used before was never defined).
                    "lr": optimizer.param_groups[0]["lr"],
                    "samples": samples_seen,
                    "steps": completed_steps,
                    # The loss has not been divided by the accumulation factor
                    # yet at this point, so it is logged as is.
                    "loss/train": loss.item(),
                }
            )
        # Scale the loss so the accumulated gradients average over the
        # accumulation window.
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        # Evaluate, save and push every `eval_steps` optimizer updates.
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            # `evaluate` leaves the model in eval mode; switch back.
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )