# Install

conda install pytorch torchvision -c soumith

# Import

In [1]:
import torch

# Tutorial

http://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html

http://pytorch.org/tutorials/

In [2]:
x = torch.Tensor(5, 3)
print(x)


1.00000e-44 *
 0.0000 0.0000 0.0000
 0.0000 1.6816 0.0000
 0.0000 0.0000 0.0000
 0.0000 0.0000 0.0000
 0.0000 0.0000 0.0000
[torch.FloatTensor of size 5x3]



# nn module

http://pytorch.org/tutorials/beginner/pytorch_with_examples.html#nn-module

In [6]:
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Variables for its weight and bias.
model = torch.nn.Sequential(
 torch.nn.Linear(D_in, H),
 torch.nn.ReLU(),
 torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
for t in range(500):
 # Forward pass: compute predicted y by passing x to the model. Module objects
 # override the __call__ operator so you can call them like functions. When
 # doing so you pass a Variable of input data to the Module and it produces
 # a Variable of output data.
 y_pred = model(x)

 # Compute and print loss. We pass Variables containing the predicted and true
 # values of y, and the loss function returns a Variable containing the
 # loss.
 loss = loss_fn(y_pred, y)
 if t%50 == 0:
 print(t, loss.data[0])

 # Zero the gradients before running the backward pass.
 model.zero_grad()

 # Backward pass: compute gradient of the loss with respect to all the learnable
 # parameters of the model. Internally, the parameters of each Module are stored
 # in Variables with requires_grad=True, so this call will compute gradients for
 # all learnable parameters in the model.
 loss.backward()

 # Update the weights using gradient descent. Each parameter is a Variable, so
 # we can access its data and gradients like we did before.
 for param in model.parameters():
 param.data -= learning_rate * param.grad.data

(0, 717.2719116210938)
(50, 35.097198486328125)
(100, 1.8821511268615723)
(150, 0.1728428155183792)
(200, 0.02194761298596859)
(250, 0.0034840735606849194)
(300, 0.0006572074489668012)
(350, 0.00014404028479475528)
(400, 3.580378324841149e-05)
(450, 9.810625670070294e-06)


https://github.com/huggingface/pytorch-transformers

# Pytorch transformers

A library of state-of-the-art pretrained models for Natural Language Processing (NLP) 

In [1]:
!pip install pytorch-transformers


Collecting pytorch-transformers
[?25l Downloading https://files.pythonhosted.org/packages/40/b5/2d78e74001af0152ee61d5ad4e290aec9a1e43925b21df2dc74ec100f1ab/pytorch_transformers-1.0.0-py3-none-any.whl (137kB)
[K 100% |████████████████████████████████| 143kB 488kB/s ta 0:00:01
[?25hCollecting sentencepiece (from pytorch-transformers)
[?25l Downloading https://files.pythonhosted.org/packages/99/8c/ca2c3ab61848526e85146aef40bfb7b399c7e70b1686a43b82d44cf1690f/sentencepiece-0.1.82-cp37-cp37m-macosx_10_6_x86_64.whl (1.1MB)
[K 100% |████████████████████████████████| 1.1MB 11.9MB/s ta 0:00:01
Collecting boto3 (from pytorch-transformers)
[?25l Downloading https://files.pythonhosted.org/packages/39/82/608bb4a689dc543d09555e70ffc0e180bd72df76d53b68bf8891d7cbba91/boto3-1.9.194-py2.py3-none-any.whl (128kB)
[K 100% |████████████████████████████████| 133kB 15.4MB/s ta 0:00:01
Collecting regex (from pytorch-transformers)
[?25l Downloading https://files.pythonhosted.org/packages/6f/4e/1b178c38c

In [1]:
import torch
from pytorch_transformers import *

# PyTorch-Transformers has a unified API
# for 6 transformer architectures and 27 pretrained weights.
# Model | Tokenizer | Pretrained weights shortcut
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
 (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
 (GPT2Model, GPT2Tokenizer, 'gpt2'),
 (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
 (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
 (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024')]

## Quickstart

In [1]:
import torch
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/datalab/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [2]:
tokenized_text

['[CLS]',
 'who',
 'was',
 'jim',
 'henson',
 '?',
 '[SEP]',
 'jim',
 '[MASK]',
 'was',
 'a',
 'puppet',
 '##eer',
 '[SEP]']

In [9]:
?BertModel.from_pretrained

In [13]:
model = BertModel.from_pretrained(u"/Users/datalab/bigdata/bert-base-uncased.bin")

INFO:pytorch_transformers.modeling_utils:loading configuration file /Users/datalab/bigdata/bert-base-uncased.bin


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

In [3]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('/Users/datalab/bigdata/bert-base-uncased-pytorch_model.bin')

# Set the model in evaluation mode to desactivate the DropOut modules
# This is IMPORTANT to have reproductible results during evaluation!
model.eval()

# If you have a GPU, put everything on cuda
# tokens_tensor = tokens_tensor.to('cuda')
# segments_tensors = segments_tensors.to('cuda')
# model.to('cuda') 

# Predict hidden states features for each layer
with torch.no_grad():
 # See the models docstrings for the detail of the inputs
 outputs = model(tokens_tensor, token_type_ids=segments_tensors)
 # PyTorch-Transformers models always output tuples.
 # See the models docstrings for the detail of all the outputs
 # In our case, the first element is the hidden state of the last layer of the Bert model
 encoded_layers = outputs[0]
# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)

INFO:pytorch_transformers.modeling_utils:loading configuration file /Users/datalab/bigdata/bert-base-uncased-pytorch_model.bin


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

In [None]:
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

# Predict all tokens
with torch.no_grad():
 outputs = model(tokens_tensor, token_type_ids=segments_tensors)
 predictions = outputs[0]

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
assert predicted_token == 'henson'