{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Predict regulatory regions from the DNA sequence using pytorch" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this notebook we illustrate how to use Janggu datasets with pytorch in order to predict Oct4 and Mafk binding sites.\n", "We will make use only of a few concepts available from pytorch. For more information on pytorch, please consult the official documentation." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To run this tutorial you need to install pytorch and tqdm beforehand." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/wkopp/anaconda3/envs/jdev/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", " from ._conv import register_converters as _register_converters\n", "Using TensorFlow backend.\n" ] } ], "source": [ "import os\n", "from tqdm import tqdm\n", "\n", "import numpy as np\n", "import torch\n", "import torch.optim as optim\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "from torch.utils.data import Dataset, DataLoader\n", "\n", "from pkg_resources import resource_filename\n", "\n", "from janggu.data import Bioseq\n", "from janggu.data import Cover\n", "from janggu.data import ReduceDim\n", "from janggu.data import Transpose\n", "from janggu.layers import DnaConv2D\n", "\n", "from sklearn.metrics import roc_auc_score" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from IPython.display import Image\n", "\n", "np.random.seed(1234)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, we need to specify the output directory in which the results are stored." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "os.environ['JANGGU_OUTPUT'] = '/home/wkopp/janggu_examples'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Similar as in the keras tutorial notebook, we start off by loading the Janggu datasets.\n", "Specify the DNA sequence feature order. Order 1, 2 and 3 correspond to mono-, di- and tri-nucleotide based features (see Tutorial)." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "order = 3\n", "binsize = 200" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# load the dataset\n", "# The pseudo genome represents just a concatenation of all sequences\n", "# in sample.fa and sample2.fa. 
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# load the dataset\n", "# The pseudo genome represents just a concatenation of all sequences\n", "# in sample.fa and sample2.fa. Therefore, the results should be almost\n", "# identical to the models obtained from classify_fasta.py.\n", "REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa')\n", "# ROI contains regions spanning positive and negative examples\n", "ROI_TRAIN_FILE = resource_filename('janggu', 'resources/roi_train.bed')\n", "ROI_TEST_FILE = resource_filename('janggu', 'resources/roi_test.bed')\n", "# PEAK_FILE only contains positive examples\n", "PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load the datasets for training and testing." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Training inputs and labels are defined purely by genomic coordinates\n", "DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,\n", "                                   roi=ROI_TRAIN_FILE,\n", "                                   binsize=binsize,\n", "                                   order=order,\n", "                                   cache=True)\n", "\n", "# The ReduceDim wrapper transforms the dataset from a 4D object to a 2D table-like representation\n", "LABELS = ReduceDim(Cover.create_from_bed('peaks', roi=ROI_TRAIN_FILE,\n", "                                         bedfiles=PEAK_FILE,\n", "                                         binsize=binsize,\n", "                                         resolution=binsize,\n", "                                         dtype='int',\n", "                                         cache=True,\n", "                                         storage='sparse'))\n", "\n", "\n", "DNA_TEST = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,\n", "                                        roi=ROI_TEST_FILE,\n", "                                        binsize=binsize,\n", "                                        order=order)\n", "\n", "LABELS_TEST = ReduceDim(Cover.create_from_bed('peaks',\n", "                                              bedfiles=PEAK_FILE,\n", "                                              roi=ROI_TEST_FILE,\n", "                                              binsize=binsize,\n", "                                              resolution=binsize,\n", "                                              dtype='int',\n", "                                              storage='sparse'))\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "training set: (7797, 198, 1, 64) (7797, 1)\n", "test set: (200, 198, 1, 64) (200, 1)\n" ] } ], "source": [ "print('training set:', DNA.shape, LABELS.shape)\n", "print('test set:', DNA_TEST.shape, LABELS_TEST.shape)" ] },
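{ "cell_type": "markdown", "metadata": {}, "source": [ "These shapes can be read off directly from the parameters above: 7797 training (and 200 test) regions of binsize 200 bp each, 200 - order + 1 = 198 positions per region at order 3, a dummy dimension of size one, and pow(4, 3) = 64 trinucleotide channels." ] },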
{ "cell_type": "markdown", "metadata": {}, "source": [ "Next, we specify a neural network in PyTorch that takes the DNA sequence as input and predicts the class labels that reflect Oct4 or Mafk binding." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DNAConvNet(\n", "  (conv1): Conv1d(64, 30, kernel_size=(21,), stride=(1,))\n", "  (dense): Linear(in_features=30, out_features=1, bias=True)\n", "  (sigmoid): Sigmoid()\n", ")\n" ] } ], "source": [ "class DNAConvNet(nn.Module):\n", "    def __init__(self, order=1):\n", "        super(DNAConvNet, self).__init__()\n", "\n", "        self.conv1 = nn.Conv1d(pow(4, order), 30, 21)\n", "        self.dense = nn.Linear(30, 1)\n", "        self.sigmoid = nn.Sigmoid()\n", "\n", "    def forward(self, x):\n", "        x = F.relu(self.conv1(x))\n", "\n", "        # emulates global max pooling along the sequence axis\n", "        x = F.max_pool1d(x, x.size()[-1]).flatten(1, -1)\n", "        x = self.dense(x)\n", "        x = self.sigmoid(x)\n", "        return x\n", "\n", "net = DNAConvNet(order)\n", "print(net)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Great! Now we have our model." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before we can use the Janggu dataset objects with the PyTorch model, we need to apply some transformations.\n", "First, we need to transform the 4D shape into a 3D object, since we use a Conv1d layer at the input of the model. To this end, we use the ReduceDim wrapper to remove axis=2, which is merely a dummy dimension for the DNA sequence.\n", "Second, we need to change the order of the dimensions such that the channel dimension comes first, as PyTorch expects, rather than last, as required by TensorFlow. This is achieved with the Transpose wrapper, using the new axis ordering (0, 2, 1).\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, the numpy arrays produced by the Janggu datasets need to be converted to PyTorch tensors.\n", "We introduce the ToTensor wrapper class for this purpose below:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "class ToTensor(torch.utils.data.Dataset):\n", "    def __init__(self, data, dtype='float32'):\n", "        self.data = data\n", "        self.dtype = dtype\n", "\n", "    def __len__(self):\n", "        return len(self.data)\n", "\n", "    def __repr__(self):\n", "        return \"ToTensor({})\".format(str(self.data))\n", "\n", "    def __getitem__(self, idx):\n", "        if torch.is_tensor(idx):\n", "            idx = idx.tolist()\n", "\n", "        # enforce type compatibility\n", "        data = self.data[idx].astype(self.dtype)\n", "\n", "        return torch.from_numpy(data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After applying these transformations, we can feed input data to the model to generate predictions:\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ToTensor(Transpose(ReduceDim(Bioseq(\"dna\"))))" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DNA_ = ToTensor(Transpose(ReduceDim(DNA, axis=2), axis=(0, 2, 1)), dtype='float32')\n", "DNA_" ] },
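{ "cell_type": "markdown", "metadata": {}, "source": [ "As a quick sanity check, we can inspect the shape of a slice of the transformed dataset; the channel axis should now come before the sequence axis:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# after ReduceDim and Transpose, a slice should have shape\n", "# (batch, channels, length), i.e. torch.Size([3, 64, 198])\n", "print(DNA_[:3].shape)" ] },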
" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "class InOutTuple(Dataset):\n", " def __init__(self, inputs, labels, batch_size=32, shuffle=True):\n", " self.inputs = inputs\n", " self.labels = labels\n", " self.batch_size = batch_size\n", " self.shuffle = shuffle\n", " self.ridx = np.random.permutation(len(inputs))\n", " \n", " def __len__(self):\n", " return int(np.ceil(len(self.inputs) / self.batch_size))\n", " \n", " def __getitem__(self, idx):\n", " \n", " if torch.is_tensor(idx):\n", " idx = idx.tolist()\n", " \n", " ridx = self.ridx[self.batch_size*idx:(self.batch_size*(idx+1))].tolist()\n", " # enforce type compatibility\n", " return self.inputs[ridx], self.labels[ridx]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "With this we create a dataset object representing input-output pairs and a dataloader that generates minibatches from IODATA." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "244" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "IODATA = InOutTuple(DNA_, ToTensor(LABELS))\n", "len(IODATA)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "dataloader = DataLoader(IODATA, batch_size=1,\n", " shuffle=False, num_workers=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We are now ready to implement the training loop where we make use of the binary cross-entropy loss and the Adadelta optimizer.\n", "The model is then fitted for 100 epochs." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [06:14<00:00, 3.74s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "########################################\n", "loss: 2.4993820261443034e-05\n", "########################################\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "criterion = nn.BCELoss()\n", "optimizer = optim.Adadelta(net.parameters())\n", "\n", "final_loss = 0.\n", "for epoch in tqdm(range(100)):\n", " for i, d in enumerate(dataloader, 0):\n", " inputs, labels = d\n", " # when using dataloaders a dummy dimension\n", " # is introduced which we don't need.\n", " # with view we eliminate it.\n", " inputs = inputs.view(inputs.shape[1:])\n", " labels = labels.view(labels.shape[1:])\n", "\n", " optimizer.zero_grad()\n", " \n", " outputs = net(inputs)\n", " loss = criterion(outputs, labels)\n", " loss.backward()\n", " \n", " optimizer.step()\n", " final_loss = loss.item()\n", "\n", "print('#' * 40)\n", "print('loss: {}'.format(final_loss))\n", "print('#' * 40)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since, we ran the model on cpus it is relatively time-consuming. pytorch can however, also be configured to utilize gpus if available." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The predictions may be converted back to Cover object and subsequently exported as bigwig in order to inspect the plausibility of the results in the genome browser." 
] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ToTensor(Transpose(ReduceDim(Bioseq(\"dna\"))))" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DNA_TEST_ = ToTensor(Transpose(ReduceDim(DNA_TEST, axis=2), axis=(0,2,1)), dtype='float32')\n", "\n", "DNA_TEST_" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "\n", "# convert the prediction to a cover object\n", "pred = net(DNA_TEST_[:]).detach().numpy()\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "cov_pred = Cover.create_from_array('BindingProba', pred, LABELS_TEST.gindexer)\n", "\n", "# predictions (or feature activities) can finally be exported to bigwig\n", "cov_pred.export_to_bigwig(output_dir=os.environ['JANGGU_OUTPUT'])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AUC: 0.9993\n" ] } ], "source": [ "print(\"AUC:\", roc_auc_score(LABELS_TEST[:], pred))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }