{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "7TF3MtJc9MXb" }, "source": [ "## PDB to ESM-IF1 embeddings" ] }, { "cell_type": "markdown", "metadata": { "id": "U3tYZu3O9Pi0" }, "source": [ "In this notebook, I use [ESM-IF1 model](https://www.biorxiv.org/content/10.1101/2022.04.10.487779v1) to obtain antibody structure embedding for train/valid/test pdb files. For my work, I froze the ESM-IF1 model, and average pooled the antibody structure embeddings from the transformer encoder, following the procedure used in the [facebook research ESM-IF1 notebook](https://github.com/facebookresearch/esm/tree/main/examples/inverse_folding)." ] }, { "cell_type": "markdown", "metadata": { "id": "yUW_BzroStJy" }, "source": [ "## Setup" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AzSn1VSX9OVK", "outputId": "dca4bb82-c9d1-4d4b-f745-cd73a52ba5a2" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.9/10.9 MB\u001b[0m \u001b[31m69.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.1/5.1 MB\u001b[0m \u001b[31m28.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m20.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m947.1/947.1 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for fair-esm (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.2/44.2 MB\u001b[0m \u001b[31m37.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ], "source": [ "# Colab environment setup\n", "\n", "# Install the correct version of Pytorch Geometric.\n", "import torch\n", "\n", "def format_pytorch_version(version):\n", " return version.split('+')[0]\n", "\n", "TORCH_version = torch.__version__\n", "TORCH = format_pytorch_version(TORCH_version)\n", "\n", "def format_cuda_version(version):\n", " return 'cu' + version.replace('.', '')\n", "\n", "CUDA_version = torch.version.cuda\n", "CUDA = format_cuda_version(CUDA_version)\n", "\n", "!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html\n", "!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html\n", "!pip install -q torch-cluster -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html\n", "!pip install -q torch-spline-conv -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html\n", "!pip install -q torch-geometric\n", "\n", "# Install esm\n", "!pip install -q git+https://github.com/facebookresearch/esm.git\n", "\n", "# Install biotite\n", "!pip install -q biotite\n", "\n", "# Install proteinflow\n", "!pip install proteinflow &> /dev/null" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zlQdEQErBELS" }, "outputs": [], "source": [ "## Verify that pytorch-geometric is correctly installed\n", "import torch_geometric\n", "import torch_sparse\n", "from torch_geometric.nn import MessagePassing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "OsCMx5-lPTI7" }, "outputs": [], "source": [ "# Import necessary libraries\n", "import os\n", "import pickle\n", "import pandas as pd\n", "import torch\n", "from tqdm import tqdm\n", "from pathlib import Path\n", "import esm\n", "from proteinflow.data import ProteinEntry\n", "from fastcore.parallel import parallel" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FwuYN81tBBLP", "outputId": "6b0b5373-0585-4043-c130-e68e00d3f51b" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", " and should_run_async(code)\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/gdrive\n" ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/gdrive', force_remount=True)\n", "\n", "path = Path(\"/content/gdrive/\")\n", "path_data = Path(\"/content/gdrive/MyDrive/data\")" ] }, { "cell_type": "code", "source": [ "# Define directories\n", "train_folder = Path(\"/content/gdrive/MyDrive/data/proteinflow_20240520-0899946/train\")\n", "valid_folder = Path(\"/content/gdrive/MyDrive/data/proteinflow_20240520-0899946/valid\")\n", "test_folder = Path(\"/content/gdrive/MyDrive/data/proteinflow_20240520-0899946/test\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oar04dtNMIQW", "outputId": "44043771-e506-4622-8a43-ac9d4affff62" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", " and should_run_async(code)\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "7cNp6vPY4EgO" }, "source": [ "## PDB to ESM-IF1 embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lt7K-nrqfBxG", "outputId": "9baacc6d-f99d-49dd-cda5-1de579d05f0a" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", " and should_run_async(code)\n" ] } ], "source": [ "import time\n", "\n", "# Load the ESM-IF1 model and move it to GPU\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "esm_if1_model, alphabet = esm.pretrained.esm_if1_gvp4_t16_142M_UR50()\n", "esm_if1_model = esm_if1_model.eval().to(device)\n", "\n", "# Function to process a batch of PDB files\n", "def process_pdb_batch(pdb_files):\n", " results = []\n", " for pdb_file in pdb_files:\n", " try:\n", " print(f\"Processing {pdb_file}...\") # Log progress\n", "\n", " # Load the protein entry from the pickle file\n", " protein_entry = ProteinEntry.from_pickle(pdb_file)\n", "\n", " # Convert to PDB format\n", " pdb_path = str(pdb_file).replace('.pickle', '.pdb')\n", " protein_entry.to_pdb(pdb_path)\n", "\n", " # Load the structure and extract coordinates and sequence\n", " structure = esm.inverse_folding.util.load_structure(pdb_path)\n", " coords, native_seq = esm.inverse_folding.util.extract_coords_from_structure(structure)\n", " coords = torch.tensor(coords, dtype=torch.float32).to(device)\n", "\n", " with torch.no_grad():\n", " rep = esm.inverse_folding.util.get_encoder_output(esm_if1_model, alphabet, coords)\n", "\n", " # Average pooling\n", " embedding = rep.mean(dim=0).detach().cpu().numpy()\n", "\n", " # Remove the temporary PDB file\n", " os.remove(pdb_path)\n", "\n", " # Add result to batch\n", " results.append({\n", " 'id': pdb_file.stem,\n", " 'sequence': native_seq,\n", " 'embedding': embedding\n", " })\n", "\n", " except Exception as e:\n", " print(f\"Error processing {pdb_file}: {e}\")\n", " results.append(None)\n", " return results\n", "\n", "# Function to process and extract data from PDB files in a directory using batching\n", "def process_pdb_files(data_dir, output_file, batch_size=8):\n", " pdb_files = list(data_dir.glob(\"*.pickle\"))\n", " all_results = []\n", "\n", " for i in tqdm(range(0, len(pdb_files), batch_size)):\n", " batch_files = pdb_files[i:i + batch_size]\n", " batch_results = process_pdb_batch(batch_files)\n", " all_results.extend(batch_results)\n", "\n", " # Filter out None results (errors)\n", " records = [result for result in all_results if result is not None]\n", "\n", " # Save the records to a DataFrame and then to a pickle file\n", " df = pd.DataFrame(records)\n", " df.to_pickle(output_file)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tb7AAf72YV5G", "outputId": "ba93cff3-21f6-470e-d74d-fa371b9e2166" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", " and should_run_async(code)\n", " 0%| | 0/197 [00:00\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsequenceembedding
04r4b-F_E_nanDIQMTQSPSFVSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...[-0.06567193, -0.024550922, -0.08586867, -0.02...
14r4b-D_C_nanDIQMTQSPSFVSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...[-0.05318726, -0.008427778, -0.082444064, -0.0...
24r4b-B_A_nanDIQMTQSPSFVSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...[-0.07950935, -0.033451352, -0.079082675, -0.0...
34r4b-H_L_nanQVQLQQWGAGLLKPSETLSLTCGVYGESLSGHYWSWVRQPPGKRLE...[-0.07344883, -0.053831305, -0.08566607, -0.05...
41l7i-H_L_nanEVQLVESGGGLVQPGGSLRLSCAASGFTFTDYTMDWVRQAPGKGLE...[-0.028149318, -0.037230052, -0.1015465, -0.03...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "train_df", "summary": "{\n \"name\": \"train_df\",\n \"rows\": 1567,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1567,\n \"samples\": [\n \"3grw-H_L_A\",\n \"8f5i-X_Y_A\",\n \"7xj9-I_H_C\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sequence\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1345,\n \"samples\": [\n \"KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEKFPQVAATGDGPDIIFWAHDRFGGYAQSGLLAEITPDKAFQDKLYPFTWDAVRYNGKLIAYPIAVEALSLIYNKDLLPNPPKTWEEIPALDKELKAKGKSALMFNLQEPYFTWPLIAADGGYAFKYENGKYDIKDVGVDNAGAKAGLTFLVDLIKNKHMNADTDYSIAEAAFNKGETAMTINGPWAWSNIDTSKVNYGVTVLPTFKGQPSKPFVGVLSAGINAASPNKELAKEFLENYLLTDEGLEAVNKDKPLGAVALKSYEEELAKDPRIAATMENAQKGEIMPNIPQMSAFWYAVRTAVINAASGRQTVDEALKDAQTVQLVESGGGLVQPGGSLRLSCAASGFNVYYSSIHWVRQAPGKGLEWVASIYSYYGSTSYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCAREYHSYVYEPPLYGMDYWGQGTLVTVSSASTKGPSVFPLAPSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCSPSSLSASVGDRVTITCRASQSVSSAVAWYQQKPGSAPSLLIYSASSLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQSPQGYLVTFGQGTKVEIKRTVAAPSVFIFPPSDSQLKSGTASVVCLLNNFYPREAKVQWSVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSSADYEKHKVYACEVTHQGLSSPVTKSFNRGEC\",\n \"DIQMTQSPSSLSASVGDRVTITCRASQGISSSLAWYQQKPGKAPKLLIYGASETESGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQNTKVGSSYGNTFGGGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGECVQLVESGGGLVQPGRSLRLSCAASGFTVHSSYYMAWVRQAPGKGLEWVGAIFTGSGAEYKAEWAKGRVTISKDTSKNQVVLTMTNMDPVDTATYYCASDAGYDYPTHAMHYWGQGTLVTVSSASTKGPSVFPLAPCSRSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTKTYTCNVDHKPSNTKVDKRVESKYEQTYVISAPKIFRVGASENIVIQVYGYTEAFDATISIKSYPDKKFSYSSGHVHLSSENKFQNSAILTIQPVSYVYLEVVSKHFSKSKRMPITY\",\n \"HEVVKFMDVYQRSYCHPIETLVDIFQEYPDEIEYIFKPSCVPLMRCGGCCNDEGLECVPTEESNITMQIMRIKPHQGQHIGEMSFLQHNKCECRPKKVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIGWVRRAPGKGEELVARIYPTNGYTRYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCYYHYYGWHPGYGLSYSSGQGTLVTVS\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"embedding\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 22 } ], "source": [ "train_df = pd.read_pickle(path_data/'proteinflow_esmif1_20240520-0899946/train_data.pkl')\n", "valid_df = pd.read_pickle(path_data/'proteinflow_esmif1_20240520-0899946/valid_data.pkl')\n", "test_df = pd.read_pickle(path_data/'proteinflow_esmif1_20240520-0899946/test_data.pkl')\n", "train_df.head()" ] }, { "cell_type": "code", "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", "# Set up the matplotlib figure\n", "plt.figure(figsize=(18, 5))\n", "\n", "plt.subplot(1, 3, 1)\n", "sns.histplot(train_df['sequence'].apply(lambda x: len(x)).values, kde=True)\n", "plt.subplot(1, 3, 2)\n", "sns.histplot(valid_df['sequence'].apply(lambda x: len(x)).values, kde=True)\n", "plt.subplot(1, 3, 3)\n", "sns.histplot(test_df['sequence'].apply(lambda x: len(x)).values, kde=True);" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 489 }, "id": "RMLQ3WCThNl2", "outputId": "c6a60940-99d9-419a-fb9d-ebf69d86a244" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", " and should_run_async(code)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "O9C1pJwQh4Nm" }, "execution_count": null, "outputs": [] } ], "metadata": { "colab": { "machine_shape": "hm", "provenance": [], "gpuType": "A100", "authorship_tag": "ABX9TyPgE9nAxtMocqbi8P3kogQ7", "include_colab_link": true }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "nbformat": 4, "nbformat_minor": 0 }