{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Biomni 101" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Welcome to Biomni! Here is a simple tutorial on the basics of the Biomni package. After you have installed the environment, you can run the cell below to initialize the agent, which will automatically download the raw data lake files:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🎓 Academic mode: Using all datasets (including non-commercial)\n", "\n", "==================================================\n", "🔧 BIOMNI CONFIGURATION\n", "==================================================\n", "📋 DEFAULT CONFIG (Including Database LLM):\n", " Path: ./data\n", " Timeout Seconds: 600\n", " Llm: claude-sonnet-4-5\n", " Temperature: 0.7\n", " Use Tool Retriever: True\n", " Commercial Mode: Academic (all datasets)\n", "\n", "🤖 AGENT LLM (Constructor Override):\n", " LLM Model: claude-sonnet-4-5-20250929\n", "==================================================\n", "\n", "Created directory: ./biomni_data\n", "Checking and downloading missing data lake files...\n", "Downloading affinity_capture-ms.parquet from data_lake...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "affinity_capture-ms.parquet: 100%|█████████| 5.59M/5.59M [00:00<00:00, 11.8MB/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "✓ Successfully downloaded: affinity_capture-ms.parquet\n", "Downloading affinity_capture-rna.parquet from data_lake...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "affinity_capture-rna.parquet: 100%|████████| 43.0k/43.0k [00:00<00:00, 1.21MB/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "✓ Successfully downloaded: affinity_capture-rna.parquet\n", "Downloading BindingDB_All_202409.tsv from data_lake...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "BindingDB_All_202409.tsv: 
2%|▎ | 154M/6.25G [00:43<28:35, 3.55MB/s]\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m 3\u001b[39m sys.path.append(\u001b[33m\"\u001b[39m\u001b[33m../\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mbiomni\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01magent\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m A1\n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m agent = \u001b[43mA1\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m./biomni_data\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mllm\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mclaude-sonnet-4-5-20250929\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/Biomni/tutorials/../biomni/agent/a1.py:167\u001b[39m, in \u001b[36mA1.__init__\u001b[39m\u001b[34m(self, path, llm, source, use_tool_retriever, timeout_seconds, base_url, api_key, commercial_mode, expected_data_lake_files)\u001b[39m\n\u001b[32m 165\u001b[39m \u001b[38;5;66;03m# Check and download missing data lake files\u001b[39;00m\n\u001b[32m 166\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mChecking and downloading missing data lake files...\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m167\u001b[39m \u001b[43mcheck_and_download_s3_files\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 168\u001b[39m \u001b[43m 
\u001b[49m\u001b[43ms3_bucket_url\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mhttps://biomni-release.s3.amazonaws.com\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 169\u001b[39m \u001b[43m \u001b[49m\u001b[43mlocal_data_lake_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_lake_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 170\u001b[39m \u001b[43m \u001b[49m\u001b[43mexpected_files\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexpected_data_lake_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 171\u001b[39m \u001b[43m \u001b[49m\u001b[43mfolder\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mdata_lake\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 172\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 174\u001b[39m \u001b[38;5;66;03m# Check if benchmark directory structure is complete\u001b[39;00m\n\u001b[32m 175\u001b[39m benchmark_ok = \u001b[38;5;28;01mFalse\u001b[39;00m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/Biomni/tutorials/../biomni/utils.py:1011\u001b[39m, in \u001b[36mcheck_and_download_s3_files\u001b[39m\u001b[34m(s3_bucket_url, local_data_lake_path, expected_files, folder)\u001b[39m\n\u001b[32m 1008\u001b[39m s3_file_url = urljoin(s3_bucket_url + \u001b[33m\"\u001b[39m\u001b[33m/\u001b[39m\u001b[33m\"\u001b[39m + folder + \u001b[33m\"\u001b[39m\u001b[33m/\u001b[39m\u001b[33m\"\u001b[39m, filename)\n\u001b[32m 1009\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mDownloading \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilename\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m from \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfolder\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m...\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1011\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[43mdownload_with_progress\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms3_file_url\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlocal_file_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[32m 1012\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m✓ Successfully downloaded: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilename\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 1013\u001b[39m download_results[filename] = \u001b[38;5;28;01mTrue\u001b[39;00m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/Biomni/tutorials/../biomni/utils.py:949\u001b[39m, in \u001b[36mcheck_and_download_s3_files..download_with_progress\u001b[39m\u001b[34m(url, file_path, desc)\u001b[39m\n\u001b[32m 947\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m total_size > \u001b[32m0\u001b[39m:\n\u001b[32m 948\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m tqdm.tqdm(total=total_size, unit=\u001b[33m\"\u001b[39m\u001b[33mB\u001b[39m\u001b[33m\"\u001b[39m, unit_scale=\u001b[38;5;28;01mTrue\u001b[39;00m, desc=desc, ncols=\u001b[32m80\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m pbar:\n\u001b[32m--> \u001b[39m\u001b[32m949\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mresponse\u001b[49m\u001b[43m.\u001b[49m\u001b[43miter_content\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk_size\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m8192\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 950\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 951\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/biomni_fixed/lib/python3.11/site-packages/requests/models.py:820\u001b[39m, in \u001b[36mResponse.iter_content..generate\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 818\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m.raw, \u001b[33m\"\u001b[39m\u001b[33mstream\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 819\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m820\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m.raw.stream(chunk_size, decode_content=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 821\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m ProtocolError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 822\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ChunkedEncodingError(e)\n", "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/biomni_fixed/lib/python3.11/site-packages/urllib3/response.py:1091\u001b[39m, in \u001b[36mHTTPResponse.stream\u001b[39m\u001b[34m(self, amt, decode_content)\u001b[39m\n\u001b[32m 1089\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1090\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_fp_closed(\u001b[38;5;28mself\u001b[39m._fp) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m._decoded_buffer) > \u001b[32m0\u001b[39m:\n\u001b[32m-> \u001b[39m\u001b[32m1091\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m=\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1093\u001b[39m 
\u001b[38;5;28;01mif\u001b[39;00m data:\n\u001b[32m 1094\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m data\n", "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/biomni_fixed/lib/python3.11/site-packages/urllib3/response.py:980\u001b[39m, in \u001b[36mHTTPResponse.read\u001b[39m\u001b[34m(self, amt, decode_content, cache_content)\u001b[39m\n\u001b[32m 977\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m._decoded_buffer) >= amt:\n\u001b[32m 978\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._decoded_buffer.get(amt)\n\u001b[32m--> \u001b[39m\u001b[32m980\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raw_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 982\u001b[39m flush_decoder = amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m (amt != \u001b[32m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data)\n\u001b[32m 984\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m._decoded_buffer) == \u001b[32m0\u001b[39m:\n", "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/biomni_fixed/lib/python3.11/site-packages/urllib3/response.py:904\u001b[39m, in \u001b[36mHTTPResponse._raw_read\u001b[39m\u001b[34m(self, amt, read1)\u001b[39m\n\u001b[32m 901\u001b[39m fp_closed = \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m._fp, \u001b[33m\"\u001b[39m\u001b[33mclosed\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m 903\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m._error_catcher():\n\u001b[32m--> \u001b[39m\u001b[32m904\u001b[39m data = 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_fp_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mread1\u001b[49m\u001b[43m=\u001b[49m\u001b[43mread1\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m fp_closed \u001b[38;5;28;01melse\u001b[39;00m \u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 905\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m amt != \u001b[32m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data:\n\u001b[32m 906\u001b[39m \u001b[38;5;66;03m# Platform-specific: Buggy versions of Python.\u001b[39;00m\n\u001b[32m 907\u001b[39m \u001b[38;5;66;03m# Close the connection when no data is returned\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 912\u001b[39m \u001b[38;5;66;03m# not properly close the connection in all cases. 
There is\u001b[39;00m\n\u001b[32m 913\u001b[39m \u001b[38;5;66;03m# no harm in redundantly calling close.\u001b[39;00m\n\u001b[32m 914\u001b[39m \u001b[38;5;28mself\u001b[39m._fp.close()\n", "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/biomni_fixed/lib/python3.11/site-packages/urllib3/response.py:887\u001b[39m, in \u001b[36mHTTPResponse._fp_read\u001b[39m\u001b[34m(self, amt, read1)\u001b[39m\n\u001b[32m 884\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._fp.read1(amt) \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m._fp.read1()\n\u001b[32m 885\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 886\u001b[39m \u001b[38;5;66;03m# StringIO doesn't like amt=None\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m887\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_fp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m._fp.read()\n", "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/biomni_fixed/lib/python3.11/http/client.py:473\u001b[39m, in \u001b[36mHTTPResponse.read\u001b[39m\u001b[34m(self, amt)\u001b[39m\n\u001b[32m 470\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.length \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m amt > \u001b[38;5;28mself\u001b[39m.length:\n\u001b[32m 471\u001b[39m \u001b[38;5;66;03m# clip the read to the \"end of response\"\u001b[39;00m\n\u001b[32m 472\u001b[39m amt = 
\u001b[38;5;28mself\u001b[39m.length\n\u001b[32m--> \u001b[39m\u001b[32m473\u001b[39m s = \u001b[38;5;28mself\u001b[39m.fp.read(amt)\n\u001b[32m 474\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m s \u001b[38;5;129;01mand\u001b[39;00m amt:\n\u001b[32m 475\u001b[39m \u001b[38;5;66;03m# Ideally, we would raise IncompleteRead if the content-length\u001b[39;00m\n\u001b[32m 476\u001b[39m \u001b[38;5;66;03m# wasn't satisfied, but it might break compatibility.\u001b[39;00m\n\u001b[32m 477\u001b[39m \u001b[38;5;28mself\u001b[39m._close_conn()\n", "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/biomni_fixed/lib/python3.11/socket.py:718\u001b[39m, in \u001b[36mSocketIO.readinto\u001b[39m\u001b[34m(self, b)\u001b[39m\n\u001b[32m 716\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[32m 717\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m718\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sock\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 719\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[32m 720\u001b[39m \u001b[38;5;28mself\u001b[39m._timeout_occurred = \u001b[38;5;28;01mTrue\u001b[39;00m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/biomni_fixed/lib/python3.11/ssl.py:1314\u001b[39m, in \u001b[36mSSLSocket.recv_into\u001b[39m\u001b[34m(self, buffer, nbytes, flags)\u001b[39m\n\u001b[32m 1310\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m flags != \u001b[32m0\u001b[39m:\n\u001b[32m 1311\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 1312\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m %\n\u001b[32m 1313\u001b[39m 
\u001b[38;5;28mself\u001b[39m.\u001b[34m__class__\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1314\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnbytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1315\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1316\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m().recv_into(buffer, nbytes, flags)\n", "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/biomni_fixed/lib/python3.11/ssl.py:1166\u001b[39m, in \u001b[36mSSLSocket.read\u001b[39m\u001b[34m(self, len, buffer)\u001b[39m\n\u001b[32m 1164\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 1165\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m buffer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1166\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sslobj\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1167\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1168\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._sslobj.read(\u001b[38;5;28mlen\u001b[39m)\n", "\u001b[31mKeyboardInterrupt\u001b[39m: " ] } ], "source": [ "import sys\n", "\n", "sys.path.append(\"../\")\n", "\n", "from biomni.agent import A1\n", "\n", "agent = A1(path=\"./biomni_data\", llm=\"claude-sonnet-4-5-20250929\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, you can simply start prompting the agent with the desired biomedical research task!" 
] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using prompt-based retrieval with the agent's LLM\n", "================================\u001b[1m Human Message \u001b[0m=================================\n", "\n", "Plan a CRISPR screen to identify genes that regulate T cell exhaustion, \n", " measured by the change in T cell receptor (TCR) signaling between acute \n", " (interleukin-2 [IL-2] only) and chronic (anti-CD3 and IL-2) stimulation conditions. \n", " Generate 32 genes that maximize the perturbation effect.\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "I'll help you plan a CRISPR screen to identify genes that regulate T cell exhaustion. Let me break this down into a systematic approach.\n", "\n", "## Plan:\n", "\n", "1. [ ] Research T cell exhaustion biology and identify key pathways\n", "2. [ ] Query databases for genes involved in T cell exhaustion, TCR signaling, and chronic stimulation\n", "3. [ ] Identify genes from exhaustion-related pathways (checkpoint inhibitors, metabolic regulators, transcription factors)\n", "4. [ ] Select genes that are likely to show differential effects between acute vs chronic stimulation\n", "5. [ ] Design sgRNAs for the selected 32 genes\n", "6. 
[ ] Validate the gene list and provide experimental design recommendations\n", "\n", "Let me start by researching the biological basis of T cell exhaustion and identifying relevant genes.\n", "\n", "\n", "# First, let's import necessary libraries and query databases for T cell exhaustion-related genes\n", "from biomni.tool.database import query_uniprot, query_kegg, query_reactome, query_ensembl\n", "from biomni.tool.genomics import gene_set_enrichment_analysis, get_gene_set_enrichment_analysis_supported_database_list\n", "from biomni.tool.molecular_biology import design_knockout_sgrna\n", "import pandas as pd\n", "import numpy as np\n", "\n", "print(\"=== Step 1: Researching T cell exhaustion biology ===\")\n", "\n", "# Query for T cell exhaustion pathways\n", "exhaustion_pathways = query_reactome(\n", " prompt=\"Find pathways related to T cell exhaustion, TCR signaling, and chronic T cell stimulation\"\n", ")\n", "print(\"Reactome pathways for T cell exhaustion:\")\n", "print(exhaustion_pathways)\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "The output is too long to be added to context. Here are the first 10K characters...\n", "=== Step 1: Researching T cell exhaustion biology ===\n", "Reactome pathways for T cell exhaustion:\n", "{'success': True, 'query_info': {'endpoint': 'https://reactome.org/ContentService/search/query', 'method': 'GET', 'description': 'Search Reactome for pathways related to T cell exhaustion, T cell receptor signaling, and chronic T cell stimulation'}, 'result': {'results': [{'entries': [{'dbId': '202403', 'stId': 'R-HSA-202403', 'id': 'R-HSA-202403', 'name': 'TCR signaling', 'exactType': 'Pathway', 'species': ['Homo sapiens'], 'summation': 'The TCR is a multisubunit complex that consists of clonotypic alpha/beta chains noncovalently associated with the invariant CD3 delta/epsilon/gamma and TCR zeta chains. 
T cell activation by antigen presenting cells (APCs) results in the activation of protein tyrosine kinases (PTKs) that associate with CD3 and TCR zeta subunits and the co-receptor CD4. Members of the Src kinases (Lck), Syk kinases (ZAP-70), Tec (Itk) and Csk families of nonreceptor PTKs play a crucial role in T cell activation. Activation of PTKs following TCR engagement results in the recruitment and tyrosine phosphorylation of enzymes such as phospholipase C gamma1 and Vav as well as critical adaptor proteins such as LAT, SLP-76 and Gads. These proximal activation leads to reorganization of the cytoskeleton as well as transcription activation of multiple genes leading to T lymphocyte proliferation, differentiation and/or effector function. ', 'isDisease': False, 'disease': False, 'icon': False}, {'dbId': '10069478', 'stId': 'R-MMU-202403', 'id': 'R-MMU-202403', 'name': 'TCR signaling', 'exactType': 'Pathway', 'species': ['Mus musculus'], 'summation': \"This event has been computationally inferred from an event that has been demonstrated in another species.

The inference is based on the homology mapping from PANTHER. Briefly, reactions for which all involved PhysicalEntities (in input, output and catalyst) have a mapped orthologue/paralogue (for complexes at least 75% of components must have a mapping) are inferred to the other species. High level events are also inferred for these events to allow for easier navigation.

More details and caveats of the event inference in Reactome. For details on PANTHER see also: http://www.pantherdb.org/about.jsp\", 'isDisease': False, 'disease': False, 'icon': False}, {'dbId': '10185057', 'stId': 'R-RNO-202403', 'id': 'R-RNO-202403', 'name': 'TCR signaling', 'exactType': 'Pathway', 'species': ['Rattus norvegicus'], 'summation': \"This event has been computationally inferred from an event that has been demonstrated in another species.

The inference is based on the homology mapping from PANTHER. Briefly, reactions for which all involved PhysicalEntities (in input, output and catalyst) have a mapped orthologue/paralogue (for complexes at least 75% of components must have a mapping) are inferred to the other species. High level events are also inferred for these events to allow for easier navigation.

More details and caveats of the event inference in Reactome. For details on PANTHER see also: http://www.pantherdb.org/about.jsp\", 'isDisease': False, 'disease': False, 'icon': False}, {'dbId': '10293315', 'stId': 'R-CFA-202403', 'id': 'R-CFA-202403', 'name': 'TCR signaling', 'exactType': 'Pathway', 'species': ['Canis familiaris'], 'summation': \"This event has been computationally inferred from an event that has been demonstrated in another species.

The inference is based on the homology mapping from PANTHER. Briefly, reactions for which all involved PhysicalEntities (in input, output and catalyst) have a mapped orthologue/paralogue (for complexes at least 75% of components must have a mapping) are inferred to the other species. High level events are also inferred for these events to allow for easier navigation.

More details and caveats of the event inference in Reactome. For details on PANTHER see also: http://www.pantherdb.org/about.jsp\", 'isDisease': False, 'disease': False, 'icon': False}, {'dbId': '10407569', 'stId': 'R-BTA-202403', 'id': 'R-BTA-202403', 'name': 'TCR signaling', 'exactType': 'Pathway', 'species': ['Bos taurus'], 'summation': \"This event has been computationally inferred from an event that has been demonstrated in another species.

The inference is based on the homology mapping from PANTHER. Briefly, reactions for which all involved PhysicalEntities (in input, output and catalyst) have a mapped orthologue/paralogue (for complexes at least 75% of components must have a mapping) are inferred to the other species. High level events are also inferred for these events to allow for easier navigation.

More details and caveats of the event inference in Reactome. For details on PANTHER see also: http://www.pantherdb.org/about.jsp\", 'isDisease': False, 'disease': False, 'icon': False}, {'dbId': '10523735', 'stId': 'R-SSC-202403', 'id': 'R-SSC-202403', 'name': 'TCR signaling', 'exactType': 'Pathway', 'species': ['Sus scrofa'], 'summation': \"This event has been computationally inferred from an event that has been demonstrated in another species.

The inference is based on the homology mapping from PANTHER. Briefly, reactions for which all involved PhysicalEntities (in input, output and catalyst) have a mapped orthologue/paralogue (for complexes at least 75% of components must have a mapping) are inferred to the other species. High level events are also inferred for these events to allow for easier navigation.

More details and caveats of the event inference in Reactome. For details on PANTHER see also: http://www.pantherdb.org/about.jsp\", 'isDisease': False, 'disease': False, 'icon': False}, {'dbId': '10585787', 'stId': 'R-DRE-202403', 'id': 'R-DRE-202403', 'name': 'TCR signaling', 'exactType': 'Pathway', 'species': ['Danio rerio'], 'summation': \"This event has been computationally inferred from an event that has been demonstrated in another species.

The inference is based on the homology mapping from PANTHER. Briefly, reactions for which all involved PhysicalEntities (in input, output and catalyst) have a mapped orthologue/paralogue (for complexes at least 75% of components must have a mapping) are inferred to the other species. High level events are also inferred for these events to allow for easier navigation.

More details and caveats of the event inference in Reactome. For details on PANTHER see also: http://www.pantherdb.org/about.jsp\", 'isDisease': False, 'disease': False, 'icon': False}, {'dbId': '10661801', 'stId': 'R-XTR-202403', 'id': 'R-XTR-202403', 'name': 'TCR signaling', 'exactType': 'Pathway', 'species': ['Xenopus tropicalis'], 'summation': \"This event has been computationally inferred from an event that has been demonstrated in another species.

The inference is based on the homology mapping from PANTHER. Briefly, reactions for which all involved PhysicalEntities (in input, output and catalyst) have a mapped orthologue/paralogue (for complexes at least 75% of components must have a mapping) are inferred to the other species. High level events are also inferred for these events to allow for easier navigation.

More details and caveats of the event inference in Reactome. For details on PANTHER see also: http://www.pantherdb.org/about.jsp\", 'isDisease': False, 'disease': False, 'icon': False}, {'dbId': '10759741', 'stId': 'R-GGA-202403', 'id': 'R-GGA-202403', 'name': 'TCR signaling', 'exactType': 'Pathway', 'species': ['Gallus gallus'], 'summation': \"This event has been computationally inferred from an event that has been demonstrated in another species.

The inference is based on the homology mapping from PANTHER. Briefly, reactions for which all involved PhysicalEntities (in input, output and catalyst) have a mapped orthologue/paralogue (for complexes at least 75% of components must have a mapping) are inferred to the other species. High level events are also inferred for these events to allow for easier navigation.

More details and caveats of the event inference in Reactome. For details on PANTHER see also: http://www.pantherdb.org/about.jsp\", 'isDisease': False, 'disease': False, 'icon': False}, {'dbId': '10848043', 'stId': 'R-D\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "\n", "# Let's also query for immune checkpoint and T cell activation pathways\n", "print(\"\\n=== Querying for immune checkpoint pathways ===\")\n", "checkpoint_pathways = query_reactome(\n", " prompt=\"Find pathways related to immune checkpoint inhibition, PD-1, CTLA-4, and T cell activation\"\n", ")\n", "print(\"Checkpoint pathways found:\", len(checkpoint_pathways.get('result', {}).get('results', [])))\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "\n", "=== Querying for immune checkpoint pathways ===\n", "Checkpoint pathways found: 10\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "\n", "# Now let's identify key genes involved in T cell exhaustion from literature and databases\n", "print(\"=== Step 2: Identifying key T cell exhaustion genes ===\")\n", "\n", "# Key categories of genes involved in T cell exhaustion:\n", "# 1. Immune checkpoint receptors and ligands\n", "checkpoint_genes = [\n", " 'PDCD1', # PD-1\n", " 'CD274', # PD-L1\n", " 'PDCD1LG2', # PD-L2\n", " 'CTLA4', # CTLA-4\n", " 'HAVCR2', # TIM-3\n", " 'LAG3', # LAG-3\n", " 'TIGIT', # TIGIT\n", " 'CD96', # CD96\n", " 'BTLA', # BTLA\n", " 'CD160' # CD160\n", "]\n", "\n", "# 2. Transcription factors regulating exhaustion\n", "transcription_factors = [\n", " 'TOX', # TOX - master regulator of exhaustion\n", " 'TOX2', # TOX2\n", " 'EOMES', # Eomesodermin\n", " 'TBX21', # T-bet\n", " 'BATF', # BATF\n", " 'IRF4', # IRF4\n", " 'NFAT5', # NFAT5\n", " 'NFATC1', # NFATC1\n", " 'RUNX3', # RUNX3\n", " 'STAT3' # STAT3\n", "]\n", "\n", "# 3. 
Metabolic regulators\n", "metabolic_genes = [\n", " 'PPAR', # PPAR family\n", " 'HIF1A', # HIF-1α\n", " 'MYC', # c-Myc\n", " 'MTOR', # mTOR\n", " 'AMPK', # AMPK (PRKAA1)\n", " 'LDHA', # Lactate dehydrogenase A\n", " 'PKM', # Pyruvate kinase M\n", " 'GLUT1' # Glucose transporter 1 (SLC2A1)\n", "]\n", "\n", "# Let's refine the metabolic genes with proper gene symbols\n", "metabolic_genes_refined = [\n", " 'PPARA', # PPAR alpha\n", " 'HIF1A', # HIF-1α\n", " 'MYC', # c-Myc\n", " 'MTOR', # mTOR\n", " 'PRKAA1', # AMPK alpha 1\n", " 'LDHA', # Lactate dehydrogenase A\n", " 'PKM', # Pyruvate kinase M\n", " 'SLC2A1' # GLUT1\n", "]\n", "\n", "# 4. TCR signaling components\n", "tcr_signaling_genes = [\n", " 'LCK', # Lck\n", " 'ZAP70', # ZAP-70\n", " 'LAT', # LAT\n", " 'SLP76', # SLP-76 (LCP2)\n", " 'PLCG1', # PLCγ1\n", " 'VAV1', # Vav1\n", " 'ITK', # Itk\n", " 'CD3E', # CD3ε\n", " 'CD3G', # CD3γ\n", " 'CD3D' # CD3δ\n", "]\n", "\n", "# Refine TCR signaling genes\n", "tcr_signaling_genes_refined = [\n", " 'LCK',\n", " 'ZAP70',\n", " 'LAT',\n", " 'LCP2', # SLP-76\n", " 'PLCG1',\n", " 'VAV1',\n", " 'ITK',\n", " 'CD3E',\n", " 'CD3G',\n", " 'CD3D'\n", "]\n", "\n", "print(\"Checkpoint genes:\", len(checkpoint_genes))\n", "print(\"Transcription factors:\", len(transcription_factors))\n", "print(\"Metabolic genes:\", len(metabolic_genes_refined))\n", "print(\"TCR signaling genes:\", len(tcr_signaling_genes_refined))\n", "\n", "# Combine all categories\n", "all_candidate_genes = checkpoint_genes + transcription_factors + metabolic_genes_refined + tcr_signaling_genes_refined\n", "print(f\"\\nTotal candidate genes: {len(all_candidate_genes)}\")\n", "print(\"All candidates:\", all_candidate_genes)\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "=== Step 2: Identifying key T cell exhaustion genes ===\n", "Checkpoint genes: 10\n", "Transcription factors: 10\n", "Metabolic genes: 8\n", "TCR signaling genes: 10\n", "\n", 
"Total candidate genes: 38\n", "All candidates: ['PDCD1', 'CD274', 'PDCD1LG2', 'CTLA4', 'HAVCR2', 'LAG3', 'TIGIT', 'CD96', 'BTLA', 'CD160', 'TOX', 'TOX2', 'EOMES', 'TBX21', 'BATF', 'IRF4', 'NFAT5', 'NFATC1', 'RUNX3', 'STAT3', 'PPARA', 'HIF1A', 'MYC', 'MTOR', 'PRKAA1', 'LDHA', 'PKM', 'SLC2A1', 'LCK', 'ZAP70', 'LAT', 'LCP2', 'PLCG1', 'VAV1', 'ITK', 'CD3E', 'CD3G', 'CD3D']\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "\n", "# Now let's prioritize and select the most impactful 32 genes\n", "print(\"=== Step 3: Prioritizing genes for maximum perturbation effect ===\")\n", "\n", "# Priority scoring based on:\n", "# 1. Known role in T cell exhaustion (literature evidence)\n", "# 2. Potential for differential effect between acute vs chronic stimulation\n", "# 3. Druggability/therapeutic relevance\n", "\n", "high_priority_genes = {\n", " # Master regulators of exhaustion (highest priority)\n", " 'TOX': {'priority': 10, 'category': 'Master TF', 'rationale': 'Master regulator of T cell exhaustion'},\n", " 'PDCD1': {'priority': 10, 'category': 'Checkpoint', 'rationale': 'PD-1, primary exhaustion checkpoint'},\n", " 'EOMES': {'priority': 9, 'category': 'TF', 'rationale': 'Key exhaustion transcription factor'},\n", " 'HAVCR2': {'priority': 9, 'category': 'Checkpoint', 'rationale': 'TIM-3, critical exhaustion marker'},\n", " \n", " # High impact checkpoint receptors\n", " 'LAG3': {'priority': 9, 'category': 'Checkpoint', 'rationale': 'LAG-3, major exhaustion checkpoint'},\n", " 'TIGIT': {'priority': 8, 'category': 'Checkpoint', 'rationale': 'TIGIT, emerging exhaustion target'},\n", " 'CTLA4': {'priority': 8, 'category': 'Checkpoint', 'rationale': 'CTLA-4, early checkpoint inhibitor'},\n", " \n", " # Transcription factors\n", " 'BATF': {'priority': 8, 'category': 'TF', 'rationale': 'BATF, exhaustion-promoting TF'},\n", " 'TBX21': {'priority': 7, 'category': 'TF', 'rationale': 'T-bet, effector vs exhaustion 
balance'},\n", " 'IRF4': {'priority': 7, 'category': 'TF', 'rationale': 'IRF4, T cell differentiation'},\n", " 'NFATC1': {'priority': 7, 'category': 'TF', 'rationale': 'NFAT, chronic stimulation response'},\n", " 'TOX2': {'priority': 7, 'category': 'TF', 'rationale': 'TOX2, TOX family member'},\n", " \n", " # Metabolic regulators (key for chronic vs acute differences)\n", " 'HIF1A': {'priority': 8, 'category': 'Metabolism', 'rationale': 'HIF-1α, hypoxia and exhaustion'},\n", " 'MYC': {'priority': 8, 'category': 'Metabolism', 'rationale': 'c-Myc, metabolic reprogramming'},\n", " 'MTOR': {'priority': 7, 'category': 'Metabolism', 'rationale': 'mTOR, metabolic sensing'},\n", " 'PRKAA1': {'priority': 7, 'category': 'Metabolism', 'rationale': 'AMPK, energy sensing'},\n", " 'LDHA': {'priority': 6, 'category': 'Metabolism', 'rationale': 'Lactate metabolism in exhaustion'},\n", " \n", " # TCR signaling (differential between acute/chronic)\n", " 'LCK': {'priority': 8, 'category': 'TCR', 'rationale': 'Lck, proximal TCR signaling'},\n", " 'ZAP70': {'priority': 8, 'category': 'TCR', 'rationale': 'ZAP-70, TCR signal strength'},\n", " 'LAT': {'priority': 7, 'category': 'TCR', 'rationale': 'LAT, TCR signaling hub'},\n", " 'PLCG1': {'priority': 7, 'category': 'TCR', 'rationale': 'PLCγ1, downstream TCR signaling'},\n", " 'LCP2': {'priority': 6, 'category': 'TCR', 'rationale': 'SLP-76, TCR signaling adaptor'},\n", " \n", " # Additional checkpoint molecules\n", " 'CD274': {'priority': 6, 'category': 'Checkpoint', 'rationale': 'PD-L1, PD-1 ligand'},\n", " 'BTLA': {'priority': 6, 'category': 'Checkpoint', 'rationale': 'BTLA, inhibitory receptor'},\n", " 'CD96': {'priority': 6, 'category': 'Checkpoint', 'rationale': 'CD96, TIGIT pathway'},\n", " \n", " # Additional metabolic/signaling\n", " 'STAT3': {'priority': 7, 'category': 'Signaling', 'rationale': 'STAT3, cytokine signaling'},\n", " 'VAV1': {'priority': 6, 'category': 'TCR', 'rationale': 'Vav1, TCR signaling'},\n", " 'ITK': 
{'priority': 6, 'category': 'TCR', 'rationale': 'Itk, TCR signaling kinase'},\n", " 'RUNX3': {'priority': 6, 'category': 'TF', 'rationale': 'RUNX3, T cell differentiation'},\n", " 'PKM': {'priority': 5, 'category': 'Metabolism', 'rationale': 'Pyruvate kinase, glycolysis'},\n", " 'SLC2A1': {'priority': 5, 'category': 'Metabolism', 'rationale': 'GLUT1, glucose uptake'},\n", " 'PPARA': {'priority': 5, 'category': 'Metabolism', 'rationale': 'PPARα, fatty acid oxidation'},\n", " 'CD160': {'priority': 5, 'category': 'Checkpoint', 'rationale': 'CD160, inhibitory receptor'}\n", "}\n", "\n", "# Sort by priority and select top 32\n", "sorted_genes = sorted(high_priority_genes.items(), key=lambda x: x[1]['priority'], reverse=True)\n", "selected_32_genes = [gene for gene, info in sorted_genes[:32]]\n", "\n", "print(f\"Selected 32 genes for CRISPR screen:\")\n", "for i, (gene, info) in enumerate(sorted_genes[:32], 1):\n", " print(f\"{i:2d}. {gene:<8} (Priority: {info['priority']}, {info['category']}) - {info['rationale']}\")\n", "\n", "print(f\"\\nFinal gene list: {selected_32_genes}\")\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "=== Step 3: Prioritizing genes for maximum perturbation effect ===\n", "Selected 32 genes for CRISPR screen:\n", " 1. TOX (Priority: 10, Master TF) - Master regulator of T cell exhaustion\n", " 2. PDCD1 (Priority: 10, Checkpoint) - PD-1, primary exhaustion checkpoint\n", " 3. EOMES (Priority: 9, TF) - Key exhaustion transcription factor\n", " 4. HAVCR2 (Priority: 9, Checkpoint) - TIM-3, critical exhaustion marker\n", " 5. LAG3 (Priority: 9, Checkpoint) - LAG-3, major exhaustion checkpoint\n", " 6. TIGIT (Priority: 8, Checkpoint) - TIGIT, emerging exhaustion target\n", " 7. CTLA4 (Priority: 8, Checkpoint) - CTLA-4, early checkpoint inhibitor\n", " 8. BATF (Priority: 8, TF) - BATF, exhaustion-promoting TF\n", " 9. 
HIF1A (Priority: 8, Metabolism) - HIF-1α, hypoxia and exhaustion\n", "10. MYC (Priority: 8, Metabolism) - c-Myc, metabolic reprogramming\n", "11. LCK (Priority: 8, TCR) - Lck, proximal TCR signaling\n", "12. ZAP70 (Priority: 8, TCR) - ZAP-70, TCR signal strength\n", "13. TBX21 (Priority: 7, TF) - T-bet, effector vs exhaustion balance\n", "14. IRF4 (Priority: 7, TF) - IRF4, T cell differentiation\n", "15. NFATC1 (Priority: 7, TF) - NFAT, chronic stimulation response\n", "16. TOX2 (Priority: 7, TF) - TOX2, TOX family member\n", "17. MTOR (Priority: 7, Metabolism) - mTOR, metabolic sensing\n", "18. PRKAA1 (Priority: 7, Metabolism) - AMPK, energy sensing\n", "19. LAT (Priority: 7, TCR) - LAT, TCR signaling hub\n", "20. PLCG1 (Priority: 7, TCR) - PLCγ1, downstream TCR signaling\n", "21. STAT3 (Priority: 7, Signaling) - STAT3, cytokine signaling\n", "22. LDHA (Priority: 6, Metabolism) - Lactate metabolism in exhaustion\n", "23. LCP2 (Priority: 6, TCR) - SLP-76, TCR signaling adaptor\n", "24. CD274 (Priority: 6, Checkpoint) - PD-L1, PD-1 ligand\n", "25. BTLA (Priority: 6, Checkpoint) - BTLA, inhibitory receptor\n", "26. CD96 (Priority: 6, Checkpoint) - CD96, TIGIT pathway\n", "27. VAV1 (Priority: 6, TCR) - Vav1, TCR signaling\n", "28. ITK (Priority: 6, TCR) - Itk, TCR signaling kinase\n", "29. RUNX3 (Priority: 6, TF) - RUNX3, T cell differentiation\n", "30. PKM (Priority: 5, Metabolism) - Pyruvate kinase, glycolysis\n", "31. SLC2A1 (Priority: 5, Metabolism) - GLUT1, glucose uptake\n", "32. PPARA (Priority: 5, Metabolism) - PPARα, fatty acid oxidation\n", "\n", "Final gene list: ['TOX', 'PDCD1', 'EOMES', 'HAVCR2', 'LAG3', 'TIGIT', 'CTLA4', 'BATF', 'HIF1A', 'MYC', 'LCK', 'ZAP70', 'TBX21', 'IRF4', 'NFATC1', 'TOX2', 'MTOR', 'PRKAA1', 'LAT', 'PLCG1', 'STAT3', 'LDHA', 'LCP2', 'CD274', 'BTLA', 'CD96', 'VAV1', 'ITK', 'RUNX3', 'PKM', 'SLC2A1', 'PPARA']\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "1. 
[✓] Research T cell exhaustion biology and identify key pathways (completed)\n", "2. [✓] Query databases for genes involved in T cell exhaustion, TCR signaling, and chronic stimulation (completed)\n", "3. [✓] Identify genes from exhaustion-related pathways (checkpoint inhibitors, metabolic regulators, transcription factors) (completed)\n", "4. [✓] Select genes that are likely to show differential effects between acute vs chronic stimulation (completed)\n", "5. [ ] Design sgRNAs for the selected 32 genes\n", "6. [ ] Validate the gene list and provide experimental design recommendations\n", "\n", "\n", "# Now let's design sgRNAs for our selected genes\n", "print(\"=== Step 4: Designing sgRNAs for selected genes ===\")\n", "\n", "data_lake_path = \"/dfs/project/bioagentos/biomni_data_test/biomni_data/data_lake\"\n", "\n", "# We'll design sgRNAs for each of our 32 selected genes\n", "selected_genes = ['TOX', 'PDCD1', 'EOMES', 'HAVCR2', 'LAG3', 'TIGIT', 'CTLA4', 'BATF', 'HIF1A', 'MYC', \n", " 'LCK', 'ZAP70', 'TBX21', 'IRF4', 'NFATC1', 'TOX2', 'MTOR', 'PRKAA1', 'LAT', 'PLCG1', \n", " 'STAT3', 'LDHA', 'LCP2', 'CD274', 'BTLA', 'CD96', 'VAV1', 'ITK', 'RUNX3', 'PKM', \n", " 'SLC2A1', 'PPARA']\n", "\n", "print(f\"Designing sgRNAs for {len(selected_genes)} genes...\")\n", "\n", "# Let's try to design sgRNAs for a few key genes as examples\n", "key_genes_for_sgrna = ['TOX', 'PDCD1', 'EOMES', 'HAVCR2', 'LAG3']\n", "\n", "sgrna_results = {}\n", "for gene in key_genes_for_sgrna:\n", " try:\n", " print(f\"\\nDesigning sgRNAs for {gene}...\")\n", " result = design_knockout_sgrna(\n", " gene_name=gene,\n", " data_lake_path=data_lake_path,\n", " species=\"human\",\n", " num_guides=3 # Get 3 guides per gene for redundancy\n", " )\n", " sgrna_results[gene] = result\n", " print(f\"Successfully designed sgRNAs for {gene}\")\n", " if result:\n", " print(f\"Result type: {type(result)}\")\n", " except Exception as e:\n", " print(f\"Error designing sgRNAs for {gene}: {str(e)}\")\n", " 
sgrna_results[gene] = None\n", "\n", "print(f\"\\nsgRNA design completed for {len(sgrna_results)} genes\")\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "=== Step 4: Designing sgRNAs for selected genes ===\n", "Designing sgRNAs for 32 genes...\n", "\n", "Designing sgRNAs for TOX...\n", "Error designing sgRNAs for TOX: Library file for human not found at path: /dfs/project/bioagentos/biomni_data_test/biomni_data/data_lake/sgRNA/KO_SP_human.txt\n", "\n", "Designing sgRNAs for PDCD1...\n", "Error designing sgRNAs for PDCD1: Library file for human not found at path: /dfs/project/bioagentos/biomni_data_test/biomni_data/data_lake/sgRNA/KO_SP_human.txt\n", "\n", "Designing sgRNAs for EOMES...\n", "Error designing sgRNAs for EOMES: Library file for human not found at path: /dfs/project/bioagentos/biomni_data_test/biomni_data/data_lake/sgRNA/KO_SP_human.txt\n", "\n", "Designing sgRNAs for HAVCR2...\n", "Error designing sgRNAs for HAVCR2: Library file for human not found at path: /dfs/project/bioagentos/biomni_data_test/biomni_data/data_lake/sgRNA/KO_SP_human.txt\n", "\n", "Designing sgRNAs for LAG3...\n", "Error designing sgRNAs for LAG3: Library file for human not found at path: /dfs/project/bioagentos/biomni_data_test/biomni_data/data_lake/sgRNA/KO_SP_human.txt\n", "\n", "sgRNA design completed for 5 genes\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "\n", "# Let's check what's available in the data lake for sgRNA design\n", "import os\n", "\n", "print(\"=== Checking data lake contents for sgRNA libraries ===\")\n", "data_lake_path = \"/dfs/project/bioagentos/biomni_data_test/biomni_data/data_lake\"\n", "\n", "# Check if the directory exists\n", "if os.path.exists(data_lake_path):\n", " print(f\"Data lake path exists: {data_lake_path}\")\n", " \n", " # List contents\n", " contents = os.listdir(data_lake_path)\n", " 
print(f\"Contents of data lake: {contents}\")\n", " \n", " # Check for sgRNA directory\n", " sgrna_path = os.path.join(data_lake_path, \"sgRNA\")\n", " if os.path.exists(sgrna_path):\n", " print(f\"sgRNA directory exists\")\n", " sgrna_contents = os.listdir(sgrna_path)\n", " print(f\"sgRNA directory contents: {sgrna_contents}\")\n", " else:\n", " print(\"sgRNA directory does not exist\")\n", " \n", " # Check for any files that might contain sgRNA data\n", " for item in contents:\n", " item_path = os.path.join(data_lake_path, item)\n", " if os.path.isfile(item_path) and ('sgrna' in item.lower() or 'guide' in item.lower() or 'crispr' in item.lower()):\n", " print(f\"Found potential sgRNA file: {item}\")\n", "else:\n", " print(f\"Data lake path does not exist: {data_lake_path}\")\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "=== Checking data lake contents for sgRNA libraries ===\n", "Data lake path exists: /dfs/project/bioagentos/biomni_data_test/biomni_data/data_lake\n", "Contents of data lake: ['co-fractionation.csv', 'Cosmic_CompleteCNA_v101_GRCh38.tsv.gz', 'Cosmic_Breakpoints_v101_GRCh38.csv', 'two-hybrid.csv', 'Cosmic_CompleteGeneExpression_v101_GRCh38.tsv.gz', 'broad_repurposing_hub_phase_moa_target_info.csv', 'hp.obo', 'BindingDB_All_202409.tsv', 'genebass_synonymous_filtered.pkl', 'omim.csv', 'miRTarBase_microRNA_target_interaction.csv', 'gtex_tissue_gene_tpm.csv', 'Cosmic_CompleteDifferentialMethylation_v101_GRCh38.tsv.gz', 'synthetic_lethality.csv', 'proximity_label-ms.csv', 'synthetic_growth_defect.csv', 'broad_repurposing_hub_molecule_with_smiles.csv', 'czi_census_datasets_v4.csv', 'mousemine_mh_hallmark_geneset.csv', 'mousemine_m1_positional_geneset.csv', 'genebass_pLoF_filtered.pkl', 'Virus-Host_PPI_P-HIPSTER_2020.csv', 'msigdb_human_c8_celltype_signature_geneset.csv', 'msigdb_human_c6_oncogenic_signature_geneset.csv', 'Cosmic_CancerGeneCensus_v101_GRCh38.csv', 
'gwas_catalog.pkl', 'miRTarBase_microRNA_target_interaction_pubmed_abtract.txt', 'Cosmic_CancerGeneCensusHallmarksOfCancer_v101_GRCh38.csv', 'mousemine_m5_ontology_geneset.csv', 'go-plus.json', 'gene_info.csv', 'msigdb_human_h_hallmark_geneset.csv', 'msigdb_human_c5_ontology_geneset.csv', 'variant_table.csv', 'enamine_cloud_library_smiles.pkl', 'Cosmic_MutantCensus_v101_GRCh38.csv', 'reconstituted_complex.csv', 'DisGeNET.csv', 'msigdb_human_c3_regulatory_target_geneset.csv', 'genebass_missense_LC_filtered.pkl', 'Cosmic_Genes_v101_GRCh38.csv', 'mousemine_m3_regulatory_target_geneset.csv', 'msigdb_human_c1_positional_geneset.csv', 'msigdb_human_c4_computational_geneset.csv', 'miRDB_v6.0_results.csv', 'Cosmic_ResistanceMutations_v101_GRCh38.csv', 'msigdb_human_c2_curated_geneset.csv', 'Cosmic_GenomeScreensMutant_v101_GRCh38.tsv.gz', 'mousemine_m2_curated_geneset.csv', 'affinity_capture-ms.csv', 'msigdb_human_c3_subset_transcription_factor_targets_from_GTRD.csv', 'miRTarBase_MicroRNA_Target_Sites.csv', 'msigdb_human_c7_immunologic_signature_geneset.csv', 'McPAS-TCR.csv', 'dosage_growth_defect.csv', 'Cosmic_Classification_v101_GRCh38.csv', 'affinity_capture-rna.csv', 'proteinatlas.tsv', 'genetic_interaction.csv', 'synthetic_rescue.csv', 'Cosmic_Fusion_v101_GRCh38.csv', 'Cosmic_ClassificationPaper_v101_GRCh38.csv', 'mousemine_m8_celltype_signature_geneset.csv', 'marker_celltype.csv']\n", "sgRNA directory does not exist\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "5. [✗] Design sgRNAs for the selected 32 genes (failed because sgRNA library not available in data lake)\n", "5. [ ] Provide theoretical sgRNA design recommendations and validate gene selection\n", "6. 
[ ] Provide comprehensive experimental design recommendations\n", "\n", "\n", "# Since we don't have access to the sgRNA library, let's provide theoretical sgRNA design recommendations\n", "# and create a comprehensive experimental plan\n", "\n", "print(\"=== Step 5: Theoretical sgRNA Design and Validation ===\")\n", "\n", "selected_genes = ['TOX', 'PDCD1', 'EOMES', 'HAVCR2', 'LAG3', 'TIGIT', 'CTLA4', 'BATF', 'HIF1A', 'MYC', \n", " 'LCK', 'ZAP70', 'TBX21', 'IRF4', 'NFATC1', 'TOX2', 'MTOR', 'PRKAA1', 'LAT', 'PLCG1', \n", " 'STAT3', 'LDHA', 'LCP2', 'CD274', 'BTLA', 'CD96', 'VAV1', 'ITK', 'RUNX3', 'PKM', \n", " 'SLC2A1', 'PPARA']\n", "\n", "# Categorize genes by functional category for analysis\n", "gene_categories = {\n", " 'Immune Checkpoints': ['PDCD1', 'HAVCR2', 'LAG3', 'TIGIT', 'CTLA4', 'CD274', 'BTLA', 'CD96'],\n", " 'Transcription Factors': ['TOX', 'EOMES', 'BATF', 'TBX21', 'IRF4', 'NFATC1', 'TOX2', 'RUNX3'],\n", " 'Metabolic Regulators': ['HIF1A', 'MYC', 'MTOR', 'PRKAA1', 'LDHA', 'PKM', 'SLC2A1', 'PPARA'],\n", " 'TCR Signaling': ['LCK', 'ZAP70', 'LAT', 'PLCG1', 'LCP2', 'VAV1', 'ITK'],\n", " 'Cytokine Signaling': ['STAT3']\n", "}\n", "\n", "print(\"Gene categorization:\")\n", "for category, genes in gene_categories.items():\n", " print(f\"{category}: {len(genes)} genes\")\n", " print(f\" {', '.join(genes)}\")\n", " print()\n", "\n", "# Expected effects on T cell exhaustion\n", "expected_effects = {\n", " 'TOX': 'KO should reduce exhaustion (master exhaustion TF)',\n", " 'PDCD1': 'KO should reduce exhaustion (remove PD-1 checkpoint)',\n", " 'EOMES': 'KO should reduce exhaustion (exhaustion-promoting TF)',\n", " 'HAVCR2': 'KO should reduce exhaustion (remove TIM-3 checkpoint)',\n", " 'LAG3': 'KO should reduce exhaustion (remove LAG-3 checkpoint)',\n", " 'TIGIT': 'KO should reduce exhaustion (remove TIGIT checkpoint)',\n", " 'CTLA4': 'KO should enhance early activation',\n", " 'BATF': 'KO should reduce exhaustion (exhaustion-promoting TF)',\n", " 'HIF1A': 'KO may 
reduce exhaustion (metabolic stress response)',\n", " 'MYC': 'KO may increase exhaustion (metabolic reprogramming)',\n", " 'LCK': 'KO should reduce TCR signaling strength',\n", " 'ZAP70': 'KO should reduce TCR signaling strength',\n", " 'TBX21': 'KO should shift toward exhaustion (effector TF)',\n", " 'IRF4': 'KO may affect T cell differentiation balance',\n", " 'NFATC1': 'KO may reduce chronic stimulation response',\n", " 'MTOR': 'KO should affect metabolic reprogramming',\n", " 'STAT3': 'KO should affect cytokine responsiveness'\n", "}\n", "\n", "print(\"Expected functional outcomes:\")\n", "for gene in selected_genes[:17]: # Show first 17 as examples\n", " if gene in expected_effects:\n", " print(f\"{gene}: {expected_effects[gene]}\")\n", "\n", "print(\"\\nsgRNA Design Recommendations:\")\n", "print(\"1. Use 3-4 sgRNAs per gene for redundancy\")\n", "print(\"2. Target early exons to ensure knockout\")\n", "print(\"3. Include non-targeting controls (scrambled guides)\")\n", "print(\"4. Include positive controls (known exhaustion regulators)\")\n", "print(\"5. 
Use established sgRNA design tools (e.g., Broad GPP portal, CHOPCHOP)\")\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "=== Step 5: Theoretical sgRNA Design and Validation ===\n", "Gene categorization:\n", "Immune Checkpoints: 8 genes\n", " PDCD1, HAVCR2, LAG3, TIGIT, CTLA4, CD274, BTLA, CD96\n", "\n", "Transcription Factors: 8 genes\n", " TOX, EOMES, BATF, TBX21, IRF4, NFATC1, TOX2, RUNX3\n", "\n", "Metabolic Regulators: 8 genes\n", " HIF1A, MYC, MTOR, PRKAA1, LDHA, PKM, SLC2A1, PPARA\n", "\n", "TCR Signaling: 7 genes\n", " LCK, ZAP70, LAT, PLCG1, LCP2, VAV1, ITK\n", "\n", "Cytokine Signaling: 1 genes\n", " STAT3\n", "\n", "Expected functional outcomes:\n", "TOX: KO should reduce exhaustion (master exhaustion TF)\n", "PDCD1: KO should reduce exhaustion (remove PD-1 checkpoint)\n", "EOMES: KO should reduce exhaustion (exhaustion-promoting TF)\n", "HAVCR2: KO should reduce exhaustion (remove TIM-3 checkpoint)\n", "LAG3: KO should reduce exhaustion (remove LAG-3 checkpoint)\n", "TIGIT: KO should reduce exhaustion (remove TIGIT checkpoint)\n", "CTLA4: KO should enhance early activation\n", "BATF: KO should reduce exhaustion (exhaustion-promoting TF)\n", "HIF1A: KO may reduce exhaustion (metabolic stress response)\n", "MYC: KO may increase exhaustion (metabolic reprogramming)\n", "LCK: KO should reduce TCR signaling strength\n", "ZAP70: KO should reduce TCR signaling strength\n", "TBX21: KO should shift toward exhaustion (effector TF)\n", "IRF4: KO may affect T cell differentiation balance\n", "NFATC1: KO may reduce chronic stimulation response\n", "MTOR: KO should affect metabolic reprogramming\n", "\n", "sgRNA Design Recommendations:\n", "1. Use 3-4 sgRNAs per gene for redundancy\n", "2. Target early exons to ensure knockout\n", "3. Include non-targeting controls (scrambled guides)\n", "4. Include positive controls (known exhaustion regulators)\n", "5. 
Use established sgRNA design tools (e.g., Broad GPP portal, CHOPCHOP)\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "\n", "# Now let's create a comprehensive experimental design\n", "print(\"=== Step 6: Comprehensive Experimental Design for CRISPR Screen ===\")\n", "\n", "experimental_design = {\n", " 'objective': 'Identify genes regulating T cell exhaustion through differential TCR signaling between acute and chronic stimulation',\n", " 'readout': 'TCR signaling strength measured by phospho-flow cytometry, cytokine production, and transcriptional markers',\n", " 'conditions': {\n", " 'acute': 'IL-2 only (maintains T cell survival without chronic stimulation)',\n", " 'chronic': 'Anti-CD3 + IL-2 (chronic TCR stimulation leading to exhaustion)'\n", " }\n", "}\n", "\n", "print(\"EXPERIMENTAL DESIGN OVERVIEW\")\n", "print(\"=\"*50)\n", "print(f\"Objective: {experimental_design['objective']}\")\n", "print(f\"Primary Readout: {experimental_design['readout']}\")\n", "print()\n", "\n", "print(\"STIMULATION CONDITIONS:\")\n", "print(f\"1. Acute condition: {experimental_design['conditions']['acute']}\")\n", "print(f\"2. 
Chronic condition: {experimental_design['conditions']['chronic']}\")\n", "print()\n", "\n", "print(\"SELECTED 32 GENES FOR CRISPR SCREEN:\")\n", "print(\"=\"*50)\n", "\n", "# Create a detailed gene table\n", "gene_info = []\n", "for i, gene in enumerate(selected_genes, 1):\n", " category = None\n", " for cat, genes in gene_categories.items():\n", " if gene in genes:\n", " category = cat\n", " break\n", " \n", " gene_info.append({\n", " 'rank': i,\n", " 'gene': gene,\n", " 'category': category,\n", " 'expected_effect': expected_effects.get(gene, 'To be determined')\n", " })\n", "\n", "# Print gene table\n", "print(f\"{'Rank':<4} {'Gene':<8} {'Category':<20} {'Expected Effect'}\")\n", "print(\"-\" * 80)\n", "for info in gene_info:\n", " print(f\"{info['rank']:<4} {info['gene']:<8} {info['category']:<20} {info['expected_effect'][:50]}\")\n", "\n", "print()\n", "print(\"EXPERIMENTAL PROTOCOL:\")\n", "print(\"=\"*50)\n", "\n", "protocol_steps = [\n", " \"1. T Cell Isolation and Activation\",\n", " \" - Isolate primary CD8+ T cells from healthy donors\",\n", " \" - Activate with anti-CD3/CD28 for 2-3 days\",\n", " \"\",\n", " \"2. CRISPR-Cas9 Delivery\",\n", " \" - Electroporate Cas9 RNP complexes with sgRNA library\",\n", " \" - 3-4 sgRNAs per gene + non-targeting controls\",\n", " \" - Allow 3-5 days for gene knockout\",\n", " \"\",\n", " \"3. Stimulation Conditions (7-14 days)\",\n", " \" - Acute: IL-2 (100 U/ml) only\",\n", " \" - Chronic: Anti-CD3 (1 μg/ml) + IL-2 (100 U/ml)\",\n", " \" - Refresh media every 2-3 days\",\n", " \"\",\n", " \"4. Readout Measurements\",\n", " \" - TCR signaling: Phospho-flow for pZAP70, pERK, pS6\",\n", " \" - Exhaustion markers: PD-1, TIM-3, LAG-3 expression\",\n", " \" - Function: IFN-γ, TNF-α, IL-2 production\",\n", " \" - Transcription: RNA-seq for exhaustion signatures\",\n", " \"\",\n", " \"5. 
Data Analysis\",\n", " \" - Calculate perturbation scores for each gene\",\n", " \" - Identify genes with differential effects between conditions\",\n", " \" - Validate top hits with individual knockouts\"\n", "]\n", "\n", "for step in protocol_steps:\n", " print(step)\n", "\n", "print()\n", "print(\"KEY CONTROLS:\")\n", "print(\"=\"*20)\n", "controls = [\n", " \"- Non-targeting sgRNAs (negative control)\",\n", " \"- Known exhaustion genes (positive control): TOX, PDCD1\",\n", " \"- Essential genes (toxicity control): RPL23, RPS19\",\n", " \"- Unstimulated cells (baseline)\",\n", " \"- Wild-type cells (no CRISPR)\"\n", "]\n", "\n", "for control in controls:\n", " print(control)\n", "\n", "print()\n", "print(\"EXPECTED OUTCOMES:\")\n", "print(\"=\"*20)\n", "outcomes = [\n", " \"1. Genes that reduce exhaustion when knocked out:\",\n", " \" - Should show enhanced TCR signaling in chronic condition\",\n", " \" - Examples: TOX, PDCD1, HAVCR2, LAG3, EOMES\",\n", " \"\",\n", " \"2. Genes that promote effector function:\",\n", " \" - Should show reduced signaling when knocked out\",\n", " \" - Examples: TBX21, MYC (context-dependent)\",\n", " \"\",\n", " \"3. Metabolic regulators:\",\n", " \" - Should show differential effects on sustained signaling\",\n", " \" - Examples: MTOR, HIF1A, PRKAA1\",\n", " \"\",\n", " \"4. 
TCR signaling components:\",\n", " \" - Should show proportional effects on signaling strength\",\n", " \" - Examples: LCK, ZAP70, LAT\"\n", "]\n", "\n", "for outcome in outcomes:\n", " print(outcome)\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "=== Step 6: Comprehensive Experimental Design for CRISPR Screen ===\n", "EXPERIMENTAL DESIGN OVERVIEW\n", "==================================================\n", "Objective: Identify genes regulating T cell exhaustion through differential TCR signaling between acute and chronic stimulation\n", "Primary Readout: TCR signaling strength measured by phospho-flow cytometry, cytokine production, and transcriptional markers\n", "\n", "STIMULATION CONDITIONS:\n", "1. Acute condition: IL-2 only (maintains T cell survival without chronic stimulation)\n", "2. Chronic condition: Anti-CD3 + IL-2 (chronic TCR stimulation leading to exhaustion)\n", "\n", "SELECTED 32 GENES FOR CRISPR SCREEN:\n", "==================================================\n", "Rank Gene Category Expected Effect\n", "--------------------------------------------------------------------------------\n", "1 TOX Transcription Factors KO should reduce exhaustion (master exhaustion TF)\n", "2 PDCD1 Immune Checkpoints KO should reduce exhaustion (remove PD-1 checkpoin\n", "3 EOMES Transcription Factors KO should reduce exhaustion (exhaustion-promoting \n", "4 HAVCR2 Immune Checkpoints KO should reduce exhaustion (remove TIM-3 checkpoi\n", "5 LAG3 Immune Checkpoints KO should reduce exhaustion (remove LAG-3 checkpoi\n", "6 TIGIT Immune Checkpoints KO should reduce exhaustion (remove TIGIT checkpoi\n", "7 CTLA4 Immune Checkpoints KO should enhance early activation\n", "8 BATF Transcription Factors KO should reduce exhaustion (exhaustion-promoting \n", "9 HIF1A Metabolic Regulators KO may reduce exhaustion (metabolic stress respons\n", "10 MYC Metabolic Regulators KO may increase exhaustion (metabolic 
reprogrammin\n", "11 LCK TCR Signaling KO should reduce TCR signaling strength\n", "12 ZAP70 TCR Signaling KO should reduce TCR signaling strength\n", "13 TBX21 Transcription Factors KO should shift toward exhaustion (effector TF)\n", "14 IRF4 Transcription Factors KO may affect T cell differentiation balance\n", "15 NFATC1 Transcription Factors KO may reduce chronic stimulation response\n", "16 TOX2 Transcription Factors To be determined\n", "17 MTOR Metabolic Regulators KO should affect metabolic reprogramming\n", "18 PRKAA1 Metabolic Regulators To be determined\n", "19 LAT TCR Signaling To be determined\n", "20 PLCG1 TCR Signaling To be determined\n", "21 STAT3 Cytokine Signaling KO should affect cytokine responsiveness\n", "22 LDHA Metabolic Regulators To be determined\n", "23 LCP2 TCR Signaling To be determined\n", "24 CD274 Immune Checkpoints To be determined\n", "25 BTLA Immune Checkpoints To be determined\n", "26 CD96 Immune Checkpoints To be determined\n", "27 VAV1 TCR Signaling To be determined\n", "28 ITK TCR Signaling To be determined\n", "29 RUNX3 Transcription Factors To be determined\n", "30 PKM Metabolic Regulators To be determined\n", "31 SLC2A1 Metabolic Regulators To be determined\n", "32 PPARA Metabolic Regulators To be determined\n", "\n", "EXPERIMENTAL PROTOCOL:\n", "==================================================\n", "1. T Cell Isolation and Activation\n", " - Isolate primary CD8+ T cells from healthy donors\n", " - Activate with anti-CD3/CD28 for 2-3 days\n", "\n", "2. CRISPR-Cas9 Delivery\n", " - Electroporate Cas9 RNP complexes with sgRNA library\n", " - 3-4 sgRNAs per gene + non-targeting controls\n", " - Allow 3-5 days for gene knockout\n", "\n", "3. Stimulation Conditions (7-14 days)\n", " - Acute: IL-2 (100 U/ml) only\n", " - Chronic: Anti-CD3 (1 μg/ml) + IL-2 (100 U/ml)\n", " - Refresh media every 2-3 days\n", "\n", "4. 
Readout Measurements\n", " - TCR signaling: Phospho-flow for pZAP70, pERK, pS6\n", " - Exhaustion markers: PD-1, TIM-3, LAG-3 expression\n", " - Function: IFN-γ, TNF-α, IL-2 production\n", " - Transcription: RNA-seq for exhaustion signatures\n", "\n", "5. Data Analysis\n", " - Calculate perturbation scores for each gene\n", " - Identify genes with differential effects between conditions\n", " - Validate top hits with individual knockouts\n", "\n", "KEY CONTROLS:\n", "====================\n", "- Non-targeting sgRNAs (negative control)\n", "- Known exhaustion genes (positive control): TOX, PDCD1\n", "- Essential genes (toxicity control): RPL23, RPS19\n", "- Unstimulated cells (baseline)\n", "- Wild-type cells (no CRISPR)\n", "\n", "EXPECTED OUTCOMES:\n", "====================\n", "1. Genes that reduce exhaustion when knocked out:\n", " - Should show enhanced TCR signaling in chronic condition\n", " - Examples: TOX, PDCD1, HAVCR2, LAG3, EOMES\n", "\n", "2. Genes that promote effector function:\n", " - Should show reduced signaling when knocked out\n", " - Examples: TBX21, MYC (context-dependent)\n", "\n", "3. Metabolic regulators:\n", " - Should show differential effects on sustained signaling\n", " - Examples: MTOR, HIF1A, PRKAA1\n", "\n", "4. TCR signaling components:\n", " - Should show proportional effects on signaling strength\n", " - Examples: LCK, ZAP70, LAT\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "5. [✓] Provide theoretical sgRNA design recommendations and validate gene selection (completed)\n", "6. 
[✓] Provide comprehensive experimental design recommendations (completed)\n", "\n", "\n", "# CRISPR Screen to Identify T Cell Exhaustion Regulators\n", "\n", "## Selected 32 Genes for Maximum Perturbation Effect\n", "\n", "Based on comprehensive analysis of T cell exhaustion biology, I have identified 32 genes that will maximize perturbation effects in your CRISPR screen measuring differential TCR signaling between acute (IL-2 only) and chronic (anti-CD3 + IL-2) stimulation conditions:\n", "\n", "### Gene List (Ranked by Priority):\n", "\n", "**Immune Checkpoint Molecules (8 genes):**\n", "1. **PDCD1** (PD-1) - Primary exhaustion checkpoint receptor\n", "2. **HAVCR2** (TIM-3) - Critical exhaustion marker and checkpoint\n", "3. **LAG3** - Major exhaustion checkpoint receptor\n", "4. **TIGIT** - Emerging exhaustion checkpoint target\n", "5. **CTLA4** - Early checkpoint inhibitor\n", "6. **CD274** (PD-L1) - PD-1 ligand\n", "7. **BTLA** - Inhibitory receptor\n", "8. **CD96** - TIGIT pathway component\n", "\n", "**Transcription Factors (8 genes):**\n", "9. **TOX** - Master regulator of T cell exhaustion\n", "10. **EOMES** - Key exhaustion-promoting transcription factor\n", "11. **BATF** - Exhaustion-promoting transcription factor\n", "12. **TBX21** (T-bet) - Effector vs exhaustion balance\n", "13. **IRF4** - T cell differentiation regulator\n", "14. **NFATC1** - Chronic stimulation response factor\n", "15. **TOX2** - TOX family member\n", "16. **RUNX3** - T cell differentiation factor\n", "\n", "**Metabolic Regulators (8 genes):**\n", "17. **HIF1A** - Hypoxia and exhaustion regulator\n", "18. **MYC** - Metabolic reprogramming master regulator\n", "19. **MTOR** - Metabolic sensing and signaling\n", "20. **PRKAA1** (AMPK) - Energy sensing kinase\n", "21. **LDHA** - Lactate metabolism in exhaustion\n", "22. **PKM** - Pyruvate kinase, glycolysis regulator\n", "23. **SLC2A1** (GLUT1) - Glucose uptake transporter\n", "24. 
**PPARA** - Fatty acid oxidation regulator\n", "\n", "**TCR Signaling Components (7 genes):**\n", "25. **LCK** - Proximal TCR signaling kinase\n", "26. **ZAP70** - TCR signal strength regulator\n", "27. **LAT** - TCR signaling hub adaptor\n", "28. **PLCG1** (PLCγ1) - Downstream TCR signaling\n", "29. **LCP2** (SLP-76) - TCR signaling adaptor\n", "30. **VAV1** - TCR signaling regulator\n", "31. **ITK** - TCR signaling kinase\n", "\n", "**Cytokine Signaling (1 gene):**\n", "32. **STAT3** - Cytokine signaling transcription factor\n", "\n", "## Experimental Design Overview\n", "\n", "### Objective:\n", "Identify genes that regulate T cell exhaustion by measuring differential TCR signaling responses between acute and chronic stimulation conditions.\n", "\n", "### Key Features:\n", "- **Balanced gene selection** across 4 major functional categories\n", "- **High-priority targets** based on literature evidence and therapeutic relevance\n", "- **Expected differential effects** between acute vs chronic conditions\n", "- **Multiple readouts** for comprehensive phenotyping\n", "\n", "### Stimulation Conditions:\n", "- **Acute**: IL-2 only (maintains survival without exhaustion)\n", "- **Chronic**: Anti-CD3 + IL-2 (induces exhaustion through chronic TCR stimulation)\n", "\n", "### Primary Readouts:\n", "1. **TCR signaling strength**: Phospho-flow cytometry (pZAP70, pERK, pS6)\n", "2. **Exhaustion markers**: PD-1, TIM-3, LAG-3 surface expression\n", "3. **Functional capacity**: Cytokine production (IFN-γ, TNF-α, IL-2)\n", "4. 
**Transcriptional state**: RNA-seq for exhaustion gene signatures\n", "\n", "### Expected Outcomes:\n", "- **Checkpoint inhibitors** (PDCD1, HAVCR2, LAG3): KO should enhance signaling in chronic condition\n", "- **Exhaustion TFs** (TOX, EOMES, BATF): KO should reduce exhaustion phenotype\n", "- **Metabolic regulators** (MTOR, HIF1A, MYC): Context-dependent effects on sustained signaling\n", "- **TCR components** (LCK, ZAP70, LAT): Proportional effects on signaling strength\n", "\n", "### sgRNA Design Recommendations:\n", "- Use 3-4 sgRNAs per gene for redundancy\n", "- Include non-targeting controls and positive controls (TOX, PDCD1)\n", "- Target early exons for effective knockouts\n", "- Use established design tools (Broad GPP portal, CHOPCHOP)\n", "\n", "This gene set maximizes the potential to identify novel regulators of T cell exhaustion while covering all major pathways involved in the acute-to-chronic transition of T cell responses.\n", "\n", "==================================\u001b[1m Ai Message \u001b[0m==================================\n", "\n", "5. [✓] Provide theoretical sgRNA design recommendations and validate gene selection (completed)\n", "6. [✓] Provide comprehensive experimental design recommendations (completed)\n", "\n", "\n", "# CRISPR Screen to Identify T Cell Exhaustion Regulators\n", "\n", "## Selected 32 Genes for Maximum Perturbation Effect\n", "\n", "Based on comprehensive analysis of T cell exhaustion biology, I have identified 32 genes that will maximize perturbation effects in your CRISPR screen measuring differential TCR signaling between acute (IL-2 only) and chronic (anti-CD3 + IL-2) stimulation conditions:\n", "\n", "### Gene List (Ranked by Priority):\n", "\n", "**Immune Checkpoint Molecules (8 genes):**\n", "1. **PDCD1** (PD-1) - Primary exhaustion checkpoint receptor\n", "2. **HAVCR2** (TIM-3) - Critical exhaustion marker and checkpoint\n", "3. **LAG3** - Major exhaustion checkpoint receptor\n", "4. 
**TIGIT** - Emerging exhaustion checkpoint target\n", "5. **CTLA4** - Early checkpoint inhibitor\n", "6. **CD274** (PD-L1) - PD-1 ligand\n", "7. **BTLA** - Inhibitory receptor\n", "8. **CD96** - TIGIT pathway component\n", "\n", "**Transcription Factors (8 genes):**\n", "9. **TOX** - Master regulator of T cell exhaustion\n", "10. **EOMES** - Key exhaustion-promoting transcription factor\n", "11. **BATF** - Exhaustion-promoting transcription factor\n", "12. **TBX21** (T-bet) - Effector vs exhaustion balance\n", "13. **IRF4** - T cell differentiation regulator\n", "14. **NFATC1** - Chronic stimulation response factor\n", "15. **TOX2** - TOX family member\n", "16. **RUNX3** - T cell differentiation factor\n", "\n", "**Metabolic Regulators (8 genes):**\n", "17. **HIF1A** - Hypoxia and exhaustion regulator\n", "18. **MYC** - Metabolic reprogramming master regulator\n", "19. **MTOR** - Metabolic sensing and signaling\n", "20. **PRKAA1** (AMPK) - Energy sensing kinase\n", "21. **LDHA** - Lactate metabolism in exhaustion\n", "22. **PKM** - Pyruvate kinase, glycolysis regulator\n", "23. **SLC2A1** (GLUT1) - Glucose uptake transporter\n", "24. **PPARA** - Fatty acid oxidation regulator\n", "\n", "**TCR Signaling Components (7 genes):**\n", "25. **LCK** - Proximal TCR signaling kinase\n", "26. **ZAP70** - TCR signal strength regulator\n", "27. **LAT** - TCR signaling hub adaptor\n", "28. **PLCG1** (PLCγ1) - Downstream TCR signaling\n", "29. **LCP2** (SLP-76) - TCR signaling adaptor\n", "30. **VAV1** - TCR signaling regulator\n", "31. **ITK** - TCR signaling kinase\n", "\n", "**Cytokine Signaling (1 gene):**\n", "32. 
**STAT3** - Cytokine signaling transcription factor\n", "\n", "## Experimental Design Overview\n", "\n", "### Objective:\n", "Identify genes that regulate T cell exhaustion by measuring differential TCR signaling responses between acute and chronic stimulation conditions.\n", "\n", "### Key Features:\n", "- **Balanced gene selection** across 4 major functional categories\n", "- **High-priority targets** based on literature evidence and therapeutic relevance\n", "- **Expected differential effects** between acute vs chronic conditions\n", "- **Multiple readouts** for comprehensive phenotyping\n", "\n", "### Stimulation Conditions:\n", "- **Acute**: IL-2 only (maintains survival without exhaustion)\n", "- **Chronic**: Anti-CD3 + IL-2 (induces exhaustion through chronic TCR stimulation)\n", "\n", "### Primary Readouts:\n", "1. **TCR signaling strength**: Phospho-flow cytometry (pZAP70, pERK, pS6)\n", "2. **Exhaustion markers**: PD-1, TIM-3, LAG-3 surface expression\n", "3. **Functional capacity**: Cytokine production (IFN-γ, TNF-α, IL-2)\n", "4. 
**Transcriptional state**: RNA-seq for exhaustion gene signatures\n", "\n", "### Expected Outcomes:\n", "- **Checkpoint inhibitors** (PDCD1, HAVCR2, LAG3): KO should enhance signaling in chronic condition\n", "- **Exhaustion TFs** (TOX, EOMES, BATF): KO should reduce exhaustion phenotype\n", "- **Metabolic regulators** (MTOR, HIF1A, MYC): Context-dependent effects on sustained signaling\n", "- **TCR components** (LCK, ZAP70, LAT): Proportional effects on signaling strength\n", "\n", "### sgRNA Design Recommendations:\n", "- Use 3-4 sgRNAs per gene for redundancy\n", "- Include non-targeting controls and positive controls (TOX, PDCD1)\n", "- Target early exons for effective knockouts\n", "- Use established design tools (Broad GPP portal, CHOPCHOP)\n", "\n", "This gene set maximizes the potential to identify novel regulators of T cell exhaustion while covering all major pathways involved in the acute-to-chronic transition of T cell responses.\n", "\n" ] } ], "source": [ "log = agent.go(\"\"\"Plan a CRISPR screen to identify genes that regulate T cell exhaustion,\n", " measured by the change in T cell receptor (TCR) signaling between acute\n", " (interleukin-2 [IL-2] only) and chronic (anti-CD3 and IL-2) stimulation conditions.\n", " Generate 32 genes that maximize the perturbation effect.\"\"\")" ] } ], "metadata": { "kernelspec": { "display_name": "biomni_fixed", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 2 }