{ "cells": [ { "cell_type": "markdown", "id": "9tcjtJ8cvaPv", "metadata": { "id": "9tcjtJ8cvaPv" }, "source": [ "# Query to get intergenic regions of a TARA sample in the OcéanIA Platform" ] }, { "cell_type": "markdown", "id": "yFdcX6L9vmkE", "metadata": { "id": "yFdcX6L9vmkE" }, "source": [ "### 1. Install oceania-query-fasta package" ] }, { "cell_type": "code", "execution_count": 1, "id": "55fdb172", "metadata": { "id": "55fdb172" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting oceania-query-fasta\n", " Using cached oceania_query_fasta-0.1.7-py3-none-any.whl (14 kB)\n", "Requirement already satisfied: pandas==1.* in /opt/conda/lib/python3.9/site-packages (from oceania-query-fasta) (1.2.5)\n", "Requirement already satisfied: requests==2.* in /opt/conda/lib/python3.9/site-packages (from oceania-query-fasta) (2.25.1)\n", "Collecting click==7.*\n", " Using cached click-7.1.2-py2.py3-none-any.whl (82 kB)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.9/site-packages (from pandas==1.*->oceania-query-fasta) (2.8.1)\n", "Requirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.9/site-packages (from pandas==1.*->oceania-query-fasta) (2021.1)\n", "Requirement already satisfied: numpy>=1.16.5 in /opt/conda/lib/python3.9/site-packages (from pandas==1.*->oceania-query-fasta) (1.21.0)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.9/site-packages (from requests==2.*->oceania-query-fasta) (1.26.5)\n", "Requirement already satisfied: chardet<5,>=3.0.2 in /opt/conda/lib/python3.9/site-packages (from requests==2.*->oceania-query-fasta) (4.0.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from requests==2.*->oceania-query-fasta) (2021.5.30)\n", "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.9/site-packages (from requests==2.*->oceania-query-fasta) (2.10)\n", "Requirement already 
satisfied: six>=1.5 in /opt/conda/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas==1.*->oceania-query-fasta) (1.16.0)\n", "Installing collected packages: click, oceania-query-fasta\n", " Attempting uninstall: click\n", " Found existing installation: click 8.0.1\n", " Uninstalling click-8.0.1:\n", " Successfully uninstalled click-8.0.1\n", "Successfully installed click-7.1.2 oceania-query-fasta-0.1.7\n", "Collecting openpyxl\n", " Using cached openpyxl-3.0.7-py2.py3-none-any.whl (243 kB)\n", "Collecting et-xmlfile\n", " Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\n", "Installing collected packages: et-xmlfile, openpyxl\n", "Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.7\n" ] } ], "source": [ "# %pip (instead of !pip) installs into the environment of the running kernel.\n", "# Versions are pinned to the ones recorded in the output of this cell, for reproducibility.\n", "%pip install oceania-query-fasta==0.1.7\n", "# openpyxl is required to load supplementary info into pandas\n", "%pip install openpyxl==3.0.7" ] }, { "cell_type": "markdown", "id": "5bdf46f4-fbba-4df4-9450-d4e4ab1dc415", "metadata": {}, "source": [ "### 2. Load supplementary information for samples and data of Ocean Microbial Reference Gene Catalog v2" ] }, { "cell_type": "code", "execution_count": 2, "id": "42df4bb5-ac1c-4137-bfae-0e40596f9cc4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " PANGAEA sample id BioSamples_ID ENA_ID ENA_Run_ID MetaG/MetaT Station \\\n", "0 TARA_Y100000004 SAMEA2619888 ERS488658 ERR594328 MetaG 34 \n", "\n", " Layer Size_fraction Size_fraction_name \\\n", "0 SRF 0.1-0.22 Girus/Prokaryote enriched \n", "\n", " Used_in_OM-RGC.v1 (Sunagawa_et_al_2015) Used_for_OM-RGC.v2 (current work) \\\n", "0 Used_in_OM-RGC.v1 Used_for_OM-RGC.v2 (current work) \n", "\n", " Used_for_profiling (current work) Polar \\\n", "0 Not_used_for_profiling (current work) Non polar \n", "\n", " Sample ID (registered at the BioSamples ...) \\\n", "0 SAMEA2619888 \n", "\n", " Sample ID (registered at the European Nu...) 
Date/Time \\\n", "0 ERS488658 2010-01-20T04:27:00Z \n", "\n", " Latitude Longitude Depth, nominal OS region \n", "0 18.3967 39.875 5 [RS] Red Sea (MRGID:4264) \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Unknown extension is not supported and will be removed\n", " warn(msg)\n" ] } ], "source": [ "#@title Double click to see the cell of the Python program\n", "\n", "import pandas\n", "\n", "SUPP_INFO_LINK = \"https://zenodo.org/record/3539258/files/Salazar_et_al_2019_Suppl_Info.xlsx?download=1\"\n", "\n", "# Read the sheet named Table_W1 of the supplementary workbook into a DataFrame\n", "table_W1 = pandas.read_excel(SUPP_INFO_LINK, sheet_name=\"Table_W1\")\n", "\n", "# Keep the first row of the surface water layer (SRF) and take its sample id\n", "is_surface_layer = table_W1[\"Layer\"] == \"SRF\"\n", "selected_sample = table_W1.loc[is_surface_layer].head(1)\n", "print(selected_sample)\n", "\n", "sample_id = selected_sample[\"PANGAEA sample id\"].item()" ] }, { "cell_type": "markdown", "id": "25c31393-cfed-419b-b02e-b0b26ae01971", "metadata": {}, "source": [ "### 3. 
Get metadata for the first 10 intergenic regions of size greater than 100 for the selected sample" ] }, { "cell_type": "code", "execution_count": 3, "id": "f9effea1-ea3b-4cc5-a289-db95795c6eda", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sequence start stop length \\\n", "0 TARA_Y100000004_G_scaffold1_1 509 811 302 \n", "1 TARA_Y100000004_G_scaffold5_1 0 319 319 \n", "2 TARA_Y100000004_G_scaffold16_1 0 114 114 \n", "3 TARA_Y100000004_G_scaffold37_1 8888 9021 133 \n", "4 TARA_Y100000004_G_scaffold37_1 9311 9554 243 \n", "5 TARA_Y100000004_G_scaffold54_1 2013 2386 373 \n", "6 TARA_Y100000004_G_scaffold54_1 3939 4083 144 \n", "7 TARA_Y100000004_G_scaffold55_1 373 482 109 \n", "8 TARA_Y100000004_G_scaffold60_1 3036 3232 196 \n", "9 TARA_Y100000004_G_scaffold76_1 257 465 208 \n", "\n", " gen_before \\\n", "0 TARA_Y100000004_G_scaffold1_1_gene1 \n", "1 TARA_Y100000004_G_scaffold4_1_gene12 \n", "2 TARA_Y100000004_G_scaffold15_1_gene28 \n", "3 TARA_Y100000004_G_scaffold37_1_gene73 \n", "4 TARA_Y100000004_G_scaffold37_1_gene74 \n", "5 TARA_Y100000004_G_scaffold54_1_gene87 \n", "6 TARA_Y100000004_G_scaffold54_1_gene91 \n", "7 TARA_Y100000004_G_scaffold55_1_gene104 \n", "8 TARA_Y100000004_G_scaffold60_1_gene120 \n", "9 TARA_Y100000004_G_scaffold76_1_gene130 \n", "\n", " gen_after \n", "0 TARA_Y100000004_G_scaffold1_1_gene2 \n", "1 TARA_Y100000004_G_scaffold5_1_gene13 \n", "2 TARA_Y100000004_G_scaffold16_1_gene30 \n", "3 TARA_Y100000004_G_scaffold37_1_gene74 \n", "4 TARA_Y100000004_G_scaffold37_1_gene75 \n", "5 TARA_Y100000004_G_scaffold54_1_gene88 \n", "6 TARA_Y100000004_G_scaffold54_1_gene92 \n", "7 TARA_Y100000004_G_scaffold55_1_gene105 \n", "8 TARA_Y100000004_G_scaffold60_1_gene121 \n", "9 TARA_Y100000004_G_scaffold76_1_gene131 \n" ] } ], "source": [ "#@title Double click to see the cell of the Python program\n", "\n", "from oceania import list_intergenic_regions\n", "\n", "intergenic_regions_metadata = list_intergenic_regions(sample_id, min_length=100, page=1, page_size=10)\n", "\n", "print(intergenic_regions_metadata)" ] }, { "cell_type": "markdown", "id": "5ea0f4b3", "metadata": {}, "source": [ "### 4. Prepare request params" ] }, { "cell_type": "code", "execution_count": 4, "id": "4132badf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('TARA_Y100000004_G_scaffold1_1', 509, 811), ('TARA_Y100000004_G_scaffold5_1', 0, 319), ('TARA_Y100000004_G_scaffold16_1', 0, 114), ('TARA_Y100000004_G_scaffold37_1', 8888, 9021), ('TARA_Y100000004_G_scaffold37_1', 9311, 9554), ('TARA_Y100000004_G_scaffold54_1', 2013, 2386), ('TARA_Y100000004_G_scaffold54_1', 3939, 4083), ('TARA_Y100000004_G_scaffold55_1', 373, 482), ('TARA_Y100000004_G_scaffold60_1', 3036, 3232), ('TARA_Y100000004_G_scaffold76_1', 257, 465)]\n" ] } ], "source": [ "#@title Double click to see the cell of the Python program\n", "\n", "# Build the list of (sequence, start, stop) tuples that is later passed to\n", "# get_sequences_from_fasta. Columns are selected by name and the index is left\n", "# out of the tuples, so the unpacking does not rely on positional offsets.\n", "request_regions = intergenic_regions_metadata[['sequence', 'start', 'stop']]\n", "request_params = [\n", "    (sequence, int(start), int(stop))\n", "    for sequence, start, stop in request_regions.itertuples(index=False, name=None)\n", "]\n", "\n", "print(request_params)" ] }, { "cell_type": "markdown", "id": "0159d5e7", "metadata": {}, "source": [ "### 5. Perform call to the OcéanIA services" ] }, { "cell_type": "code", "execution_count": 5, "id": "6344f9c6", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[30-06-2021 14:09:33] Sending request for fasta sequences\n", "[30-06-2021 14:09:34] Request accepted\n", "[30-06-2021 14:09:34] Waiting for results...\n", "[30-06-2021 14:12:24] Done. 
Elapsed time: 170.82166524301283 seconds\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " id start end type \\\n", "0 TARA_Y100000004_G_scaffold1_1 509 811 raw \n", "1 TARA_Y100000004_G_scaffold5_1 0 319 raw \n", "2 TARA_Y100000004_G_scaffold16_1 0 114 raw \n", "3 TARA_Y100000004_G_scaffold37_1 8888 9021 raw \n", "4 TARA_Y100000004_G_scaffold37_1 9311 9554 raw \n", "5 TARA_Y100000004_G_scaffold54_1 2013 2386 raw \n", "6 TARA_Y100000004_G_scaffold54_1 3939 4083 raw \n", "7 TARA_Y100000004_G_scaffold55_1 373 482 raw \n", "8 TARA_Y100000004_G_scaffold60_1 3036 3232 raw \n", "9 TARA_Y100000004_G_scaffold76_1 257 465 raw \n", "\n", " sequence \n", "0 ATTGTATAGAATGTAGATCTTCGTTATTGGAGATTCAATGATGTGG... \n", "1 TCTGCTGTGCCTTGCATCCTACCTGCCACGCTGTAAGGCTGACAAG... \n", "2 TAATTATACAGGAGGCACCTCACTACGAGCTAAACGAGGTGCAAGA... \n", "3 TCTGTAGACCAGAATAAGAAAGGGAGCCTTCGGGCTCCCTTTTTTT... \n", "4 ATTGTGTGTATTATACAGATATAAACAAAAAATGTCAAGCGTTAAA... \n", "5 GCTGTTGACTATGCTTTGGCTTCTTCATCCTGAAAAGGGCGAAGGA... \n", "6 TGCTGCCTTCGTTGAGCGTTGTAGAACGCTTTTTCTAATGCAGTCA... \n", "7 TCATGCCACTTAAAGGAAAACAGTACAAACTAGATGTTGATGGTGA... \n", "8 TACATCCAAATCTTATATAGATATTAAATTTATTAACATCATCTCT... \n", "9 ATCATTTTTAATTCTATACAGATATGCTCTAGCTTTTGTTTTAGAC... 
\n" ] } ], "source": [ "from oceania import get_sequences_from_fasta\n", "\n", "# Request the sequences of the prepared regions for the selected sample\n", "request_result = get_sequences_from_fasta(sample_id, request_params)\n", "\n", "# The result comes back as a pandas.DataFrame\n", "print(request_result)" ] } ], "metadata": { "colab": { "name": "Copia de query_dependecy.ipynb", "provenance": [ { "file_id": "https://github.com/Inria-Chile/oceania-lib-demo/blob/main/notebooks/query_dependecy.ipynb", "timestamp": 1623342428289 } ] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" } }, "nbformat": 4, "nbformat_minor": 5 }