{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Notebook to generate a coverage file for use in generating a [Krona Tools](https://github.com/marbl/Krona/wiki) taxonomy plot" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### File format should be like this (tab-separated), where \"score\" is replaced with a coverage value:\n", "\n", "```\n", "#queryID #taxID #score\n", "query1 9606 0.9\n", "query2 9534 0.8\n", "```" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Wed Jun 12 10:48:41 PDT 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2926.094\n", "BogoMIPS: 5851.96\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 4.5G 61G 297M 5.0G 65G\n", "Swap: 4.7G 0B 4.7G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: work_dir=/home/sam/analyses/20190612_metagenomics_ind_coverage_krona\n" ] } ], "source": [ "# Set working directory - %env is useful for bash\n", "%env work_dir = /home/sam/analyses/20190612_metagenomics_ind_coverage_krona\n", "work_dir = \"/home/sam/analyses/20190612_metagenomics_ind_coverage_krona\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: krona=/home/sam/programs/KronaTools-2.7/bin/ktImportTaxonomy\n" ] } ], "source": [ "# Set Krona Tools Taxonomy path - %env is useful for bash\n", "%env krona = /home/sam/programs/KronaTools-2.7/bin/ktImportTaxonomy" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "mkdir --parents \"${work_dir}\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20190612_metagenomics_ind_coverage_krona\n" ] } ], "source": [ "cd $work_dir" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 505M\n", "-rw-rw-r-- 1 sam sam 65M Mar 28 11:36 MG1.coverage.txt\n", "-rw-rw-r-- 1 sam sam 64M Mar 28 15:32 MG2.coverage.txt\n", "-rw-rw-r-- 1 sam sam 73M Mar 28 19:55 MG3.coverage.txt\n", "-rw-rw-r-- 1 sam sam 43M Mar 28 22:53 MG5.coverage.txt\n", "-rw-rw-r-- 1 sam sam 49M Mar 29 02:02 MG6.coverage.txt\n", "-rw-rw-r-- 1 sam sam 61M Mar 29 05:43 MG7.coverage.txt\n", "-rw-rw-r-- 1 sam sam 30M May 17 15:41 MG1_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 28M May 18 14:21 MG2_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 16M May 19 21:26 MG3_pH71.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 22M May 20 14:07 MG5_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 26M May 21 09:58 MG6_pH71.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 33M May 22 09:36 MG7_pH71.blastx.outfmt6\n" ] } ], "source": [ "%%bash\n", "# Download BLAST output files\n", "wget \\\n", "--no-directories \\\n", "--recursive \\\n", "--no-parent \\\n", "--quiet \\\n", "--accept outfmt6 \\\n", "http://gannet.fish.washington.edu/Atumefaciens/20190516_metagenomics_pgen_blastx/\n", "\n", "# Download coverage files\n", "wget \\\n", "--no-directories \\\n", "--recursive \\\n", "--no-parent \\\n", "--quiet \\\n", "--accept coverage.txt \\\n", "https://gannet.fish.washington.edu/Atumefaciens/20190327_metagenomics_pgen_megahit/\n", " \n", "ls -ltrh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Check out BLASTx file format\n", "\n", "Column one contains the \"Query ID\" and column 13 contains the NCBI Taxonomy ID" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "k141_3\tP20315.1\t39.286\t112\t64\t1\t324\t1\t211\t322\t2.07e-15\t73.6\t10759\tEnterobacteria phage T3\n", "k141_7\tP35675.1\t49.587\t121\t61\t0\t6\t368\t20\t140\t6.22e-38\t128\t552\tErwinia amylovora\n", "k141_9\tP37247.1\t45.312\t128\t70\t0\t384\t1\t175\t302\t4.74e-30\t112\t817\tBacteroides fragilis\n", "k141_10\tO86428.2\t57.724\t123\t52\t0\t370\t2\t44\t166\t1.18e-49\t162\t208964\tPseudomonas aeruginosa PAO1\n", "k141_16\tP23883.2\t52.174\t184\t81\t3\t4\t543\t124\t304\t1.48e-59\t196\t83333\tEscherichia coli K-12\n", "k141_18\tP77810.1\t54.783\t115\t52\t0\t202\t546\t16\t130\t8.58e-37\t137\t192\tAzospirillum brasilense\n", "k141_23\tQ6L5C4.1\t33.218\t289\t107\t9\t2153\t1296\t236\t441\t3.69e-28\t122\t39947\tOryza sativa Japonica Group\n", "k141_25\tP24918.2\t74.869\t191\t46\t1\t2\t568\t81\t271\t5.69e-102\t312\t367110\tNeurospora crassa OR74A\n", "k141_27\tP23883.2\t51.948\t154\t74\t0\t464\t3\t107\t260\t3.31e-53\t178\t83333\tEscherichia coli K-12\n", "k141_30\tQ2LVI3.1\t35.811\t148\t95\t0\t452\t9\t102\t249\t8.51e-31\t117\t56780\tSyntrophus aciditrophicus SB\n" ] } ], "source": [ "%%bash\n", "head MG1_pH82.blastx.outfmt6" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Checkout coverage file format" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#ID Avg_fold Length Ref_GC Covered_percent Covered_bases Plus_reads Minus_reads Read_GC Median_fold Std_Dev\n", "k141_2 flag=1 multi=3.0000 len=304 3.8257 304 0.0000 100.0000 304 6 3 0.4084 4 1.03\n", "k141_3 flag=1 multi=3.0000 len=325 3.9569 325 0.0000 100.0000 325 5 5 0.5342 5 1.70\n", "k141_4 flag=1 multi=3.0000 len=334 3.5749 334 0.0000 100.0000 334 4 6 0.3836 3 1.34\n", "k141_5 flag=1 multi=11.0000 len=315 8.6984 315 0.0000 100.0000 315 12 11 0.3119 9 3.08\n", "k141_6 flag=1 multi=3.0000 len=367 4.2480 367 0.0000 100.0000 367 7 9 0.3532 3 3.26\n", "k141_7 flag=1 multi=7.0000 len=370 8.1216 370 0.0000 100.0000 370 13 14 0.3438 7 3.90\n", "k141_8 flag=3 multi=2.0286 len=211 1.6303 211 0.0000 82.9384 175 2 2 0.5814 2 0.95\n", "k141_9 flag=1 multi=3.0000 len=386 3.9819 386 0.0000 100.0000 386 6 6 0.3591 4 1.60\n", "k141_10 flag=1 multi=1.0000 len=370 19.6270 370 0.0000 100.0000 370 51 31 0.4142 21 9.22\n" ] } ], "source": [ "%%bash\n", "head MG1.coverage.txt | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loop through coverage files to get sorted query ID and coverage info" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 578M\n", "-rw-rw-r-- 1 sam sam 65M Mar 28 11:36 MG1.coverage.txt\n", "-rw-rw-r-- 1 sam sam 64M Mar 28 15:32 MG2.coverage.txt\n", "-rw-rw-r-- 1 sam sam 73M Mar 28 19:55 MG3.coverage.txt\n", "-rw-rw-r-- 1 sam sam 43M Mar 28 22:53 MG5.coverage.txt\n", "-rw-rw-r-- 1 sam sam 49M Mar 29 02:02 MG6.coverage.txt\n", "-rw-rw-r-- 1 sam sam 61M Mar 29 05:43 MG7.coverage.txt\n", "-rw-rw-r-- 1 sam sam 30M May 17 15:41 MG1_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 28M May 18 14:21 MG2_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 16M May 19 21:26 MG3_pH71.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 22M May 20 14:07 MG5_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 26M May 21 09:58 MG6_pH71.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 33M May 22 09:36 MG7_pH71.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG1.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG2.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 16M Jun 12 10:48 MG3.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 8.9M Jun 12 10:48 MG5.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 11M Jun 12 10:49 MG6.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 13M Jun 12 10:49 MG7.ID.coverage.sorted.txt\n", "\n", "k141_10\t19.6270\n", "k141_100\t6.4182\n", "k141_1000\t191.3530\n", "k141_10000\t6.3785\n", "k141_100000\t11.8115\n", "k141_100001\t19.4566\n", "k141_100002\t5.6765\n", "k141_100003\t8.6479\n", "k141_100004\t2.4945\n", "k141_100005\t53.2919\n" ] } ], "source": [ "%%bash\n", "for file in *.coverage.txt\n", "do\n", "\n", "# Parses sample name\n", "sample=$(echo ${file} | awk -F'.' '{print $1}')\n", "\n", "# Skips header line and prints ID and coverage\n", "# Default awk delimiter is spaces, so\n", "# Column 1 is Query ID and column five is Avg_fold coverage\n", "# Sort by first column only\n", "awk 'NR>1 {print $1 \"\\t\" $5}' \"${file}\" \\\n", "| sort -k1,1 \\\n", "> \"${sample}\".ID.coverage.sorted.txt\n", "done\n", "\n", "ls -ltrh\n", "\n", "echo \"\"\n", "\n", "# Check output format\n", "head MG1.ID.coverage.sorted.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sort BLASTx output files in preparation for joining with sorted coverage files." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 731M\n", "-rw-rw-r-- 1 sam sam 65M Mar 28 11:36 MG1.coverage.txt\n", "-rw-rw-r-- 1 sam sam 64M Mar 28 15:32 MG2.coverage.txt\n", "-rw-rw-r-- 1 sam sam 73M Mar 28 19:55 MG3.coverage.txt\n", "-rw-rw-r-- 1 sam sam 43M Mar 28 22:53 MG5.coverage.txt\n", "-rw-rw-r-- 1 sam sam 49M Mar 29 02:02 MG6.coverage.txt\n", "-rw-rw-r-- 1 sam sam 61M Mar 29 05:43 MG7.coverage.txt\n", "-rw-rw-r-- 1 sam sam 30M May 17 15:41 MG1_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 28M May 18 14:21 MG2_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 16M May 19 21:26 MG3_pH71.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 22M May 20 14:07 MG5_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 26M May 21 09:58 MG6_pH71.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 33M May 22 09:36 MG7_pH71.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG1.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG2.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 16M Jun 12 10:48 MG3.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 8.9M Jun 12 10:48 MG5.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 11M Jun 12 10:49 MG6.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 13M Jun 12 10:49 MG7.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 30M Jun 12 10:49 MG1_pH82.blastx.sorted.outfmt6\n", "-rw-rw-r-- 1 sam sam 28M Jun 12 10:49 MG2_pH82.blastx.sorted.outfmt6\n", "-rw-rw-r-- 1 sam sam 16M Jun 12 10:49 MG3_pH71.blastx.sorted.outfmt6\n", "-rw-rw-r-- 1 sam sam 22M Jun 12 10:49 MG5_pH82.blastx.sorted.outfmt6\n", "-rw-rw-r-- 1 sam sam 26M Jun 12 10:49 MG6_pH71.blastx.sorted.outfmt6\n", "-rw-rw-r-- 1 sam sam 33M Jun 12 10:49 MG7_pH71.blastx.sorted.outfmt6\n", "\n", "k141_10\tO86428.2\t57.724\t123\t52\t0\t370\t2\t44\t166\t1.18e-49\t162\t208964\tPseudomonas aeruginosa PAO1\n", "k141_100001\tQ98G09.1\t59.223\t103\t42\t0\t3\t311\t75\t177\t1.25e-46\t156\t266835\tMesorhizobium japonicum MAFF 303099\n", "k141_100006\tQ0TUZ2.1\t45.946\t111\t60\t0\t398\t66\t271\t381\t9.09e-26\t104\t195103\tClostridium perfringens ATCC 13124\n", "k141_100010\tP0A0Z5.1\t29.801\t151\t100\t3\t460\t11\t300\t445\t6.75e-14\t71.2\t122587\tNeisseria meningitidis Z2491\n", "k141_100015\tO67178.1\t26.904\t394\t272\t6\t148\t1290\t331\t721\t6.37e-39\t154\t224324\tAquifex aeolicus VF5\n", "k141_100016\tG3XD46.1\t50.588\t85\t41\t1\t529\t278\t474\t558\t5.06e-20\t89.7\t208964\tPseudomonas aeruginosa PAO1\n", "k141_100017\tA1TYU8.1\t49.042\t261\t122\t2\t759\t1\t1\t258\t1.21e-82\t261\t351348\tMarinobacter hydrocarbonoclasticus VT8\n", "k141_100020\tP0AE52.1\t34.586\t133\t79\t2\t761\t384\t4\t135\t1.90e-14\t72.4\t83333\tEscherichia coli K-12\n", "k141_100022\tO85133.1\t56.000\t100\t44\t0\t302\t3\t144\t243\t8.22e-34\t120\t1063\tRhodobacter sphaeroides\n", "k141_100023\tQ9M2U3.1\t29.259\t270\t165\t7\t3744\t4517\t72\t327\t5.11e-21\t101\t3702\tArabidopsis thaliana\n" ] } ], "source": [ "%%bash\n", "for file in *outfmt6\n", "do\n", "# Parse sample name\n", "sample=$(echo ${file} | awk -F'.' '{print $1}')\n", "sort -k1,1 \"${file}\" \\\n", "> \"${sample}\".blastx.sorted.outfmt6\n", "done\n", "\n", "ls -ltrh\n", "\n", "echo \"\"\n", "\n", "head MG1_pH82.blastx.sorted.outfmt6" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create blastx and coverage file arrays for looping" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BLASTx array:\n", "MG1_pH82.blastx.sorted.outfmt6 MG2_pH82.blastx.sorted.outfmt6 MG3_pH71.blastx.sorted.outfmt6 MG5_pH82.blastx.sorted.outfmt6 MG6_pH71.blastx.sorted.outfmt6 MG7_pH71.blastx.sorted.outfmt6\n", "----------------------------------------------------------------------------------------------------\n", "\n", "Coverage array:\n", "MG1.ID.coverage.sorted.txt MG2.ID.coverage.sorted.txt MG3.ID.coverage.sorted.txt MG5.ID.coverage.sorted.txt MG6.ID.coverage.sorted.txt MG7.ID.coverage.sorted.txt\n", "----------------------------------------------------------------------------------------------------\n", "\n", "Joining MG1.ID.coverage.sorted.txt and MG1_pH82.blastx.sorted.outfmt6\n", "\n", "Joining MG2.ID.coverage.sorted.txt and MG2_pH82.blastx.sorted.outfmt6\n", "\n", "Joining MG3.ID.coverage.sorted.txt and MG3_pH71.blastx.sorted.outfmt6\n", "\n", "Joining MG5.ID.coverage.sorted.txt and MG5_pH82.blastx.sorted.outfmt6\n", "\n", "Joining MG6.ID.coverage.sorted.txt and MG6_pH71.blastx.sorted.outfmt6\n", "\n", "Joining MG7.ID.coverage.sorted.txt and MG7_pH71.blastx.sorted.outfmt6\n", "\n", "\n", "----------------------------------------------------------------------------------------------------\n", "total 769M\n", "-rw-rw-r-- 1 sam sam 65M Mar 28 11:36 MG1.coverage.txt\n", "-rw-rw-r-- 1 sam sam 64M Mar 28 15:32 MG2.coverage.txt\n", "-rw-rw-r-- 1 sam sam 73M Mar 28 19:55 MG3.coverage.txt\n", "-rw-rw-r-- 1 sam sam 43M Mar 28 22:53 MG5.coverage.txt\n", "-rw-rw-r-- 1 sam sam 49M Mar 29 02:02 MG6.coverage.txt\n", "-rw-rw-r-- 1 sam sam 61M Mar 29 05:43 MG7.coverage.txt\n", "-rw-rw-r-- 1 sam sam 30M May 17 15:41 MG1_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 28M May 18 14:21 MG2_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 16M May 19 21:26 MG3_pH71.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 22M May 20 14:07 MG5_pH82.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 26M May 21 09:58 MG6_pH71.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 33M May 22 09:36 MG7_pH71.blastx.outfmt6\n", "-rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG1.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG2.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 16M Jun 12 10:48 MG3.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 8.9M Jun 12 10:48 MG5.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 11M Jun 12 10:49 MG6.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 13M Jun 12 10:49 MG7.ID.coverage.sorted.txt\n", "-rw-rw-r-- 1 sam sam 30M Jun 12 10:49 MG1_pH82.blastx.sorted.outfmt6\n", "-rw-rw-r-- 1 sam sam 28M Jun 12 10:49 MG2_pH82.blastx.sorted.outfmt6\n", "-rw-rw-r-- 1 sam sam 16M Jun 12 10:49 MG3_pH71.blastx.sorted.outfmt6\n", "-rw-rw-r-- 1 sam sam 22M Jun 12 10:49 MG5_pH82.blastx.sorted.outfmt6\n", "-rw-rw-r-- 1 sam sam 26M Jun 12 10:49 MG6_pH71.blastx.sorted.outfmt6\n", "-rw-rw-r-- 1 sam sam 33M Jun 12 10:49 MG7_pH71.blastx.sorted.outfmt6\n", "-rw-rw-r-- 1 sam sam 7.6M Jun 12 10:49 MG1_pH82.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 7.1M Jun 12 10:49 MG2_pH82.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 4.1M Jun 12 10:49 MG3_pH71.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 5.4M Jun 12 10:49 MG5_pH82.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 6.6M Jun 12 10:49 MG6_pH71.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 8.3M Jun 12 10:49 MG7_pH71.krona-coverage.tsv\n", "----------------------------------------------------------------------------------------------------\n", "\n", "k141_10 208964 19.6270\n", "k141_100001 266835 19.4566\n", "k141_100006 195103 5.5274\n", "k141_100010 122587 11.3650\n", "k141_100015 224324 20.2997\n", "k141_100016 208964 4.1049\n", "k141_100017 351348 157.6364\n", "k141_100020 83333 7.2350\n", "k141_100022 1063 2.7500\n", "k141_100023 3702 33.4936\n" ] } ], "source": [ "%%bash\n", "\n", "# Array of all sorted blastx output files\n", "blastx_array=(MG*.blastx.sorted.outfmt6)\n", "printf -- \"BLASTx array:\\n\"\n", "echo \"${blastx_array[@]}\"\n", "\n", "# Insert some dashes to improve viewing of output in cell below\n", "printf '%.0s-' {1..100}\n", "printf -- \"\\n\"\n", "echo \"\"\n", "\n", "# Array of all sorted coverage files\n", "coverage_array=(MG*.ID.coverage.sorted.txt)\n", "printf -- \"Coverage array:\\n\"\n", "echo \"${coverage_array[@]}\"\n", "\n", "# Insert some dashes to improve viewing of output in cell below\n", "printf '%.0s-' {1..100}\n", "printf -- \"\\n\"\n", "echo \"\"\n", "\n", "\n", "# Join with tab-delimiter\n", "# Output column 1 from the first file, column 13 from 2nd file, column 2 from first file\n", "for index in \"${!blastx_array[@]}\"\n", "do\n", "sample=$(echo \"${blastx_array[index]}\" | awk -F'.' '{print $1}')\n", "join -t $'\\t' \\\n", "-o 1.1,2.13,1.2 \\\n", "\"${coverage_array[index]}\" \"${blastx_array[index]}\" \\\n", "> \"${sample}\".krona-coverage.tsv\n", "\n", "# Insert some dashes to improve viewing of output in cell below\n", "printf -- \"Joining ${coverage_array[index]} and ${blastx_array[index]}\\n\\n\"\n", "done\n", "\n", "echo \"\"\n", "\n", "# Insert some dashes to improve viewing of output in cell below\n", "printf '%.0s-' {1..100}\n", "printf -- \"\\n\"\n", "\n", "ls -ltrh\n", "\n", "# Insert some dashes to improve viewing of output in cell below\n", "printf '%.0s-' {1..100}\n", "printf -- \"\\n\"\n", "\n", "echo \"\"\n", "\n", "head MG1_pH82.krona-coverage.tsv | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cleanup" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 39M\n", "-rw-rw-r-- 1 sam sam 7.6M Jun 12 10:49 MG1_pH82.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 7.1M Jun 12 10:49 MG2_pH82.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 4.1M Jun 12 10:49 MG3_pH71.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 5.4M Jun 12 10:49 MG5_pH82.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 6.6M Jun 12 10:49 MG6_pH71.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 8.3M Jun 12 10:49 MG7_pH71.krona-coverage.tsv\n" ] } ], "source": [ "%%bash\n", "# Remove all files except .tsv\n", "find . ! -name \"*.tsv\" -type f -exec rm -f {} +\n", "ls -ltrh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Generate Krona taxonomy plots with coverage \"score\"" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading taxonomy...\n", "Importing MG1_pH82.krona-coverage.tsv...\n", "Importing MG2_pH82.krona-coverage.tsv...\n", "Importing MG3_pH71.krona-coverage.tsv...\n", "Importing MG5_pH82.krona-coverage.tsv...\n", "Importing MG6_pH71.krona-coverage.tsv...\n", "Importing MG7_pH71.krona-coverage.tsv...\n", "Writing taxonomy.krona.html...\n", "----------------------------------------------------------------------------------------------------\n", "total 43M\n", "-rw-rw-r-- 1 sam sam 7.6M Jun 12 10:49 MG1_pH82.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 7.1M Jun 12 10:49 MG2_pH82.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 4.1M Jun 12 10:49 MG3_pH71.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 5.4M Jun 12 10:49 MG5_pH82.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 6.6M Jun 12 10:49 MG6_pH71.krona-coverage.tsv\n", "-rw-rw-r-- 1 sam sam 8.3M Jun 12 10:49 MG7_pH71.krona-coverage.tsv\n", "drwxrwxr-x 2 sam sam 556K Jun 12 10:50 taxonomy.krona.html.files\n", "-rw-rw-r-- 1 sam sam 3.2M Jun 12 10:50 taxonomy.krona.html\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " [ WARNING ] Too many query IDs to store in chart; storing supplemental\n", " files in 'taxonomy.krona.html.files'.\n" ] } ], "source": [ "%%bash\n", "\"${krona}\" \\\n", "MG1_pH82.krona-coverage.tsv \\\n", "MG2_pH82.krona-coverage.tsv \\\n", "MG3_pH71.krona-coverage.tsv \\\n", "MG5_pH82.krona-coverage.tsv \\\n", "MG6_pH71.krona-coverage.tsv \\\n", "MG7_pH71.krona-coverage.tsv\n", "\n", "# Insert some dashes to improve viewing of output in cell below\n", "printf '%.0s-' {1..100}\n", "printf -- \"\\n\"\n", "\n", "ls -ltrh" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }