{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Notebook to take `blast` output from transcriptome v 3.1 to GOslim" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set working directory" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pwd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "wd = \"../analyses/BLAST-to-GOslim/\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/graciecrandall/Documents/GitHub/paper-tanner-crab/analyses/BLAST-to-GOslim\n" ] } ], "source": [ "cd $wd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Need three files in this directory" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. `blast` output from transcriptome v 3.1 in format -6\n", "2. Uniprot GO annotation file (340M) available here: http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/uniprot-SP-GO.sorted\n", "3. GOslim file available here: http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/GO-GOslim.sorted" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 4391k 100 4391k 0 0 2531k 0 0:00:01 0:00:01 --:--:-- 4186k\n" ] } ], "source": [ "#`curl` `blast` output\n", "!curl --insecure https://gannet.fish.washington.edu/Atumefaciens/20200608_cbai_diamond_blastx_v2.1_v3.1/cbai_transcriptome_v3.1.blastx.outfmt6 \\\n", "-o 20200608.C_bairdi.Trinity.blastx.outfmt6" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 340M 100 340M 0 0 7996k 0 0:00:43 0:00:43 --:--:-- 14.2M 0 0:00:47 0:00:40 0:00:07 8104k\n" ] } ], "source": [ "!curl http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/uniprot-SP-GO.sorted -o uniprot-SP-GO.sorted" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 2314k 100 2314k 0 0 1182k 0 0:00:01 0:00:01 --:--:-- 3464k\n" ] } ], "source": [ "!curl http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/GO-GOslim.sorted -o GO-GOslim.sorted" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20200608.C_bairdi.Trinity.blastx.outfmt6\r\n", "Blastquery-GOslim.tab\r\n", "GO-GOslim.sorted\r\n", "GOslim-P-pie.txt\r\n", "GOslim-pie-transcriptome-v3.1.pdf\r\n", "_blast-sep.tab\r\n", "allterms.GOslim-pie-transcriptome-v3.1.pdf\r\n", "readme.md\r\n", "transcriptome-stress-genes.tab\r\n", "uniprot-SP-GO.sorted\r\n" ] } ], "source": [ "#check that files are in directory\n", "!ls" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 48551\r\n" ] } ], "source": [ "#need to remove rows that are duplicate Trinity IDs\n", "#Sam's notes on what below code does:\n", "#1. Sort BLAST output file by Trinity ID (column 1) and then by e-value (column 11), while setting a tab as the field separator (sort defaults to space as field separator).\n", "#2. Pipe to awk. Runs an array where the first field (Trinity ID) is loaded into the array and compared to the value of the first field in the next line. If they do not match, it will print the line.\n", "!sort -k1,1 -k11,11 --field-separator $'\\t' 20200608.C_bairdi.Trinity.blastx.outfmt6 | awk '!($1 in array){array[$1]; print}' | wc -l " ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "!sort -u -k1,1 --field-separator $'\\t' 20200608.C_bairdi.Trinity.blastx.outfmt6 > blastout" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 48551 blastout\r\n" ] } ], "source": [ "!wc -l blastout" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#set `blast` output file as a variable\n", "blastout=\"20200608.C_bairdi.Trinity.blastx.outfmt6\"" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN100026_c0_g1_i1\tsp|P60322|NANO2_MOUSE\t79.3\t29\t6\t0\t319\t233\t88\t116\t4.1e-08\t58.9\r\n", "TRINITY_DN100027_c0_g1_i1\tsp|Q27597|NCPR_DROME\t48.4\t62\t32\t0\t547\t362\t460\t521\t2.2e-11\t70.5\r\n" ] } ], "source": [ "!head -2 blastout" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "#convert pipes to tab\n", "!tr '|' '\\t' < blastout \\\n", "> _blast-sep.tab" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN100026_c0_g1_i1\tsp\tP60322\tNANO2_MOUSE\t79.3\t29\t6\t0\t319\t233\t88\t116\t4.1e-08\t58.9\r\n", "TRINITY_DN100027_c0_g1_i1\tsp\tQ27597\tNCPR_DROME\t48.4\t62\t32\t0\t547\t362\t460\t521\t2.2e-11\t70.5\r\n" ] } ], "source": [ "!head -2 _blast-sep.tab\n", "#commit _blast-sep.tab to directory --> will be used to get list of Trinity_IDs from GOslim 'stress response'" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#reducing number of columns and sorting \n", "!awk -v OFS='\\t' '{print $3, $1, $13}' < _blast-sep.tab | sort \\\n", "> _blast-sort.tab" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A087WPF7\tTRINITY_DN5521_c0_g1_i1\t7.9e-06\r\n", "A0A0B4J2F2\tTRINITY_DN97262_c0_g1_i1\t3.6e-10\r\n" ] } ], "source": [ "!head -2 _blast-sort.tab" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A023GPI8\tLECA_CANBL\treviewed\tLectin alpha chain (CboL) [Cleaved into: Lectin beta chain; Lectin gamma chain]\t\tCanavalia boliviana\t237\t\t\tmannose binding [GO:0005537]; metal ion binding [GO:0046872]\tmannose binding [GO:0005537]; metal ion binding [GO:0046872]\tGO:0005537; GO:0046872\r\n", "A0A023GPJ0\tCDII_ENTCC\treviewed\tImmunity protein CdiI\tcdiI ECL_04450.1\tEnterobacter cloacae subsp. cloacae (strain ATCC 13047 / DSM 30054 / NBRC 13535 / NCDC 279-56)\t145\t\t\t\t\t\r\n" ] } ], "source": [ "!head -2 uniprot-SP-GO.sorted" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#joining blast with uniprot annoation file and reducing to three columns UniprotID, Query, All GO terms\n", "!join -t $'\\t' \\\n", "_blast-sort.tab \\\n", "uniprot-SP-GO.sorted \\\n", "| cut -f1,2,14 \\\n", "> _blast-annot.tab" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A0F7YZI5\tTRINITY_DN115719_c0_g1_i1\tGO:0005179; GO:0005576\r\n", "A0A0G2JZ79\tTRINITY_DN4191_c0_g1_i1\tGO:0005634; GO:0005829; GO:0006351; GO:0006355; GO:0006364; GO:0006476; GO:0006915; GO:0007517; GO:0007569; GO:0010046; GO:0010460; GO:0010667; GO:0010976; GO:0014858; GO:0016605; GO:0017136; GO:0019899; GO:0030424; GO:0030426; GO:0031667; GO:0032720; GO:0035774; GO:0043392; GO:0043422; GO:0043524; GO:0045471; GO:0045722; GO:0046872; GO:0048511; GO:0060125; GO:0060548; GO:0070301; GO:0070403; GO:0070932; GO:0071236; GO:0071303; GO:0071407; GO:0090312; GO:0097755; GO:1900181; GO:1901984; GO:1902617; GO:1903427; GO:1904373; GO:1904638; GO:1904644; GO:1904646; GO:1904648; GO:2000270; GO:2000505; GO:2000614\r\n" ] } ], "source": [ "!head -2 _blast-annot.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### The following is a script modified by Sam White" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash \n", "\n", "# This script was originally written to address a specific problem that Rhonda was having\n", "\n", "\n", "\n", "# input_file is the initial, \"problem\" file\n", "# file is an intermediate file that most of the program works upon\n", "# output_file is the final file produced by the script\n", "input_file=\"_blast-annot.tab\"\n", "file=\"_intermediate.file\"\n", "output_file=\"_blast-GO-unfolded.tab\"\n", "\n", "# sed command substitutes the \"; \" sequence to a tab and writes the new format to a new file.\n", "# This character sequence is how the GO terms are delimited in their field.\n", "sed $'s/; /\\t/g' \"$input_file\" > \"$file\"\n", "\n", "# Identify first field containing a GO term.\n", "# Search file with grep for \"GO:\" and pipe to awk.\n", "# Awk sets tab as field delimiter (-F'\\t'), runs a for loop that looks for \"GO:\" (~/GO:/), and then prints the field number).\n", "# Awk results are piped to sort, which sorts unique by number (-ug).\n", "# Sort results are piped to head to retrieve the lowest value (i.e. the top of the list; \"-n1\").\n", "begin_goterms=$(grep \"GO:\" \"$file\" | awk -F'\\t' '{for (i=1;i<=NF;i++) if($i ~/GO:/) print i}' | sort -ug | head -n1)\n", "\n", "# While loop to process each line of the input file.\n", "while read -r line\n", "\tdo\n", "\t\n", "\t# Send contents of the current line to awk.\n", "\t# Set the field separator as a tab (-F'\\t') and print the number of fields in that line.\n", "\t# Save the results of the echo/awk pipe (i.e. number of fields) to the variable \"max_field\".\n", "\tmax_field=$(echo \"$line\" | awk -F'\\t' '{print NF}')\n", "\n", "\t# Send contents of current line to cut.\n", "\t# Cut fields (i.e. retain those fields) 1-12.\n", "\t# Save the results of the echo/cut pipe (i.e. fields 1-12) to the variable \"fixed_fields\"\n", "\tfixed_fields=$(echo \"$line\" | cut -f1-2)\n", "\n", "\t# Since not all the lines contain the same number of fields (e.g. may not have GO terms),\n", "\t# evaluate the number of fields in each line to determine how to handle current line.\n", "\n", "\t# If the value in max_field is less than the field number where the GO terms begin,\n", "\t# then just print the current line (%s) followed by a newline (\\n).\n", "\tif (( \"$max_field\" < \"$begin_goterms\" ))\n", "\t\tthen printf \"%s\\n\" \"$line\"\n", "\t\t\telse\n", "\n", "\t\t\t# Send contents of current line (which contains GO terms) to cut.\n", "\t\t\t# Cut fields (i.e. retain those fields) 13 to whatever the last field is in the curent line.\n", "\t\t\t# Save the results of the echo/cut pipe (i.e. all the GO terms fields) to the variable \"goterms\".\n", "\t\t\tgoterms=$(echo \"$line\" | cut -f\"$begin_goterms\"-\"$max_field\")\n", "\t\t\t\n", "\t\t\t# Assign values in the variable \"goterms\" to a new indexed array (called \"array\"), \n", "\t\t\t# with tab delimiter (IFS=$'\\t')\n", "\t\t\tIFS=$'\\t' read -r -a array <<<\"$goterms\"\n", "\t\t\t\n", "\t\t\t# Iterate through each element of the array.\n", "\t\t\t# Print the first 12 fields (i.e. the fields stored in \"fixed_fields\") followed by a tab (%s\\t).\n", "\t\t\t# Print the current element in the array (i.e. the current GO term) followed by a new line (%s\\n).\n", "\t\t\tfor element in \"${!array[@]}\"\t\n", "\t\t\t\tdo printf \"%s\\t%s\\n\" \"$fixed_fields\" \"${array[$element]}\"\n", "\t\t\tdone\n", "\tfi\n", "\n", "# Send the input file into the while loop and send the output to a file named \"rhonda_fixed.txt\".\n", "done < \"$file\" > \"$output_file\"" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A0F7YZI5\tTRINITY_DN115719_c0_g1_i1\tGO:0005179\r\n", "A0A0F7YZI5\tTRINITY_DN115719_c0_g1_i1\tGO:0005576\r\n", "A0A0G2JZ79\tTRINITY_DN4191_c0_g1_i1\tGO:0005634\r\n", "A0A0G2JZ79\tTRINITY_DN4191_c0_g1_i1\tGO:0005829\r\n", "A0A0G2JZ79\tTRINITY_DN4191_c0_g1_i1\tGO:0006351\r\n", "A0A0G2JZ79\tTRINITY_DN4191_c0_g1_i1\tGO:0006355\r\n", "A0A0G2JZ79\tTRINITY_DN4191_c0_g1_i1\tGO:0006364\r\n", "A0A0G2JZ79\tTRINITY_DN4191_c0_g1_i1\tGO:0006476\r\n", "A0A0G2JZ79\tTRINITY_DN4191_c0_g1_i1\tGO:0006915\r\n", "A0A0G2JZ79\tTRINITY_DN4191_c0_g1_i1\tGO:0007517\r\n" ] } ], "source": [ "!head _blast-GO-unfolded.tab" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#gets rid of lines with no GOIDs\n", "!sort -k3 _blast-GO-unfolded.tab | grep \"GO:\" > _blast-GO-unfolded.sorted" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#joining files to get GOslim for each query (with duplicate GOslim / query removed)\n", "!join -1 3 -2 1 -t $'\\t' \\\n", "_blast-GO-unfolded.sorted \\\n", "GO-GOslim.sorted \\\n", "| uniq | awk -F'\\t' -v OFS='\\t' '{print $3, $1, $5, $6}' \\\n", "> Blastquery-GOslim.tab" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRINITY_DN15054_c1_g1_i1\tGO:0000001\tcell organization and biogenesis\tP\r\n", "TRINITY_DN1054_c1_g1_i1\tGO:0000001\tcell organization and biogenesis\tP\r\n", "TRINITY_DN1054_c1_g1_i3\tGO:0000001\tcell organization and biogenesis\tP\r\n", "TRINITY_DN1054_c1_g1_i5\tGO:0000001\tcell organization and biogenesis\tP\r\n", "TRINITY_DN1054_c1_g1_i7\tGO:0000001\tcell organization and biogenesis\tP\r\n", "TRINITY_DN2169_c3_g1_i2\tGO:0000002\tcell organization and biogenesis\tP\r\n", "TRINITY_DN2169_c3_g1_i3\tGO:0000002\tcell organization and biogenesis\tP\r\n", "TRINITY_DN1742_c0_g1_i19\tGO:0000002\tcell organization and biogenesis\tP\r\n", "TRINITY_DN15054_c1_g1_i1\tGO:0000002\tcell organization and biogenesis\tP\r\n", "TRINITY_DN23579_c0_g1_i1\tGO:0000002\tcell organization and biogenesis\tP\r\n" ] } ], "source": [ "!head Blastquery-GOslim.tab" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "198745 P\r\n" ] } ], "source": [ "!cat Blastquery-GOslim.tab | grep \"\tP\" | awk -F $'\\t' '{print $4}' | sort | uniq -c" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8841 cell cycle and proliferation\r\n", "6507 DNA metabolism\r\n", "3858 death\r\n", "27904 other biological processes\r\n", "26964 developmental processes\r\n", "23297 RNA metabolism\r\n", "23265 cell organization and biogenesis\r\n", "2129 cell-cell signaling\r\n", "20834 other metabolic processes\r\n", "1768 cell adhesion\r\n", "17143 protein metabolism\r\n", "15006 transport\r\n", "10910 stress response\r\n", "10319 signal transduction\r\n" ] } ], "source": [ "!cat Blastquery-GOslim.tab | grep \"\tP\" | awk -F $'\\t' '{print $3}' | sort | uniq -c | sort -r" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!cat Blastquery-GOslim.tab | grep \"\tP\" | awk -F $'\\t' '{print $3}' | sort | uniq -c | sort -r > GOslim-P-pie.txt" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.6" } }, "nbformat": 4, "nbformat_minor": 2 }