{ "cells": [ { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## A notebook to seamlessly take blast output to GO Slim list" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true, "deletable": true, "editable": true }, "source": [ "This is a notebook meant to run in a working directory. Please set working directory as variable in next cell" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "wd=\"/Volumes/Alanine/wd/17-07-20b/\"\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Volumes/Alanine/wd/17-07-20b\n" ] } ], "source": [ "cd $wd" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### In this directory you will need three files\n", "1) blastout file in format `-6` \n", "2) Uniprot GO annotation file (340M) available here `http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/uniprot-SP-GO.sorted` \n", "3) GOslim file available here `http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/GO-GOslim.sorted`" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GO-GOslim.sorted giga-uniprot-blast.tab uniprot-SP-GO.sorted\r\n" ] } ], "source": [ "#checking if files in directory\n", "!ls " ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Set blastout file as variable" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "blastout=\"giga-uniprot-blast.tab\"\n" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# That should be the last thing you have to Type!" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CHOYP_043R.5.5|m.64252\tsp|Q06852|SLAP1_CLOTH\t57.45\t235\t61\t19\t611\t816\t1467\t1691\t2e-15\t85.1\r\n", "CHOYP_14332.1.2|m.5643\tsp|Q2F637|1433Z_BOMMO\t66.03\t262\t74\t2\t19\t280\t1\t247\t1e-117\t 344\r\n" ] } ], "source": [ "!head -2 $blastout" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "#convert pipes to tab\n", "!tr '|' '\\t' < $blastout \\\n", "> _blast-sep.tab" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false, "deletable": true, "editable": true, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CHOYP_043R.5.5\tm.64252\tsp\tQ06852\tSLAP1_CLOTH\t57.45\t235\t61\t19\t611\t816\t1467\t1691\t2e-15\t85.1\r\n", "CHOYP_14332.1.2\tm.5643\tsp\tQ2F637\t1433Z_BOMMO\t66.03\t262\t74\t2\t19\t280\t1\t247\t1e-117\t 344\r\n" ] } ], "source": [ "!head -2 _blast-sep.tab" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "#reducing number of columns and sorting \n", "!awk -v OFS='\\t' '{print $4, $1, $14}' < _blast-sep.tab | sort \\\n", "> _blast-sort.tab" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A1BHN5\tCHOYP_AAAS.3.3\t2.1\r\n", "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\t6e-124\r\n" ] } ], "source": [ "!head -2 _blast-sort.tab" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false, "deletable": true, "editable": true, "scrolled": true }, "outputs": [], "source": [ "#joining blast with uniprot annoation file and reducing to three columns UniprotID, Query, All GO terms\n", "!join -t $'\\t' \\\n", "_blast-sort.tab \\\n", "uniprot-SP-GO.sorted \\\n", "| cut -f1,2,14 \\\n", "> _blast-annot.tab" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A1BHN5\tCHOYP_AAAS.3.3\tGO:0005524; GO:0006298; GO:0030983\r\n", "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0005737; GO:0005783; GO:0005789; GO:0016021; GO:0031625; GO:0061630\r\n" ] } ], "source": [ "!head -2 _blast-annot.tab" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## The following is a script modidified from Sam" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "%%bash \n", "\n", "# This script was originally written to address a specific problem that Rhonda was having\n", "\n", "\n", "\n", "# input_file is the initial, \"problem\" file\n", "# file is an intermediate file that most of the program works upon\n", "# output_file is the final file produced by the script\n", "input_file=\"_blast-annot.tab\"\n", "file=\"_intermediate.file\"\n", "output_file=\"_blast-GO-unfolded.tab\"\n", "\n", "# sed command substitutes the \"; \" sequence to a tab and writes the new format to a new file.\n", "# This character sequence is how the GO terms are delimited in their field.\n", "sed $'s/; /\\t/g' \"$input_file\" > \"$file\"\n", "\n", "# Identify first field containing a GO term.\n", "# Search file with grep for \"GO:\" and pipe to awk.\n", "# Awk sets tab as field delimiter (-F'\\t'), runs a for loop that looks for \"GO:\" (~/GO:/), and then prints the field number).\n", "# Awk results are piped to sort, which sorts unique by number (-ug).\n", "# Sort results are piped to head to retrieve the lowest value (i.e. the top of the list; \"-n1\").\n", "begin_goterms=$(grep \"GO:\" \"$file\" | awk -F'\\t' '{for (i=1;i<=NF;i++) if($i ~/GO:/) print i}' | sort -ug | head -n1)\n", "\n", "# While loop to process each line of the input file.\n", "while read -r line\n", "\tdo\n", "\t\n", "\t# Send contents of the current line to awk.\n", "\t# Set the field separator as a tab (-F'\\t') and print the number of fields in that line.\n", "\t# Save the results of the echo/awk pipe (i.e. number of fields) to the variable \"max_field\".\n", "\tmax_field=$(echo \"$line\" | awk -F'\\t' '{print NF}')\n", "\n", "\t# Send contents of current line to cut.\n", "\t# Cut fields (i.e. retain those fields) 1-12.\n", "\t# Save the results of the echo/cut pipe (i.e. fields 1-12) to the variable \"fixed_fields\"\n", "\tfixed_fields=$(echo \"$line\" | cut -f1-2)\n", "\n", "\t# Since not all the lines contain the same number of fields (e.g. may not have GO terms),\n", "\t# evaluate the number of fields in each line to determine how to handle current line.\n", "\n", "\t# If the value in max_field is less than the field number where the GO terms begin,\n", "\t# then just print the current line (%s) followed by a newline (\\n).\n", "\tif (( \"$max_field\" < \"$begin_goterms\" ))\n", "\t\tthen printf \"%s\\n\" \"$line\"\n", "\t\t\telse\n", "\n", "\t\t\t# Send contents of current line (which contains GO terms) to cut.\n", "\t\t\t# Cut fields (i.e. retain those fields) 13 to whatever the last field is in the curent line.\n", "\t\t\t# Save the results of the echo/cut pipe (i.e. all the GO terms fields) to the variable \"goterms\".\n", "\t\t\tgoterms=$(echo \"$line\" | cut -f\"$begin_goterms\"-\"$max_field\")\n", "\t\t\t\n", "\t\t\t# Assign values in the variable \"goterms\" to a new indexed array (called \"array\"), \n", "\t\t\t# with tab delimiter (IFS=$'\\t')\n", "\t\t\tIFS=$'\\t' read -r -a array <<<\"$goterms\"\n", "\t\t\t\n", "\t\t\t# Iterate through each element of the array.\n", "\t\t\t# Print the first 12 fields (i.e. the fields stored in \"fixed_fields\") followed by a tab (%s\\t).\n", "\t\t\t# Print the current element in the array (i.e. the current GO term) followed by a new line (%s\\n).\n", "\t\t\tfor element in \"${!array[@]}\"\t\n", "\t\t\t\tdo printf \"%s\\t%s\\n\" \"$fixed_fields\" \"${array[$element]}\"\n", "\t\t\tdone\n", "\tfi\n", "\n", "# Send the input file into the while loop and send the output to a file named \"rhonda_fixed.txt\".\n", "done < \"$file\" > \"$output_file\"" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A1BHN5\tCHOYP_AAAS.3.3\tGO:0005524\r\n", "A1BHN5\tCHOYP_AAAS.3.3\tGO:0006298\r\n", "A1BHN5\tCHOYP_AAAS.3.3\tGO:0030983\r\n", "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0005737\r\n", "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0005783\r\n", "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0005789\r\n", "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0016021\r\n", "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0031625\r\n", "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0061630\r\n", "B0S104\tCHOYP_AAAT.1.1\tGO:0000049\r\n" ] } ], "source": [ "!head _blast-GO-unfolded.tab" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "#gets rid of lines with no GOIDs\n", "!sort -k3 _blast-GO-unfolded.tab | grep \"GO:\" > _blast-GO-unfolded.sorted" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "#joining files to get GOslim for each query (with duplicate GOslim / query removed)\n", "!join -1 3 -2 1 -t $'\\t' \\\n", "_blast-GO-unfolded.sorted \\\n", "GO-GOslim.sorted \\\n", "| uniq | awk -F'\\t' -v OFS='\\t' '{print $3, $1, $5, $6}' \\\n", "> Blastquery-GOslim.tab" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false, "deletable": true, "editable": true, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CHOYP_AAAT.1.1\tGO:0000049\tnucleic acid binding activity\tF\r\n", "CHOYP_AAEL_AAEL004839.1.1\tGO:0000049\tnucleic acid binding activity\tF\r\n", "CHOYP_1433E.2.2\tGO:0000077\tcell cycle and proliferation\tP\r\n", "CHOYP_1433E.2.2\tGO:0000077\tsignal transduction\tP\r\n", "CHOYP_1433E.2.2\tGO:0000077\tstress response\tP\r\n", "CHOYP_AAEL_AAEL001593.1.1\tGO:0000082\tcell cycle and proliferation\tP\r\n", "CHOYP_4EBP1.1.1\tGO:0000082\tcell cycle and proliferation\tP\r\n", "CHOYP_AAEL_AAEL004557.1.1\tGO:0000105\tother metabolic processes\tP\r\n", "CHOYP_AAEL_AAEL003539.1.1\tGO:0000118\tnucleus\tC\r\n", "CHOYP_AAEL_AAEL003539.1.1\tGO:0000122\tRNA metabolism\tP\r\n" ] } ], "source": [ "!head Blastquery-GOslim.tab" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 0 }