{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## A notebook to seamlessly take blast output to GO Slim list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "This is a notebook meant to run in a working directory. Please set working directory as variable in next cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "wd=\"/Volumes/Alanine/wd/17-07-20b/\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Volumes/Alanine/wd/17-07-20b\n"
     ]
    }
   ],
   "source": [
    "cd $wd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### In this directory you will need three files\n",
    "1) blastout file in format `-6`    \n",
    "2) Uniprot GO annotation file (340M) available here `http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/uniprot-SP-GO.sorted`    \n",
    "3) GOslim file available here `http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/GO-GOslim.sorted`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GO-GOslim.sorted       giga-uniprot-blast.tab uniprot-SP-GO.sorted\n"
     ]
    }
   ],
   "source": [
    "#checking if files in directory\n",
    "!ls "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Set blastout file as variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "blastout=\"giga-uniprot-blast.tab\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# That should be the last thing you have to Type!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CHOYP_043R.5.5|m.64252\tsp|Q06852|SLAP1_CLOTH\t57.45\t235\t61\t19\t611\t816\t1467\t1691\t2e-15\t85.1\n",
      "CHOYP_14332.1.2|m.5643\tsp|Q2F637|1433Z_BOMMO\t66.03\t262\t74\t2\t19\t280\t1\t247\t1e-117\t  344\n"
     ]
    }
   ],
   "source": [
    "!head -2 $blastout"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "#convert pipes to tab\n",
    "!tr '|' '\\t' < $blastout \\\n",
    "> _blast-sep.tab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CHOYP_043R.5.5\tm.64252\tsp\tQ06852\tSLAP1_CLOTH\t57.45\t235\t61\t19\t611\t816\t1467\t1691\t2e-15\t85.1\n",
      "CHOYP_14332.1.2\tm.5643\tsp\tQ2F637\t1433Z_BOMMO\t66.03\t262\t74\t2\t19\t280\t1\t247\t1e-117\t  344\n"
     ]
    }
   ],
   "source": [
    "!head -2 _blast-sep.tab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "#reducing number of columns and sorting \n",
    "!awk -v OFS='\\t' '{print $4, $1, $14}' < _blast-sep.tab | sort \\\n",
    "> _blast-sort.tab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A1BHN5\tCHOYP_AAAS.3.3\t2.1\n",
      "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\t6e-124\n"
     ]
    }
   ],
   "source": [
    "!head -2 _blast-sort.tab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#joining blast with uniprot annoation file and reducing to three columns UniprotID, Query, All GO terms\n",
    "!join -t $'\\t' \\\n",
    "_blast-sort.tab \\\n",
    "uniprot-SP-GO.sorted \\\n",
    "| cut -f1,2,14 \\\n",
    "> _blast-annot.tab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A1BHN5\tCHOYP_AAAS.3.3\tGO:0005524; GO:0006298; GO:0030983\n",
      "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0005737; GO:0005783; GO:0005789; GO:0016021; GO:0031625; GO:0061630\n"
     ]
    }
   ],
   "source": [
    "!head -2 _blast-annot.tab"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## The following is a script modidified from Sam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "%%bash \n",
    "\n",
    "# This script was originally written to address a specific problem that Rhonda was having\n",
    "\n",
    "\n",
    "\n",
    "# input_file is the initial, \"problem\" file\n",
    "# file is an intermediate file that most of the program works upon\n",
    "# output_file is the final file produced by the script\n",
    "input_file=\"_blast-annot.tab\"\n",
    "file=\"_intermediate.file\"\n",
    "output_file=\"_blast-GO-unfolded.tab\"\n",
    "\n",
    "# sed command substitutes the \"; \" sequence to a tab and writes the new format to a new file.\n",
    "# This character sequence is how the GO terms are delimited in their field.\n",
    "sed $'s/; /\\t/g' \"$input_file\" > \"$file\"\n",
    "\n",
    "# Identify first field containing a GO term.\n",
    "# Search file with grep for \"GO:\" and pipe to awk.\n",
    "# Awk sets tab as field delimiter (-F'\\t'), runs a for loop that looks for \"GO:\" (~/GO:/), and then prints the field number).\n",
    "# Awk results are piped to sort, which sorts unique by number (-ug).\n",
    "# Sort results are piped to head to retrieve the lowest value (i.e. the top of the list; \"-n1\").\n",
    "begin_goterms=$(grep \"GO:\" \"$file\" | awk -F'\\t' '{for (i=1;i<=NF;i++) if($i ~/GO:/) print i}' | sort -ug | head -n1)\n",
    "\n",
    "# While loop to process each line of the input file.\n",
    "while read -r line\n",
    "\tdo\n",
    "\t\n",
    "\t# Send contents of the current line to awk.\n",
    "\t# Set the field separator as a tab (-F'\\t') and print the number of fields in that line.\n",
    "\t# Save the results of the echo/awk pipe (i.e. number of fields) to the variable \"max_field\".\n",
    "\tmax_field=$(echo \"$line\" | awk -F'\\t' '{print NF}')\n",
    "\n",
    "\t# Send contents of current line to cut.\n",
    "\t# Cut fields (i.e. retain those fields) 1-12.\n",
    "\t# Save the results of the echo/cut pipe (i.e. fields 1-12) to the variable \"fixed_fields\"\n",
    "\tfixed_fields=$(echo \"$line\" | cut -f1-2)\n",
    "\n",
    "\t# Since not all the lines contain the same number of fields (e.g. may not have GO terms),\n",
    "\t# evaluate the number of fields in each line to determine how to handle current line.\n",
    "\n",
    "\t# If the value in max_field is less than the field number where the GO terms begin,\n",
    "\t# then just print the current line (%s) followed by a newline (\\n).\n",
    "\tif (( \"$max_field\" < \"$begin_goterms\" ))\n",
    "\t\tthen printf \"%s\\n\" \"$line\"\n",
    "\t\t\telse\n",
    "\n",
    "\t\t\t# Send contents of current line (which contains GO terms) to cut.\n",
    "\t\t\t# Cut fields (i.e. retain those fields) 13 to whatever the last field is in the curent line.\n",
    "\t\t\t# Save the results of the echo/cut pipe (i.e. all the GO terms fields) to the variable \"goterms\".\n",
    "\t\t\tgoterms=$(echo \"$line\" | cut -f\"$begin_goterms\"-\"$max_field\")\n",
    "\t\t\t\n",
    "\t\t\t# Assign values in the variable \"goterms\" to a new indexed array (called \"array\"), \n",
    "\t\t\t# with tab delimiter (IFS=$'\\t')\n",
    "\t\t\tIFS=$'\\t' read -r -a array <<<\"$goterms\"\n",
    "\t\t\t\n",
    "\t\t\t# Iterate through each element of the array.\n",
    "\t\t\t# Print the first 12 fields (i.e. the fields stored in \"fixed_fields\") followed by a tab (%s\\t).\n",
    "\t\t\t# Print the current element in the array (i.e. the current GO term) followed by a new line (%s\\n).\n",
    "\t\t\tfor element in \"${!array[@]}\"\t\n",
    "\t\t\t\tdo printf \"%s\\t%s\\n\" \"$fixed_fields\" \"${array[$element]}\"\n",
    "\t\t\tdone\n",
    "\tfi\n",
    "\n",
    "# Send the input file into the while loop and send the output to a file named \"rhonda_fixed.txt\".\n",
    "done < \"$file\" > \"$output_file\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A1BHN5\tCHOYP_AAAS.3.3\tGO:0005524\n",
      "A1BHN5\tCHOYP_AAAS.3.3\tGO:0006298\n",
      "A1BHN5\tCHOYP_AAAS.3.3\tGO:0030983\n",
      "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0005737\n",
      "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0005783\n",
      "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0005789\n",
      "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0016021\n",
      "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0031625\n",
      "A5PLL7\tCHOYP_AAEL_AAEL001208.1.1\tGO:0061630\n",
      "B0S104\tCHOYP_AAAT.1.1\tGO:0000049\n"
     ]
    }
   ],
   "source": [
    "!head _blast-GO-unfolded.tab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "#gets rid of lines with no GOIDs\n",
    "!sort -k3 _blast-GO-unfolded.tab | grep \"GO:\" > _blast-GO-unfolded.sorted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "#joining files to get GOslim for each query (with duplicate GOslim / query removed)\n",
    "!join -1 3 -2 1 -t $'\\t' \\\n",
    "_blast-GO-unfolded.sorted \\\n",
    "GO-GOslim.sorted \\\n",
    "| uniq | awk -F'\\t' -v OFS='\\t' '{print $3, $1, $5, $6}' \\\n",
    "> Blastquery-GOslim.tab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": false
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CHOYP_AAAT.1.1\tGO:0000049\tnucleic acid binding activity\tF\n",
      "CHOYP_AAEL_AAEL004839.1.1\tGO:0000049\tnucleic acid binding activity\tF\n",
      "CHOYP_1433E.2.2\tGO:0000077\tcell cycle and proliferation\tP\n",
      "CHOYP_1433E.2.2\tGO:0000077\tsignal transduction\tP\n",
      "CHOYP_1433E.2.2\tGO:0000077\tstress response\tP\n",
      "CHOYP_AAEL_AAEL001593.1.1\tGO:0000082\tcell cycle and proliferation\tP\n",
      "CHOYP_4EBP1.1.1\tGO:0000082\tcell cycle and proliferation\tP\n",
      "CHOYP_AAEL_AAEL004557.1.1\tGO:0000105\tother metabolic processes\tP\n",
      "CHOYP_AAEL_AAEL003539.1.1\tGO:0000118\tnucleus\tC\n",
      "CHOYP_AAEL_AAEL003539.1.1\tGO:0000122\tRNA metabolism\tP\n"
     ]
    }
   ],
   "source": [
    "!head Blastquery-GOslim.tab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}