{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## FASTQC Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FastQC v0.11.2\r\n"
     ]
    }
   ],
   "source": [
    "!fastqc --version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Started analysis of 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 5% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 10% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 15% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 20% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 25% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 30% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 35% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 40% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 45% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 50% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 55% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 60% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 65% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 70% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 75% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 80% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 85% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 90% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 95% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Approx 100% complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n",
      "Analysis complete for 2112_lane1_NoIndex_L001_R1_001.fastq.gz\n"
     ]
    }
   ],
   "source": [
    "!fastqc 2112_lane1_NoIndex_L001_R1_001.fastq.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!cp 2112_lane1_NoIndex_L001_R1_001_fastqc.* \\\n",
    "/Volumes/Eagle/Arabidopsis/iPythonNotebooks/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!mv /Volumes/Eagle/Arabidopsis/iPythonNotebooks/2112_lane1_NoIndex_L001_R1_001_fastqc.html \\\n",
    "/Volumes/Eagle/Arabidopsis/iPythonNotebooks/20150313_2112_lane1_NoIndex_L001_R1_001_fastqc.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!mv /Volumes/Eagle/Arabidopsis/iPythonNotebooks/2112_lane1_NoIndex_L001_R1_001_fastqc.zip \\\n",
    "/Volumes/Eagle/Arabidopsis/iPythonNotebooks/20150313_2112_lane1_NoIndex_L001_R1_001_fastqc.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe src=http://eagle.fish.washington.edu/Arabidopsis/iPythonNotebooks/20150313_2112_lane1_NoIndex_L001_R1_001_fastqc.html width=100% height=700></iframe>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.display import HTML\n",
    "HTML('<iframe src=http://eagle.fish.washington.edu/Arabidopsis/iPythonNotebooks/20150313_2112_lane1_NoIndex_L001_R1_001_fastqc.html width=100% height=700></iframe>')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Illumina Index Identification Methods Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'/Users/Sam/Documents'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pwd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Count the total number of sequences in the FASTQ file and store in variable"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This command uses bash to count the number of lines in the FASTQ file (```wc -l```)\n",
    "and divides the total by ```4``` (there are 4 lines per read in Illumina FASTQ files).\n",
    "The ```echo``` command prints the result, which IPython captures in the variable\n",
    "```TotalSeqs```."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "TotalSeqs = !echo $((`wc -l < 2112_lane1_NoIndex_L001_R1_001.fastq` / 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['16000000']\n"
     ]
    }
   ],
   "source": [
    "#Prints the value stored in TotalSeqs.\n",
    "#Notice that this is a Python string list and is not an integer!\n",
    "print TotalSeqs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Converts the value in the TotalSeqs string list at index 0 (TotalSeqs[0]) to \n",
    "#an integer value of base 10.\n",
    "#This conversion will be used repeatedly throughout this notebook to allow \n",
    "#mathematical calculations using the numbers generated by bash commands.\n",
    "TotalSeqs = int(TotalSeqs[0], 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "16000000\n"
     ]
    }
   ],
   "source": [
    "print TotalSeqs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use bash ```grep``` and ```wc -l``` to count all the instances of each of the <em>full-length</em> TruSeq adaptor/index sequences.\n",
    "\n",
    "The index sequence is indicated in each of the respective variable names.\n",
    "\n",
    "Additionally, the Epigentek barcode number is indicated in the variable names (e.g. BC1 = barcode 1)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "BC1_ATCACG_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC1_ATCACG_full string list at index 0 (BC1_ATCACG_full[0]) to \n",
    "#an integer value of base 10.\n",
    "BC1_ATCACG_full = int(BC1_ATCACG_full[0] ,10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "294374\n"
     ]
    }
   ],
   "source": [
    "print BC1_ATCACG_full"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "BC3_TTAGGC_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC3_TTAGGC_full string list at index 0 (BC3_TTAGGC_full[0]) to \n",
    "#an integer value of base 10.\n",
    "BC3_TTAGGC_full = int(BC3_TTAGGC_full[0], 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "244638\n"
     ]
    }
   ],
   "source": [
    "print BC3_TTAGGC_full"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BC4_TGACCA_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC4_TGACCA_full string list at index 0 (BC4_TGACCA_full[0]) to \n",
    "#an integer value of base 10.\n",
    "BC4_TGACCA_full = int(BC4_TGACCA_full[0], 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "388498\n"
     ]
    }
   ],
   "source": [
    "print BC4_TGACCA_full"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BC5_ACAGTG_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC5_ACAGTG_full string list at index 0 (BC5_ACAGTG_full[0]) to \n",
    "#an integer value of base 10.\n",
    "BC5_ACAGTG_full = int(BC5_ACAGTG_full[0], 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "308463\n"
     ]
    }
   ],
   "source": [
    "print BC5_ACAGTG_full"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BC6_GCCAAT_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC6_GCCAAT_full string list at index 0 (BC6_GCCAAT_full[0]) to \n",
    "#an integer value of base 10.\n",
    "BC6_GCCAAT_full = int(BC6_GCCAAT_full[0], 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "211205\n"
     ]
    }
   ],
   "source": [
    "print BC6_GCCAAT_full"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BC7_CAGATC_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC7_CAGATC_full string list at index 0 (BC7_CAGATC_full[0]) to \n",
    "#an integer value of base 10.\n",
    "BC7_CAGATC_full = int(BC7_CAGATC_full[0], 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "504685\n"
     ]
    }
   ],
   "source": [
    "print BC7_CAGATC_full"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Total Number of Full-length Barcodes Identified"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Adds all of the counts from each full-length Illumina adaptor/index sequence.\n",
    "#Saves to variable \"sum_full\".\n",
    "sum_full = BC1_ATCACG_full + BC3_TTAGGC_full + BC4_TGACCA_full + BC5_ACAGTG_full + BC6_GCCAAT_full + BC7_CAGATC_full"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1951863\n"
     ]
    }
   ],
   "source": [
    "print sum_full"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Percentage of Reads Containing Full-length Barcodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12.19914375\n"
     ]
    }
   ],
   "source": [
    "#Calculates percentage of reads having full-length Illumina adaptor/index sequences.\n",
    "#Uses \"float\" to convert integer values to floating point decimals. Necessary since \n",
    "#the calculation on integers would be < 1 & would result in an answer of '0'.\n",
    "print ((float(sum_full)/TotalSeqs)*100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use bash ```grep``` and ```wc -l``` to count all the instances of each of the TruSeq index sequences.\n",
    "\n",
    "The index sequence is indicated in each of the respective variable names.\n",
    "\n",
    "Additionally, the Epigentek barcode number is indicated in the variable names (e.g. BC1 = barcode 1)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BC1_ATCACG = !grep -o 'ATCACG' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC1_ATCACG string list at index 0 (BC1_ATCACG[0]) to \n",
    "#an integer value of base 10.\n",
    "BC1_ATCACG = int(BC1_ATCACG[0] ,10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "700818\n"
     ]
    }
   ],
   "source": [
    "print BC1_ATCACG"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BC3_TTAGGC = !grep -o 'TTAGGC' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC3_TTAGGC string list at index 0 (BC3_TTAGGC[0]) to \n",
    "#an integer value of base 10.\n",
    "BC3_TTAGGC = int(BC3_TTAGGC[0] ,10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "387329\n"
     ]
    }
   ],
   "source": [
    "print BC3_TTAGGC"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BC4_TGACCA = !grep -o 'TGACCA' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC4_TGACCA string list at index 0 (BC4_TGACCA[0]) to \n",
    "#an integer value of base 10.\n",
    "BC4_TGACCA = int(BC4_TGACCA[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "727528\n"
     ]
    }
   ],
   "source": [
    "print BC4_TGACCA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BC5_ACAGTG = !grep -o 'ACAGTG' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC5_ACAGTG string list at index 0 (BC5_ACAGTG[0]) to \n",
    "#an integer value of base 10.\n",
    "BC5_ACAGTG = int(BC5_ACAGTG[0] ,10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "544521\n"
     ]
    }
   ],
   "source": [
    "print BC5_ACAGTG"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BC6_GCCAAT = !grep -o 'GCCAAT' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC6_GCCAAT string list at index 0 (BC6_GCCAAT[0]) to \n",
    "#an integer value of base 10.\n",
    "BC6_GCCAAT = int(BC6_GCCAAT[0] ,10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "469213\n"
     ]
    }
   ],
   "source": [
    "print BC6_GCCAAT"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BC7_CAGATC = !grep -o 'CAGATC' 2112_lane1_NoIndex_L001_R1_001.fastq \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Converts the value in the BC7_CAGATC string list at index 0 (BC7_CAGATC[0]) to \n",
    "#an integer value of base 10.\n",
    "BC7_CAGATC = int(BC7_CAGATC[0] ,10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1107028\n"
     ]
    }
   ],
   "source": [
    "print BC7_CAGATC"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Total Number of Short Barcodes Identified"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Adds all of the counts from each Illumina adaptor/index sequence.\n",
    "#Saves to variable \"sum_short\".\n",
    "sum_short = BC1_ATCACG + BC3_TTAGGC + BC4_TGACCA + BC5_ACAGTG + BC6_GCCAAT + BC7_CAGATC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3936437\n"
     ]
    }
   ],
   "source": [
    "print sum_short"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Percentage of Reads Containing Short Barcodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "24.60273125\n"
     ]
    }
   ],
   "source": [
    "#Calculates percentage of reads having short Illumina index sequences.\n",
    "#Uses \"float\" to convert integer values to floating point decimals. Necessary since \n",
    "#the calculation on integers would be < 1 & would result in an answer of '0'.\n",
    "print ((float(sum_short)/TotalSeqs)*100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use ```fastx_barcode_splitter``` to identify <em>full-length</em> TruSeq adaptor/index sequences."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### The ```fastx_barcode_splitter``` is a component of fastx_toolkit-0.0.13.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BC7_CAGATC\tGATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGC\r\n",
      "BC4_TGACCA\tGATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGC\r\n",
      "BC5_ACAGTG\tGATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGC\r\n",
      "BC1_ATCACG\tGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGC\r\n",
      "BC3_TTAGGC\tGATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGC\r\n",
      "BC6_GCCAAT\tGATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGC"
     ]
    }
   ],
   "source": [
    "#The full-length barcode file used by fastx_barcode_splitter.\n",
    "!head TruSeqBarcodesLong.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Look for full-length barcodes at beginning of lines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Barcode\tCount\tLocation\r\n",
      "BC1_ATCACG\t359478\t./bol_long_BC1_ATCACG.fastq\r\n",
      "BC3_TTAGGC\t299176\t./bol_long_BC3_TTAGGC.fastq\r\n",
      "BC4_TGACCA\t472062\t./bol_long_BC4_TGACCA.fastq\r\n",
      "BC5_ACAGTG\t378448\t./bol_long_BC5_ACAGTG.fastq\r\n",
      "BC6_GCCAAT\t257815\t./bol_long_BC6_GCCAAT.fastq\r\n",
      "BC7_CAGATC\t605680\t./bol_long_BC7_CAGATC.fastq\r\n",
      "unmatched\t13627341\t./bol_long_unmatched.fastq\r\n",
      "total\t16000000\r\n"
     ]
    }
   ],
   "source": [
    "#Gunzip the gzipped FASTQ file.\n",
    "#Pipe the output of that to fastx_barcode_splitter.pl\n",
    "#fastx_barcode_splitter uses a default mismatch value = 1\n",
    "#Specify barcode file (--bcfile TruSeqBarcodesLong.txt)\n",
    "#Specify to look for barcode at beginning of each sequence (--bol)\n",
    "#Specify output location and append a prefix to new file name (--prefix ./bol_long_)\n",
    "#Specify new file name suffix (--suffix \".fastq\")\n",
    "!gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | \\\n",
    "fastx_barcode_splitter.pl \\\n",
    "--bcfile TruSeqBarcodesLong.txt \\\n",
    "--bol \\\n",
    "--prefix ./bol_long_ \\\n",
    "--suffix \".fastq\" | \\\n",
    "tee bol_long_stats.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Look for full-length barcodes at end of lines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Barcode\tCount\tLocation\r\n",
      "BC1_ATCACG\t136\t./eol_long_BC1_ATCACG.fastq\r\n",
      "BC3_TTAGGC\t180\t./eol_long_BC3_TTAGGC.fastq\r\n",
      "BC4_TGACCA\t388\t./eol_long_BC4_TGACCA.fastq\r\n",
      "BC5_ACAGTG\t138\t./eol_long_BC5_ACAGTG.fastq\r\n",
      "BC6_GCCAAT\t99\t./eol_long_BC6_GCCAAT.fastq\r\n",
      "BC7_CAGATC\t336\t./eol_long_BC7_CAGATC.fastq\r\n",
      "unmatched\t15998723\t./eol_long_unmatched.fastq\r\n",
      "total\t16000000\r\n"
     ]
    }
   ],
   "source": [
    "#Gunzip the gzipped FASTQ file.\n",
    "#Pipe the output of that to fastx_barcode_splitter.pl\n",
    "#fastx_barcode_splitter uses a default mismatch value = 1\n",
    "#Specify the full-length barcode file (--bcfile TruSeqBarcodesLong.txt)\n",
    "#Specify to look for barcode at end of each sequence (--eol)\n",
    "#Specify output location and append a prefix to new file name (--prefix ./eol_long_)\n",
    "#Specify new file name suffix (--suffix \".fastq\")\n",
    "#Pipe the summary table to tee, which prints it and saves it to eol_long_stats.txt\n",
    "#NOTE(review): the saved output predates this barcode-file fix; re-run to refresh eol_long_stats.txt.\n",
    "!gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | \\\n",
    "fastx_barcode_splitter.pl \\\n",
    "--bcfile TruSeqBarcodesLong.txt \\\n",
    "--eol \\\n",
    "--prefix ./eol_long_ \\\n",
    "--suffix \".fastq\" | \\\n",
    "tee eol_long_stats.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use ```fastx_barcode_splitter``` to identify TruSeq index sequences."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BC7_CAGATC\tCAGATC\r\n",
      "BC4_TGACCA\tTGACCA\r\n",
      "BC5_ACAGTG\tACAGTG\r\n",
      "BC1_ATCACG\tATCACG\r\n",
      "BC3_TTAGGC\tTTAGGC\r\n",
      "BC6_GCCAAT\tGCCAAT"
     ]
    }
   ],
   "source": [
    "#The short (index-only) barcode file used by fastx_barcode_splitter.\n",
    "!head TruSeqBarcodesShort.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Look for index sequences at beginning of lines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Barcode\tCount\tLocation\r\n",
      "BC1_ATCACG\t54841\t./bol_BC1_ATCACG.fastq\r\n",
      "BC3_TTAGGC\t24864\t./bol_BC3_TTAGGC.fastq\r\n",
      "BC4_TGACCA\t63480\t./bol_BC4_TGACCA.fastq\r\n",
      "BC5_ACAGTG\t12874\t./bol_BC5_ACAGTG.fastq\r\n",
      "BC6_GCCAAT\t25066\t./bol_BC6_GCCAAT.fastq\r\n",
      "BC7_CAGATC\t24092\t./bol_BC7_CAGATC.fastq\r\n",
      "unmatched\t15794783\t./bol_unmatched.fastq\r\n",
      "total\t16000000\r\n"
     ]
    }
   ],
   "source": [
    "#Gunzip the gzipped FASTQ file.\n",
    "#Pipe the output of that to fastx_barcode_splitter.pl\n",
    "#fastx_barcode_splitter uses a default mismatch value = 1\n",
    "#Specify barcode file (--bcfile TruSeqBarcodesShort.txt)\n",
    "#Specify to look for barcode at beginning of each sequence (--bol)\n",
    "#Specify output location and append a prefix to new file name (--prefix ./bol_)\n",
    "#Specify new file name suffix (--suffix \".fastq\")\n",
    "!gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | \\\n",
    "fastx_barcode_splitter.pl \\\n",
    "--bcfile TruSeqBarcodesShort.txt \\\n",
    "--bol \\\n",
    "--prefix ./bol_ \\\n",
    "--suffix \".fastq\" | \\\n",
    "tee bol_short_stats.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Look for index sequences at end of lines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Barcode\tCount\tLocation\r\n",
      "BC1_ATCACG\t86091\t./eol_BC1_ATCACG.fastq\r\n",
      "BC3_TTAGGC\t27144\t./eol_BC3_TTAGGC.fastq\r\n",
      "BC4_TGACCA\t63478\t./eol_BC4_TGACCA.fastq\r\n",
      "BC5_ACAGTG\t30680\t./eol_BC5_ACAGTG.fastq\r\n",
      "BC6_GCCAAT\t54759\t./eol_BC6_GCCAAT.fastq\r\n",
      "BC7_CAGATC\t162844\t./eol_BC7_CAGATC.fastq\r\n",
      "unmatched\t15575004\t./eol_unmatched.fastq\r\n",
      "total\t16000000\r\n"
     ]
    }
   ],
   "source": [
    "#Gunzip the gzipped FASTQ file.\n",
    "#Pipe the output of that to fastx_barcode_splitter.pl\n",
    "#fastx_barcode_splitter uses a default mismatch value = 1\n",
    "#Specify barcode file (--bcfile TruSeqBarcodesShort.txt)\n",
    "#Specify to look for barcode at end of each sequence (--eol)\n",
    "#Specify output location and append a prefix to new file name (--prefix ./eol_)\n",
    "#Specify new file name suffix (--suffix \".fastq\")\n",
    "!gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | \\\n",
    "fastx_barcode_splitter.pl \\\n",
    "--bcfile TruSeqBarcodesShort.txt \\\n",
    "--eol \\\n",
    "--prefix ./eol_ \\\n",
    "--suffix \".fastq\" | \\\n",
    "tee eol_short_stats.txt"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}