{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "###Create C.gigas nucleotide BLAST database" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Volumes/Data/blast_db\n" ] } ], "source": [ "cd /Volumes/Data/blast_db/" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Building a new DB, current time: 04/29/2015 12:04:24\n", "New DB name: /Volumes/Data/Sam/fish546_2015/gigasHSrnaSeq/raw_data/Crassostrea_gigas.GCA_000297895.1.24.fa\n", "New DB title: /Volumes/Data/Sam/fish546_2015/gigasHSrnaSeq/raw_data/Crassostrea_gigas.GCA_000297895.1.24.fa\n", "Sequence type: Nucleotide\n", "Keep Linkouts: T\n", "Keep MBits: T\n", "Maximum file size: 1000000000B\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: First data line in seq is about 45% ambiguous nucleotides (shouldn't be over 40%)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1139 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1138 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1046 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1781 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1127 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1059 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1458 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1029 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1113 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 2360 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1155 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1233 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1239 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1855 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1379 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 3311 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1001 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: First data line in seq is about 81% ambiguous nucleotides (shouldn't be over 40%)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1113 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1590 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1142 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1071 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1030 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1030 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1071 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1100 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1659 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1561 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1197 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1660 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1042 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1127 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: First data line in seq is about 70% ambiguous nucleotides (shouldn't be over 40%)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1113 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1323 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1072 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1323 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1085 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1225 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1043 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1911 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 1027 characters (max is 1000)\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: Title is very long: 2121 characters (max is 1000)\n", "Adding sequences from FASTA; added 28631 sequences in 2.92758 seconds.\n" ] } ], "source": [ "!makeblastdb -in /Volumes/Data/Sam/fish546_2015/gigasHSrnaSeq/raw_data/Crassostrea_gigas.GCA_000297895.1.24.fa \\\n", "-dbtype nucl -parse_seqids" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Confirm creation of BLAST db files" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Crassostrea_gigas.GCA_000297895.1.24.fa.nhr Crassostrea_gigas.GCA_000297895.1.24.fa.nsd\r\n", "Crassostrea_gigas.GCA_000297895.1.24.fa.nin Crassostrea_gigas.GCA_000297895.1.24.fa.nsi\r\n", "Crassostrea_gigas.GCA_000297895.1.24.fa.nog Crassostrea_gigas.GCA_000297895.1.24.fa.nsq\r\n" ] } ], "source": [ "!ls Crassostrea_gigas.GCA_000297895.1.24.fa.*" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/Sam/scratch\n" ] } ], "source": [ "cd /Users/Sam/scratch/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Run blastn with C.gigas BLAST DB" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!blastn -task blastn \\\n", "-query ../Owl/halfshell/EmmaBS400.fa \\\n", "-db Crassostrea_gigas.GCA_000297895.1.24.fa \\\n", "-outfmt 6 \\\n", "-max_target_seqs 1 \\\n", "-num_threads 16 \\\n", "-out 20150429_gigas_OA_blastn.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Look at first 10 entries in blastn ouptut file" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_1\t28611\t87.10\t31\t4\t0\t353\t383\t121\t91\t0.065\t39.2\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_2\t16412\t83.72\t43\t5\t2\t673\t714\t1921\t1962\t0.30\t39.2\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_3\t27229\t92.31\t26\t2\t0\t434\t459\t253\t228\t0.042\t39.2\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_4\t27355\t95.65\t23\t1\t0\t1731\t1753\t1034\t1056\t0.51\t37.4\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_5\t20747\t88.00\t25\t3\t0\t586\t610\t822\t846\t5.1\t31.9\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_6\t17358\t100.00\t20\t0\t0\t419\t438\t676\t695\t0.11\t37.4\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_7\t17875\t100.00\t18\t0\t0\t5\t22\t1718\t1735\t1.2\t33.7\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_8\t16813\t71.03\t107\t19\t5\t1\t107\t312\t406\t0.003\t42.8\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_9\t19960\t76.81\t69\t7\t4\t1112\t1180\t388\t447\t0.010\t42.8\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_10\t27511\t87.50\t32\t3\t1\t287\t318\t36\t66\t0.12\t37.4\r\n" ] } ], "source": [ "!head 20150429_gigas_OA_blastn.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Look at last 10 entries in blastn ouptut file" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_37167\t6521\t100.00\t17\t0\t0\t140\t156\t660\t676\t4.8\t31.9\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_37168\t7246\t100.00\t18\t0\t0\t160\t177\t1545\t1562\t1.3\t33.7\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_37169\t9758\t100.00\t18\t0\t0\t17\t34\t381\t398\t1.3\t33.7\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_37170\t14641\t100.00\t20\t0\t0\t103\t122\t448\t467\t0.12\t37.4\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_37171\t7712\t64.99\t477\t159\t4\t7\t479\t716\t1188\t1e-19\t96.9\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_37172\t2671\t91.30\t23\t2\t0\t375\t397\t6520\t6542\t1.4\t33.7\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_37173\t20239\t92.00\t25\t2\t0\t187\t211\t474\t498\t0.11\t37.4\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_37174\t16752\t85.71\t28\t4\t0\t228\t255\t178\t151\t1.2\t33.7\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_37175\t10580\t100.00\t18\t0\t0\t501\t518\t2233\t2216\t1.4\t33.7\r\n", "20150414_trimmed_2212_lane2_400ppm_GCCAAT_contig_37176\t10936\t89.29\t28\t3\t0\t374\t401\t133\t160\t0.10\t37.4\r\n" ] } ], "source": [ "!tail 20150429_gigas_OA_blastn.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Count number of lines (i.e. blastn outputs) in output file" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 38307 20150429_gigas_OA_blastn.tab\r\n" ] } ], "source": [ "!wc -l 20150429_gigas_OA_blastn.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Count the number of sequences in input FASTA file" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "37176\r\n" ] } ], "source": [ "!grep -c '>' ../Owl/halfshell/EmmaBS400.fa" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Check e-values" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.0\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "0.001\r\n", "sort: write failed: standard output: Broken pipe\r\n", "sort: write error\r\n" ] } ], "source": [ "!cut -f 11 20150429_gigas_OA_blastn.tab | sort -n | head -50" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Count the number of e-values that are 0.0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The code below cuts out the 11th column (-f 11) of our BLAST output file. That column is piped to grep. Grep finds the fixed string specified (-F) and those lines the matches that match the entire line (-x) and counts the lines (-c)." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "33\r\n" ] } ], "source": [ "!cut -f 11 20150429_gigas_OA_blastn.tab | grep -cFx '0.0'" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }