{ "metadata": { "name": "", "signature": "sha256:45693ee2eb69848027cbff9f20306d8c3c66620c2067d546dff6b4fb4aaad5d7" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "BSMAP-methatio-methykit workflow\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Assumptions- as with most worflows in development - a working directory is used." ] }, { "cell_type": "code", "collapsed": false, "input": [ "#SR edits - adding direct link to ipynb viewer and raw files\n", "!date " ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Fri Feb 21 13:31:01 PST 2014\r\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "filename='BSMAP2MK_workflow'" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 43 }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "ipynb viewer format:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!echo 'http://nbviewer.ipython.org/github/sr320/ipython_nb/blob/master/''{filename}''.ipynb'" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "http://nbviewer.ipython.org/github/sr320/ipython_nb/blob/master/BSMAP2MK_workflow.ipynb\r\n" ] } ], "prompt_number": 44 }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "ipynb file:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!echo 'http://nbviewer.ipython.org/github/sr320/ipython_nb/blob/master/''{filename}''.ipynb'" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "http://nbviewer.ipython.org/github/sr320/ipython_nb/blob/master/BSMAP2MK_workflow.ipynb\r\n" ] } ], "prompt_number": 48 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Setting Variables" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#file ID\n", "fid=\"CgF_nov\"\n", "\n", "#TIMESTAMP\n", "date=!date +%m%d_%H%M\n", "\n", "#working directory (parent)\n", "wd=\"/Volumes/web/Mollusk/bs_larvae_exp/\"\n", "\n", "#where is bsmap\n", "bsmap=\"/Users/Shared/Apps/bsmap-2.73/\"\n", "\n", "#fastq files location R1 location\n", "R1=\"/Volumes/web/trilobite/Crassostrea_gigas_HTSdata/batterbox/FCC39EM/Sample_BS_CgF/filtered_BS_CgF_TTAGGC_L004_R1.fastq.gz\"\n", "\n", "#fastq files location R2 location\n", "#comment out if SE\n", "#R2=\"/Volumes/web/trilobite/Crassostrea_gigas_HTSdata/batterbox/FCC39EM/Sample_BS_CgF/filtered_BS_CgF_TTAGGC_L004_R1.fastq.gz\"\n", "\n", "\n", "#genome file \n", "genome=\"/Volumes/web/whale/ensembl/ftp.ensemblgenomes.org/pub/release-21/metazoa/fasta/crassostrea_gigas/dna/Crassostrea_gigas.GCA_000297895.1.21.dna_sm.genome.fa\"" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "cd {wd}" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Volumes/web/Mollusk/bs_larvae_exp\n" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "mkdir {fid}_{date}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "cd {fid}_{date}" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Volumes/web/Mollusk/bs_larvae_exp/CgF_nov" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "#option - number of processes \n", "!{bsmap}bsmap -a {R1} -d {genome} -o bsmap_out.sam -p 1" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "BSMAP v2.73\r\n", "Start at: Wed Feb 12 16:25:40 2014\r\n", "\r\n", "Input reference file: /Volumes/web/whale/ensembl/ftp.ensemblgenomes.org/pub/release-21/metazoa/fasta/crassostrea_gigas/dna/Crassostrea_gigas.GCA_000297895.1.21.dna_sm.genome.fa" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " \t(format: FASTA)\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Load in 7658 db seqs, total size 557717710 bp. 9 secs passed\r\n", "total_kmers: 43046721\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Create seed table. 25 secs passed\r\n", "max number of mismatches: read_length * 8% \tmax gap size: 0\r\n", "kmer cut-off ratio:5e-07\r\n", "max multi-hits: 100\tmax Ns: 5\tseed size: 16\tindex interval: 4\r\n", "quality cutoff: 0\tbase quality char: '!'\r\n", "min fragment size:28\tmax fragemt size:500\r\n", "start from read #1\tend at read #4294967295\r\n", "additional alignment: T in reads => C in reference\r\n", "mapping strand: ++,-+\r\n", "Single-end alignment(1 threads)\r\n", "Input read file: /Volumes/web/trilobite/Crassostrea_gigas_HTSdata/batterbox/FCC39EM/Sample_BS_CgF/filtered_BS_CgF_TTAGGC_L004_R1.fastq.gz \t(format: gzipped FASTQ)\r\n", "Output file: bsmap_out.sam\t (format: SAM)\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Thread #0: \t50000 reads finished. 29 secs passed\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Thread #0: \t100000 reads finished. 34 secs passed\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Thread #0: \t150000 reads finished. 39 secs passed\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Thread #0: \t200000 reads finished. 43 secs passed\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Thread #0: \t250000 reads finished. 48 secs passed\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Thread #0: \t300000 reads finished. 53 secs passed\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Thread #0: \t345868 reads finished. 57 secs passed\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Total number of aligned reads: 218893 (63%)\r\n", "Done.\r\n", "Finished at Wed Feb 12 16:26:38 2014\r\n", "Total time consumed: 58 secs\r\n" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "!python {bsmap}methratio.py -d {genome} -u -z -g -o methratio_out.txt -s {bsmap}samtools bsmap_out.sam" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "@ Wed Feb 12 16:26:38 2014: reading reference /Volumes/web/whale/ensembl/ftp.ensemblgenomes.org/pub/release-21/metazoa/fasta/crassostrea_gigas/dna/Crassostrea_gigas.GCA_000297895.1.21.dna_sm.genome.fa ...\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "@ Wed Feb 12 16:27:11 2014: reading bsmap_out.sam ...\r\n", "[samopen] SAM header is present: 7658 sequences.\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "@ Wed Feb 12 16:27:19 2014: combining CpG methylation from both strands ...\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "@ Wed Feb 12 16:27:48 2014: writing methratio_out.txt ...\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "@ Wed Feb 12 16:29:45 2014: done.\r\n", "total 168921 valid mappings, 1626861 covered cytosines, average coverage: 1.11 fold.\r\n" ] } ], "prompt_number": 9 }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Filter the methratio output file so that I'm only obtaining the context '__CG_' and loci with at least 5x coverage" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#command for only obtaining the context '__CG_'\n", "!grep \"[A-Z][A-Z]CG[A-Z]\" methratio_out_CG.txt" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "#obtaining a filtered file with at least 5x coverage\n", "!awk '$8 >= 5' methratio_out_CG5x.txt" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "#Now I need to format my files to be read into the methylKit package in R" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "!/Volumes/web/Mollusk/bs_larvae_exp/methratio.awk.sh methratio_out_CG5x.txt > methratio_out_CG_mkit.txt\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "!head methratio_out_CG_mkit.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "chr.Base\tchr\tbase\tstrand\tcoverage\tfreqC\tfreqT\r\n", "C12802.90\tC12802\t90\tF\t1\t0.00\t1.00\r\n", "C12802.99\tC12802\t99\tF\t1\t0.00\t1.00\r\n", "C12802.119\tC12802\t119\tF\t1\t0.00\t1.00\r\n", "C12802.149\tC12802\t149\tF\t1\t0.00\t1.00\r\n", "C12960.67\tC12960\t67\tF\t1\t0.00\t1.00\r\n", "C12960.123\tC12960\t123\tF\t1\t0.00\t1.00\r\n", "C13208.148\tC13208\t148\tF\t1\t1.00\t0.00\r\n", "C13208.184\tC13208\t184\tF\t1\t1.00\t0.00\r\n", "C13766.145\tC13766\t145\tF\t1\t0.00\t1.00\r\n" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }