{ "metadata": { "name": "04-diginorm" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "# Change what's inside the quotes into your MBL STAMPS server username, or something else unique\n", "# reminder! hit shift-ENTER to execute cell & move onto next cell\n", "username=\"CHANGEME\"" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# configure paths etc.\n", "import sys\n", "import os\n", "os.environ['PYTHONPATH'] = '/usr/local/share/khmer/python'\n", "sys.path.insert(0, '/usr/local/share/khmer/python')\n", "\n", "import khmer\n", "import screed" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "cd /mnt" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "mkdir -p $username" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "cd $username" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You should now be in '/mnt/USERNAME', where USERNAME is *not* CHANGEME" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Make two random genomes with a 10:1 abundance" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import random\n", "random.seed(1)\n", "\n", "x = [\"A\"] + [\"G\"] + [\"C\"] + [\"T\"]\n", "x = x*1000\n", "random.shuffle(x)\n", "x = \"\".join(x)\n", "\n", "y = [\"A\"] + [\"G\"] + [\"C\"] + [\"T\"]\n", "y = y*1000\n", "random.shuffle(y)\n", "y = \"\".join(y)\n", "\n", "print 'x is', x[:100]\n", "print 'y is', y[:100]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "outfp = open('metagenome.fa', 'w')\n", "print >>outfp, \">x 1\"\n", "print >>outfp, x\n", "print >>outfp, \">y 2\"\n", "print >>outfp, y\n", "outfp.close()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "!python /usr/local/share/2012-paper-diginorm/pipeline/make-biased-reads.py metagenome.fa | head -100000 > reads.fa" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(Yes, you should see an error.)\n", "\n", "## Now, apply digital normalization & error trimming" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!/usr/local/share/khmer/scripts/normalize-by-median.py -k 20 -C 20 -x 1e8 reads.fa --savehash normC20k20.kh" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "!/usr/local/share/khmer/scripts/filter-abund.py normC20k20.kh reads.fa.keep" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "!/usr/local/share/khmer/scripts/normalize-by-median.py -k 20 -C 5 -x 1e8 reads.fa.keep.abundfilt" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "ls" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "!/usr/local/share/khmer/scripts/abundance-dist-single.py -x 1e8 -k 20 reads.fa.keep reads.fa.keep.hist\n", "!/usr/local/share/khmer/scripts/abundance-dist-single.py -x 1e8 -k 20 reads.fa.keep.abundfilt reads.fa.ka.hist\n", "!/usr/local/share/khmer/scripts/abundance-dist-single.py -x 1e8 -k 20 reads.fa.keep.abundfilt.keep reads.fa.kak.hist" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "dn1 = numpy.loadtxt('reads.fa.keep.hist')\n", "abund = numpy.loadtxt('reads.fa.ka.hist')\n", "dn2 = numpy.loadtxt('reads.fa.kak.hist')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "plot(dn1[:,0], dn1[:,1], label='first round')\n", "plot(abund[:,0], abund[:,1], label='trim errors')\n", "plot(dn2[:,0], dn2[:,1], label='second round')\n", "axis(ymax=2500)\n", "legend()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }