{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2015-06-26 14:53:45-- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_1.filt.fastq.gz\n", " => ‘SRR003265_1.filt.fastq.gz’\n", "Resolving ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)... 193.62.192.8\n", "Connecting to ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)|193.62.192.8|:21... connected.\n", "Logging in as anonymous ... Logged in!\n", "==> SYST ... done. ==> PWD ... done.\n", "==> TYPE I ... done. ==> CWD (1) /vol1/ftp/phase3/data/NA18489/sequence_read ... done.\n", "==> SIZE SRR003265_1.filt.fastq.gz ... 502660639\n", "==> PASV ... done. ==> RETR SRR003265_1.filt.fastq.gz ... done.\n", "Length: 502660639 (479M) (unauthoritative)\n", "\n", "SRR003265_1.filt.fa 100%[=====================>] 479.37M 8.72MB/s in 68s \n", "\n", "2015-06-26 14:54:53 (7.07 MB/s) - ‘SRR003265_1.filt.fastq.gz’ saved [502660639]\n", "\n", "--2015-06-26 14:54:53-- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_2.filt.fastq.gz\n", " => ‘SRR003265_2.filt.fastq.gz’\n", "Resolving ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)... 193.62.192.8\n", "Connecting to ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)|193.62.192.8|:21... connected.\n", "Logging in as anonymous ... Logged in!\n", "==> SYST ... done. ==> PWD ... done.\n", "==> TYPE I ... done. ==> CWD (1) /vol1/ftp/phase3/data/NA18489/sequence_read ... done.\n", "==> SIZE SRR003265_2.filt.fastq.gz ... 484084218\n", "==> PASV ... done. ==> RETR SRR003265_2.filt.fastq.gz ... done.\n", "Length: 484084218 (462M) (unauthoritative)\n", "\n", "SRR003265_2.filt.fa 100%[=====================>] 461.66M 5.02MB/s in 91s \n", "\n", "2015-06-26 14:56:24 (5.08 MB/s) - ‘SRR003265_2.filt.fastq.gz’ saved [484084218]\n", "\n" ] } ], "source": [ "!rm -f SRR003265_1.filt.fastq.gz 2>/dev/null\n", "!rm -f SRR003265_2.filt.fastq.gz 2>/dev/null\n", "!wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_1.filt.fastq.gz\n", "!wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_2.filt.fastq.gz" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from __future__ import division, print_function" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import gzip\n", "from Bio import SeqIO" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of pairs: 9170808\n" ] } ], "source": [ "import sys\n", "if sys.version_info[0] == 2:\n", " import itertools\n", " my_zip = itertools.izip\n", "else:\n", " my_zip = zip\n", "f1 = gzip.open('SRR003265_1.filt.fastq.gz', 'rt')\n", "f2 = gzip.open('SRR003265_2.filt.fastq.gz', 'rt')\n", "recs1 = SeqIO.parse(f1, 'fastq')\n", "recs2 = SeqIO.parse(f2, 'fastq')\n", "cnt = 0\n", "for rec1, rec2 in my_zip(recs1, recs2):\n", " cnt +=1\n", "print('Number of pairs: %d' % cnt)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "1000 loops, best of 3: 828 µs per loop\n", "The slowest run took 13.50 times longer than the fastest. This could mean that an intermediate result is being cached \n", "10000000 loops, best of 3: 141 ns per loop\n", "The slowest run took 10.96 times longer than the fastest. This could mean that an intermediate result is being cached \n", "10000000 loops, best of 3: 174 ns per loop\n" ] } ], "source": [ "print(type(range(100000)))\n", "print(type(xrange(100000)))\n", "%timeit range(100000)\n", "%timeit xrange(100000)\n", "%timeit xrange(100000)[5000]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1000 loops, best of 3: 828 µs per loop\n", "The slowest run took 23.51 times longer than the fastest. This could mean that an intermediate result is being cached \n", "1000000 loops, best of 3: 172 ns per loop\n", "1000 loops, best of 3: 831 µs per loop\n", "The slowest run took 10.88 times longer than the fastest. This could mean that an intermediate result is being cached \n", "10000000 loops, best of 3: 175 ns per loop\n", "1000 loops, best of 3: 829 µs per loop\n", "The slowest run took 9.39 times longer than the fastest. This could mean that an intermediate result is being cached \n", "1000000 loops, best of 3: 203 ns per loop\n" ] } ], "source": [ "%timeit range(100000)[10]\n", "%timeit xrange(100000)[10]\n", "%timeit range(100000)[50000]\n", "%timeit xrange(100000)[50000]\n", "%timeit range(100000)[-1]\n", "%timeit xrange(100000)[-1]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] } ], "source": [ "print(type(range(10)))\n", "print(type(zip([10])))\n", "print(type(filter(lambda x: x > 10, [10, 11])))\n", "print(type(map(lambda x: x + 1, [10])))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "23\n" ] } ], "source": [ "big_10 = filter(lambda x: x > 10, [9, 10, 11, 23])\n", "big_10_list = list(big_10)\n", "print(big_10_list[1])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-0.0174188277631\n" ] } ], "source": [ "import numpy as np\n", "from Bio import SeqUtils\n", "from Bio.Alphabet import IUPAC\n", "\n", "f = gzip.open('SRR003265_2.filt.fastq.gz', 'rt')\n", "recs = SeqIO.parse(f, 'fastq', alphabet=IUPAC.unambiguous_dna)\n", "i = 0\n", "sum_skews = 0\n", "for rec in recs:\n", " if np.median(rec.letter_annotations['phred_quality']) < 40:\n", " continue\n", " skew = SeqUtils.GC_skew(rec.seq)[0]\n", " sum_skews += skew\n", " if i == 1000:\n", " break\n", " i += 1\n", "print (sum_skews / (i + 1))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-0.0174188277631\n" ] } ], "source": [ "\n", "def get_gcs(recs):\n", " for rec in recs:\n", " yield SeqUtils.GC_skew(rec.seq)[0]\n", "\n", "def filter_quality(recs):\n", " for rec in recs:\n", " if np.median(rec.letter_annotations['phred_quality']) >= 40:\n", " yield rec\n", " \n", "f = gzip.open('SRR003265_2.filt.fastq.gz', 'rt')\n", "recs = SeqIO.parse(f, 'fastq', alphabet=IUPAC.unambiguous_dna)\n", "sum_skews = 0\n", "for i, skew in enumerate(get_gcs(filter_quality(recs))):\n", " sum_skews += skew\n", " if i == 1000:\n", " break\n", "print (sum_skews / (i + 1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }