{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2015-06-26 15:01:09-- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_1.filt.fastq.gz\n", " => ‘SRR003265_1.filt.fastq.gz’\n", "Resolving ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)... 193.62.192.8\n", "Connecting to ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)|193.62.192.8|:21... connected.\n", "Logging in as anonymous ... Logged in!\n", "==> SYST ... done. ==> PWD ... done.\n", "==> TYPE I ... done. ==> CWD (1) /vol1/ftp/phase3/data/NA18489/sequence_read ... done.\n", "==> SIZE SRR003265_1.filt.fastq.gz ... 502660639\n", "==> PASV ... done. ==> RETR SRR003265_1.filt.fastq.gz ... done.\n", "Length: 502660639 (479M) (unauthoritative)\n", "\n", "SRR003265_1.filt.fa 100%[=====================>] 479.37M 11.1MB/s in 57s \n", "\n", "2015-06-26 15:02:06 (8.43 MB/s) - ‘SRR003265_1.filt.fastq.gz’ saved [502660639]\n", "\n", "--2015-06-26 15:02:06-- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_2.filt.fastq.gz\n", " => ‘SRR003265_2.filt.fastq.gz’\n", "Resolving ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)... 193.62.192.8\n", "Connecting to ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)|193.62.192.8|:21... connected.\n", "Logging in as anonymous ... Logged in!\n", "==> SYST ... done. ==> PWD ... done.\n", "==> TYPE I ... done. ==> CWD (1) /vol1/ftp/phase3/data/NA18489/sequence_read ... done.\n", "==> SIZE SRR003265_2.filt.fastq.gz ... 484084218\n", "==> PASV ... done. ==> RETR SRR003265_2.filt.fastq.gz ... done.\n", "Length: 484084218 (462M) (unauthoritative)\n", "\n", "SRR003265_2.filt.fa 100%[=====================>] 461.66M 10.2MB/s in 62s \n", "\n", "2015-06-26 15:03:08 (7.50 MB/s) - ‘SRR003265_2.filt.fastq.gz’ saved [484084218]\n", "\n" ] } ], "source": [ "!rm -f SRR003265_1.filt.fastq.gz 2>/dev/null\n", "!rm -f SRR003265_2.filt.fastq.gz 2>/dev/null\n", "!wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_1.filt.fastq.gz\n", "!wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265_2.filt.fastq.gz" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from __future__ import division, print_function\n", "import gzip\n", "import numpy as np\n", "from Bio import SeqIO, SeqUtils\n", "from Bio.Alphabet import IUPAC" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-0.00837274048503\n" ] } ], "source": [ "f = gzip.open('SRR003265_2.filt.fastq.gz', 'rt')\n", "recs = SeqIO.parse(f, 'fastq', alphabet=IUPAC.unambiguous_dna)\n", "sum_skews = 0\n", "for i, rec in enumerate(recs):\n", " skew = SeqUtils.GC_skew(rec.seq)[0]\n", " sum_skews += skew\n", " if i == 1000:\n", " break\n", "print (sum_skews / (i + 1))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-0.00837274048503\n" ] } ], "source": [ "def get_gcs(recs):\n", " for rec in recs:\n", " yield SeqUtils.GC_skew(rec.seq)[0]\n", "\n", "f = gzip.open('SRR003265_2.filt.fastq.gz', 'rt')\n", "recs = SeqIO.parse(f, 'fastq', alphabet=IUPAC.unambiguous_dna)\n", "sum_skews = 0\n", "for i, skew in enumerate(get_gcs(recs)):\n", " sum_skews += skew\n", " if i == 1000:\n", " break\n", "print (sum_skews / (i + 1))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-0.0174188277631\n" ] } ], "source": [ "f = gzip.open('SRR003265_2.filt.fastq.gz', 'rt')\n", "recs = SeqIO.parse(f, 'fastq', alphabet=IUPAC.unambiguous_dna)\n", "i = 0\n", "sum_skews = 0\n", "for rec in recs:\n", " if np.median(rec.letter_annotations['phred_quality']) < 40:\n", " continue\n", " skew = SeqUtils.GC_skew(rec.seq)[0]\n", " sum_skews += skew\n", " if i == 1000:\n", " break\n", " i += 1\n", "print (sum_skews / (i + 1))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-0.0174188277631\n" ] } ], "source": [ "def get_gcs(recs):\n", " for rec in recs:\n", " yield SeqUtils.GC_skew(rec.seq)[0]\n", "\n", "def filter_quality(recs):\n", " for rec in recs:\n", " if np.median(rec.letter_annotations['phred_quality']) >= 40:\n", " yield rec\n", " \n", "f = gzip.open('SRR003265_2.filt.fastq.gz', 'rt')\n", "recs = SeqIO.parse(f, 'fastq', alphabet=IUPAC.unambiguous_dna)\n", "sum_skews = 0\n", "for i, skew in enumerate(get_gcs(filter_quality(recs))):\n", " sum_skews += skew\n", " if i == 1000:\n", " break\n", "print (sum_skews / (i + 1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }