{
 "metadata": {
  "name": "using-screed"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Using screed\n",
      "\n",
      "Short examples of reading a gzipped FASTQ file with `screed`: peeking at one record, ",
      "selecting records by name, computing G/C content, and writing the sequences out in FASTA format.\n",
      "\n",
      "Record names as returned by `screed` do **not** include the leading `@` of the FASTQ header line."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Single place where the input path is defined; every later cell reads `filename`.\n",
      "filename = '/Users/t/dev/2012-scripps/python/25k.fq.gz'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import screed\n",
      "\n",
      "# Peek at the first record only: its name, sequence and accuracy fields.\n",
      "for record in screed.open(filename):\n",
      "    print record.name\n",
      "    print record.sequence\n",
      "    print record.accuracy\n",
      "    break"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "895:1:1:1246:14654/1\n",
        "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT\n",
        "][aaX__aa[`ZUZ[NONNFNNNNNO_____^RQ_\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Record names carry no leading '@', so the target must be written without it;\n",
      "# with the '@' the comparison never matches and the loop runs to the last record.\n",
      "target_name = '895:1:4:1596:8538/2'\n",
      "\n",
      "found = None   # stays None when no record has the requested name\n",
      "for record in screed.open(filename):\n",
      "    if record.name == target_name:\n",
      "        found = record\n",
      "        break\n",
      "\n",
      "print found"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def records_with_names(path, names):\n",
      "    \"\"\"Return the records read from `path` whose name is in `names`, in file order.\"\"\"\n",
      "    wanted = set(names)   # set membership instead of scanning a list per record\n",
      "    return [record for record in screed.open(path) if record.name in wanted]\n",
      "\n",
      "list_of_names = ['895:1:4:1596:8538/2', '895:1:4:1596:6003/2']\n",
      "list_of_records = records_with_names(filename, list_of_names)\n",
      "\n",
      "list_of_records"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 19,
       "text": [
        "[{'annotations': '', 'sequence': 'TCGCTCAGTTTCTTCGGCATATTGATCTGAACTTCGACGTAAAGATCGCCACGCTTTCCTGAGACTCCC', 'name': '895:1:4:1596:8538/2', 'accuracy': '_________X```]ZZZX]]KXXXY____X________________________X___X______``P`'},\n",
        " {'annotations': '', 'sequence': 'TACGGTCTTTTGAACTCATTGATGGCCCTGCGAAACGCCGCCTCAAGCTGTGCTCCGTAA', 'name': '895:1:4:1596:6003/2', 'accuracy': '_____aMUV]VQFJZZVZ]VZZMPZP[Q\\\\X]]S]S````X__X_WUZZMU________P_'}]"
       ]
      }
     ],
     "prompt_number": 19
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%file list-of-seqs.txt\n",
      "895:1:4:1596:8538/2\n",
      "895:1:4:1596:6003/2"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Writing list-of-seqs.txt\n"
       ]
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# One sequence name per line; the 'with' block closes the file when done.\n",
      "with open('list-of-seqs.txt') as names_fp:\n",
      "    x = [line.strip() for line in names_fp]\n",
      "x"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 21,
       "text": [
        "['895:1:4:1596:8538/2', '895:1:4:1596:6003/2']"
       ]
      }
     ],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Same selection as before, but with the names read from list-of-seqs.txt.\n",
      "list_of_names = x\n",
      "list_of_records = records_with_names(filename, list_of_names)\n",
      "\n",
      "list_of_records"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 22,
       "text": [
        "[{'annotations': '', 'sequence': 'TCGCTCAGTTTCTTCGGCATATTGATCTGAACTTCGACGTAAAGATCGCCACGCTTTCCTGAGACTCCC', 'name': '895:1:4:1596:8538/2', 'accuracy': '_________X```]ZZZX]]KXXXY____X________________________X___X______``P`'},\n",
        " {'annotations': '', 'sequence': 'TACGGTCTTTTGAACTCATTGATGGCCCTGCGAAACGCCGCCTCAAGCTGTGCTCCGTAA', 'name': '895:1:4:1596:6003/2', 'accuracy': '_____aMUV]VQFJZZVZ]VZZMPZP[Q\\\\X]]S]S````X__X_WUZZMU________P_'}]"
       ]
      }
     ],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Floats, so that the division below is true division under Python 2.\n",
      "gc_bases = 0.0\n",
      "called_bases = 0.0\n",
      "for record in screed.open(filename):\n",
      "    seq = record.sequence\n",
      "    called_bases += len(seq) - seq.count('N')   # 'N' characters are excluded from the total\n",
      "    gc_bases += seq.count('G') + seq.count('C')\n",
      "\n",
      "gc_fraction = gc_bases / called_bases\n",
      "\n",
      "print '%g G/C content from %d and %d' % (gc_fraction, gc_bases, called_bases) # string interpolation in Python\n",
      "print '%g G/C content' % (gc_fraction,) # string interpolation in Python\n",
      "\n",
      "print gc_fraction, 'G/C content'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "0.597727 G/C content from 1030420 and 1723896\n",
        "0.597727 G/C content\n",
        "0.597727473119 G/C content\n"
       ]
      }
     ],
     "prompt_number": 33
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Write every record as FASTA; the 'with' block flushes and closes the output file.\n",
      "with open('/tmp/out.fa', 'w') as outfp:\n",
      "    for record in screed.open(filename):\n",
      "        outfp.write('>%s\\n%s\\n' % (record.name, record.sequence))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 36
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%pwd"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 35,
       "text": [
        "u'/Users/t/dev/2012-scripps/python'"
       ]
      }
     ],
     "prompt_number": 35
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}