{ "metadata": { "name": "", "signature": "sha256:1e2d45b3d89a4e731133b298599c8460c622960f6c60392adefa41485b058c92" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "from os.path import expandvars\n", "\n", "from skbio import DNA, Alignment, SequenceCollection\n", "from skbio.alignment import global_pairwise_align_nucleotide\n", "\n", "# Each alignment is degapped as part of loading, so a name is never bound to the\n", "# gapped alignment first and then silently rebound to its degapped form.\n", "core_set_fp = expandvars(\"$HOME/data/greengenes_core_sets/core_set_aligned.fasta.imputed\")\n", "core_set = Alignment.read(core_set_fp).degap()\n", "\n", "otus_85_ssu_fp = expandvars('$HOME/data/gg_13_8_otus/rep_set_aligned/85_otus.fasta')\n", "otus_85_ssu = Alignment.read(otus_85_ssu_fp).degap()\n", "\n", "otus_85_pyn_fp = expandvars('$HOME/data/gg_13_8_otus/rep_set_aligned/85_otus.pynast.fasta')\n", "otus_85_pyn = Alignment.read(otus_85_pyn_fp).degap()\n", "\n", "# The unaligned set is read as a plain SequenceCollection and left untouched.\n", "otus_85_unaligned_fp = expandvars('$HOME/data/gg_13_8_otus/rep_set/85_otus.fasta')\n", "otus_85_unaligned = SequenceCollection.read(otus_85_unaligned_fp)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "otus_85_unaligned" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 2, "text": [ "" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "otus_85_pyn" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ "" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "otus_85_ssu" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ "" ] } ], "prompt_number": 4 },
{ "cell_type": "code", "collapsed": false, "input": [ "# Flag every id whose degapped .pynast.fasta sequence differs from the unaligned\n", "# sequence with the same id, and record the pairwise-alignment distance between them.\n", "not_equal = []\n", "for seq_id in otus_85_pyn.ids():\n", "    pyn_seq = str(otus_85_pyn[seq_id])\n", "    unaligned_seq = str(otus_85_unaligned[seq_id])\n", "    if pyn_seq == unaligned_seq:\n", "        continue\n", "    try:\n", "        outcome = global_pairwise_align_nucleotide(pyn_seq, unaligned_seq).distances()[0, 1]\n", "    except ValueError:\n", "        # NOTE(review): the message assumes the ValueError is caused by N characters -- confirm.\n", "        outcome = \"Sequence contains N's, skipping.\"\n", "    not_equal.append((seq_id, outcome))\n", "    # Single-argument print(...) behaves the same on Python 2 and Python 3.\n", "    print(not_equal[-1])\n", "print(len(not_equal))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "('1110814', 0.0013458950201884253)\n", "('1109993', 0.0013605442176870747)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('1109948', 0.0012978585334198572)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('1107031', 0.0013395847287340924)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('851865', 0.0022865853658536584)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('851822', 0.0020891364902506965)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('847804', 0.00069444444444444447)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('819699', 0.00067842605156037987)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('813266', 0.00067159167226326397)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('810721', 0.0019815059445178335)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('769319', 0.0019442644199611147)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('674402', 0.0013183915622940012)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('674285', 0.00068540095956134343)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('660890', 0.0013360053440213762)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('566326', 0.0013395847287340924)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('566061', 
0.0013395847287340924)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('556736', 0.00071530758226037196)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('550826', 0.002054794520547945)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('535799', 0.0014336917562724014)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('529491', 0.006405693950177936)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('528234', 0.00071530758226037196)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('522863', 0.0014440433212996389)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('517210', \"Sequence contains N's, skipping.\")\n", "('514547', \"Sequence contains N's, skipping.\")\n", "('512076', 0.00071684587813620072)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('511139', 0.0014825796886582653)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('510608', 0.0014285714285714286)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('354920', 0.0064148253741981472)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('353530', 0.0014367816091954023)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('349931', 0.0019933554817275745)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('347595', 0.00065746219592373442)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('346914', 0.0014357501794687725)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('344506', \"Sequence contains N's, skipping.\")\n", "('343387', 0.0019880715705765406)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('343009', 0.001440922190201729)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('340937', 0.00066889632107023408)" ] }, { "output_type": "stream", "stream": 
"stdout", "text": [ "\n", "('340665', 0.0014336917562724014)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('339598', 0.00067294751009421266)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('339005', 0.0019973368841544607)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "('338374', 0.0064655172413793103)" ] } ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Misc. notes from here..." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Single-argument print(...) behaves the same on Python 2 and Python 3.\n", "print(global_pairwise_align_nucleotide(str(otus_85_pyn['674285']), str(otus_85_unaligned['674285'])).distances()[0,1])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.000685400959561\n" ] } ], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "core_set.distribution_stats()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 14, "text": [ "(4938, 1521.4262859457269, 24.17461148474332)" ] } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "# `otus_85` is used from here on but is never assigned anywhere in this notebook,\n", "# so a fresh kernel fails with NameError. Bind it to the degapped\n", "# rep_set_aligned/85_otus.fasta object loaded in the first cell.\n", "# NOTE(review): assumed to be the intended object (the recorded 5088-sequence\n", "# stats below differ from the .pynast set) -- confirm.\n", "otus_85 = otus_85_ssu\n", "otus_85.distribution_stats()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 15, "text": [ "(5088, 1269.1540880503144, 44.555375046698863)" ] } ], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "otus_85_pyn.distribution_stats()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 108, "text": [ "(4797, 1425.0273087346259, 60.454162283929563)" ] } ], "prompt_number": 108 }, { "cell_type": "code", "collapsed": false, "input": [ "# Ids present in both the 85% OTU set and the core set.\n", "common_ids = list(set(otus_85.ids()) & (set(core_set.ids())))\n", "len(common_ids)\n", "print(common_ids)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['203503', '103940', '161507', '74066', 
'35786', '88193', '101764', '144333', '227846', '203137', '201355', '4146', '158611', '107881', '109057', '177228', '217784', '138007', '220160', '193092', '202746', '2917', '179300', '151811', '3490', '207151', '98761', '99461', '131207', '193598', '138301', '93530', '1663', '140806', '159742', '52397', '92430', '161484', '139526', '17862', '223031', '114856', '174109', '217989', '144625', '43971', '104067', '111274', '206066', '140938', '211314', '137541', '172015', '198375', '134265', '102448', '153802', '170060', '79590', '155181', '182630', '180127', '109624', '214891', '179000', '184252', '224506', '8194', '100246', '3892', '88848', '180402', '33606', '38917', '131533', '209414', '148047', '184401', '106627', '112651', '145658', '51807', '153746', '184661', '110224', '16195', '209483', '114724', '222095', '191213', '166367', '162211', '159293', '218559', '51154', '233513', '79191', '66919', '136119', '208896', '19806', '141088', '227130', '202139', '170145', '182569', '32172', '234397', '218058', '28592', '2534', '67842', '109663', '21', '105940', '6321', '141010', '90217', '58625', '203454', '150102', '110393', '150264', '55626', '155396', '13725', '131162', '210997', '192222', '232590', '158624', '60618', '211918', '33001', '148676', '32823', '151965', '213210', '142736', '17507', '161403', '161407', '1711', '175358', '140122', '201138', '7', '56503', '176242', '82334', '209806', '153017', '153403', '150680', '150441', '106474', '148318', '95345', '28705', '27500', '149351', '193107', '74136', '220200', '159367', '201556', '177176', '136895', '1006', '193666', '178045', '1222', '1220', '191032', '35350', '183051', '80101', '143399', '186578', '52036', '216234', '216235', '106876', '170175', '231994', '158117', '114082', '107797', '111582', '217227', '180430', '164402', '209664', '173296', '184873', '90508', '185977', '162509', '883', '203525', '221562', '159945', '108724', '19323', '215574', '888', '15704', '203529', '43314', '159203', '93487', '217717', 
'22294', '110020', '200608', '165300', '110455', '223835', '206331', '44257', '198577', '44156', '183870', '771', '164293', '772', '222456', '109549', '230258', '141239', '214270', '192697', '144924', '155335', '51926', '142268', '18783', '208292', '21860', '114305', '224817', '130062', '160813', '26708', '28429', '111025', '141395', '114578', '57451', '144458', '48617', '215025', '132414', '64372', '209786', '221463', '143944', '234668', '103462', '111958', '159862', '182617', '207920', '110470', '203449', '218453', '232927', '232888', '37026', '231758', '838', '206038', '146428', '58627', '203188', '180467', '92321', '149095', '105480', '225943', '221738', '93199', '107637', '145173', '108673', '91130', '200948', '161220', '185012', '146556', '170463', '213358', '150571', '112707', '37803', '151616', '141197', '159172', '45967', '159170', '1429', '105025', '2563', '158207', '137398', '178547', '166591', '99346', '183580', '196844', '140910', '202302', '143387', '51677', '207211', '143099', '144777', '174018', '218700', '183107', '178438', '41280', '107789', '163874', '195743', '218850', '165063', '161018', '3538', '200762', '89297', '43095', '221823', '151438', '134564', '102556', '10485', '98560', '220089', '144490', '111520', '195985', '169822', '142674', '164913', '13031', '181596', '32511', '145732', '160498', '181986', '45082', '2843', '101480', '234971', '189190', '171242', '160113', '100774', '135186', '6010', '143031', '135632', '143037', '146921', '132390', '199942', '20204', '104684', '139265', '52', '129486', '191378', '209792', '136414', '93838', '201939', '40435', '217293', '91225', '230061', '161126', '123055', '191183', '80284', '136950', '50807', '18466', '145762', '158641', '111406', '125020', '174974', '152225', '164420', '220131', '18867', '114291', '176499', '195325', '112809', '19343', '203276', '207228', '153923', '150191', '220728', '100328', '213436', '37498', '198417', '206757', '150741', '111066', '104534', '72055', '142856', '204874', 
'33564', '165553', '100020', '111614', '115256', '45149', '25562', '26642', '176053', '150632', '195496', '49', '111100', '182857', '101465', '2508', '42311', '141661', '34789', '200020', '197460', '136293', '79153', '160555', '34546', '101908', '49231', '32849', '148516', '63784', '152353', '185950', '224797', '145549']\n" ] } ], "prompt_number": 52 }, { "cell_type": "code", "collapsed": false, "input": [ "otus_85['203503']" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 23, "text": [ "" ] } ], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "core_set['203503']" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 39, "text": [ "" ] } ], "prompt_number": 39 }, { "cell_type": "code", "collapsed": false, "input": [ "core_set['90040']" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 38, "text": [ "" ] } ], "prompt_number": 38 }, { "cell_type": "code", "collapsed": false, "input": [ "denovo1009 = DNA(\"GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGATGAAGCCTAGCTTGCTAGGTGGATTAGTGGCGAACGGGTGAGTAATACGTGAGTAACCTACCTTTAACTCTGGGATAAGCCCGGGAAACTGGGTCTAATACCGGATACGACCAATCTTCGCATGGGGGTTGGTGGAAAGGTTTGTTCTGGTGGGGGATGGGCTCGCGGCCTATCAGCTTGTTGGTGGGGTGATGGCCTACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGTCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGAAAGCCTGATGCAGCGACGCCGCGTGAGGGATGGAGGCCTTCGGGTTGTAAACCTCTTTCGCTCATGGTCAAGCCGCAACTGTGGGTTGTGGTGAGGGTAGTGGGGTAAAGAAGCGCCGGGCTAACTCCGTGCCAGCAGCCGCGGTAATGACTGCCAAGGG\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": [ "ac = global_pairwise_align_nucleotide(denovo1009, core_set['90040'])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 40 }, { "cell_type": "code", "collapsed": false, "input": [ 
"print(ac.to_fasta())" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ">0\n", "--------------------GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAAC-GATGAAGCCTAGC---TTGCTAGGTGGATTAGTGGCGAACGGGTGAGTAATACGTGAGTAACCTACCTTTAACTCTGGGATAAGCCCGGGAAACTGGGTCTAATACCGGATACGACCAATCTTCGCATGGGGGTTGGTGGAAAGGTTTGTTCTGGTGGGGGATGGGCTCGCGGCCTATCAGCTTGTTGGTGGGGTGATGGCCTACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGTCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGAAAGCCTGATGCAGCGACGCCGCGTGAGGGATGGAGGCCTTCGGGTTGTAAACCTCTTTCGCTCATGGTCAAGCCGCAACTGTGGGTTGTGGTGAGGGTAGTGGGGTAAAGAAGCGCCGGGCTAACTCCGTGCCAGCAGCCGCGGTAATGACTGCCAAGGG--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", ">90040\n", 
"AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGACTGGCCCTGGCTTTTTTTGGGGTTGGTGAGTGGCGAACGGGTGAGTAACACGTGAGTAACCTGCCCTTGACTCTGGGATAAGCGCCGGAAACGGTGTCTAATACTGGATGTGACTTCTGCCTGCATGGGTGGGGGTGGAAAGATTTTTT--GGTTGGGGATGGGCTCGCGGCTTATCAGCTTGTTGGTGGGGTGATGGCTTACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGTCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGGAAGCCTGATGCAGCGACGCCGCGTGAGGGATGGAGGCCTTCGGGTTGTGAACCTCTTTCGCTCGTGGTCAAGCCGCCCCTGTGGGGGGTGGTGAGGGTAGCGGG-TAAAGAAGCTCCGG-CTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGAGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTTGTAGGCGGTTTGTTGCGTCTGCCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGTGGGTACGGGCAGGCTTGAGTGCGGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCGTTACTGACGCTGAGGAGCGAAACGTTGGGGAGCGAACAGGATTAAATACCCTGGTAGTCCATGCTGTAAACGTTGGGCACTAGGTGTTGGGGCCTCCCTTGGTTTCGCGCCGTAGCTAACGCTTTAATTGCCCCCCTGGGGAGTACGGCGCAAGCTAAAACTCCAAGGAATTGACGGGGGCCCGCAAAACGGGGGGAGCATGCGGATTAATTCGATGCAACCGCAAAGAACCTTACCAAGGCTTGACATGCGCCCTGAGCACCCTGAGTGGGTGCCATTGGTTGGGGGTGCCAGGGGTCGCATGGTTGTCGTCAGCTCGTGTCGTAGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCACCCTTGCCCTATGTTGCCAGCACGTGATGGTGGGGACTCGTGGGGGACTGCCGGGGTTAACTCGGAGGAAGGTGGGGATGACGTCAAATCATCATGCCCCTTATGTCTTGGGCTCCACGCATGCTACAATGGCTGGTACAGAGGGTTGCGATGCCGTGAGGTGGAGCGAATCCCTTAAAGCTGGTCTCAGTTCGGATTGGGGTCTGCAACTCGACCCCATGAAGGTGGAGTCGCTAGTAATCGCAGATCAGCAGTGCTGCGGTGAATACGTTCTCGGGCCTTGTACACACCGCCCGTCACGTCATGAAAGTTGGTAACGCCCGAAGCCCATGGCCTAACCGGTTTGTCTGGGGGGAGTGGTCGAAGGTGGGACTGGCGATTGGGACGAAGTCGTAACAAGGTAGCCGTACCGGAAGGTGCGGCTGGATCACCTCCTT\n", "\n" ] } ], "prompt_number": 41 }, { "cell_type": "code", "collapsed": false, "input": [ "ac = global_pairwise_align_nucleotide(otus_85['214270'], str(core_set['214270']).upper())" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 58 }, { "cell_type": "code", "collapsed": false, "input": [ "print(ac.to_fasta())" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ 
">214270\n", "-GAGTTTGATCCTGGCTCAGGACGAACGCTAGCGGCGTGTTTGAAACATGCAAGTCGAA-------------------CGGTGGCGAACGGGTGCGTAACACGTAAAGAACCTACCCTTTAGCGGGGGATAGCCTTGGGAAACCGGGGGTAATACCGCATAAG----------------------------------GAAA-GTCACTGAAGGAGGGCTTTGCTGCCTATCAGGTAGTTGGTGAGGTAAAGGCCCACCAAGCCTAAGACGGGTAGCTGGTCTGAGAGGATGGTCAGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGGGGCAGCAGTTTAGAATATTGTGCAATGGGCGAAAGCCTGACACAGCGACGCCGCGTGGGTGATGAAGGCCTTCGGGTCGTAAAGCCCTGTTGGTAGGGATGA----GGACAGTACCTACCGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCTGCGGTAATACGTAGGGGGCGAGCGTTGTCCGAAGTTACTGGGCGTAAAGCGCACGTAGGCGGCATTGTAAGTTGCATTTGAAATTTGACGGCTCAACCGTCAAAAGTGGTGTAAGACTGCAAAGCTTGAGGCAATCAAAAGAATGTGGAACTCCGAGTGTAGCGGTGGAATGCGTAGAGATTCGGAAGAACACCCATGGCGAAGGCAGCATTCTGGGATT--CCTGACGCTGAGGAGCGAAAGCCAGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCTGGCCGTAAACGATGGATACTAGGTGTA-GGGGCCT---------CTGTGCCGAAGCTAACGCAATAAGTATCCCGCCTGGGGAGTACGGTCGCAAGGCTAAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGGTTTGACAT--------------------------------------------CGACAGGTGCTGCATGGCTGTCGTCAGCTCGTATCGTGAGATGTCGGGTTAAGTCCCTTAACGAGCGCAACCCTTGTCCTTAGTTGCCATC------ATGGGCACTCTAAGGAGACCGCCGGTGTTAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGCCCTTATGTCCAGGGCTACACACGTGCTACAATGGCCGG--CAAAGGGATGCTAACCTGTAAGGGGGAGCTAACCTC----AACCGGTCTCAGTTCGGA-TGAAGTCTGCAATTCGACTTCA-GAAGCTGGAATCGCTAGTAATCGCGGATCAGAACGCCGCGGTGAATACGTTCCCGGGCCTTGCACACAC----------------------------------------------------------------------------------------------------------------------------------------------\n", ">1\n", 
"AGAGTTTGATCCTGGCTCAGGACGAACGCTAGCGGCGTGTTTGAAACATGCAAGTCGAACGGAGTGTAGTAATACATTCGGTGGCGAACGGGTGCGTAACACGTAAAGAACCTACCCTTTAGCGGGGGATAGCCTTGGGAAACCGGGGGTAATACCGCATAAGCAACTTTAGACGCATGTTTAGAGTTGGAAAGGCAGAAATGTCACTGAAGGAGGGCTTTGCTGCCTATCAGGTAGTTGGTGAGGTAAAGGCCCACCAAGCCTAAGACGGGTAGCTGGTCTGAGAGGATGGTCAGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGGGGCAGCAGTTTAGAATATTGTGCAATGGGCGAAAGCCTGACACAGCGACGCCGCGTGGGTGATGAAGGCCTTCGGGTCGTAAAGCCCTGTTGGTAGGGATGAGCAAGGACAGTACCTACCGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCTGCGGTAATACGTAGGGGGCGAGCGTTGTCCGAAGTTACTGGGCGTAAAGCGCACGTAGGCGGCATTGTAAGTTGCATTTGAAATTTGACGGCTCAACCGTCAAAAGTGGTGTAAGACTGCAAAGCTTGAGGCAATCAAAAGAATGTGGAACTCCGAGTGTAGCGGTGGAATGCGTAGAGATTCGGAAGAACACCCATGGCGAAGGCAGCATTCTGGGATTGACCTGACGCTGAGGAGCGAAAGCCAGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCTGGCCGTAAACGATGGATACTAGGTGTAGGGGGTATCGACCCCCTCTGTGCCGAAGCTAACGCAATAAGTATCCCGCCTGGGGAGTACGGTCGCAAGGCTAAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGGTTTGACATCCCCGGACAGCCCTAGAGATAGGGTTTCCTCTTCGGAGGCCGGGAGACAGGTGCTGCATGGCTGTCGTCAGCTCGTATCGTGAGATGTCGGGTTAAGTCCCTTAACGAGCGCAACCCTTGTCCTTAGTTGCCATCATTTAGTTGGGCACTCTAAGGAGACCGCCGGTGTTAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGCCCTTATGTCCAGGGCTACACACGTGCTACAATGGCCGGTACAAAGGGATGCTAACCTGTAAGGGGGAGCTAACCTCAAAAAGCCGGTCTCAGTTCGGATTGAAGTCTGCAATTCGACTTCATGAAGCTGGAATCGCTAGTAATCGCGGATCAGAACGCCGCGGTGAATACGTTCCCGGGCCTTGCACACACCGCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGCTGGATCACCTCCTT\n", "\n" ] } ], "prompt_number": 60 }, { "cell_type": "code", "collapsed": false, "input": [ "ac = global_pairwise_align_nucleotide(denovo1009, otus_85['214270'])\n", "print ac.to_fasta()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ">0\n", 
"GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGATGAAGCCTAGCTTGCTAGGTGGATTAGTGGCGAACGGGTGAGTAATACGTGAGTAACCTACCTTTAACTCTGGGATAAGCCCGGGAAACTGGGTCTAATACCGGATACGACCAATCTTCGCATGGGGGTTGGTGGAAAGGTTTGTTCTGGTGGGGGATGGGCTCGCGGCCTATCAGCTTGTTGGTGGGGTGATGGCCTACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGTCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGAAAGCCTGATGCAGCGACGCCGCGTGAGGGATGGAGGCCTTCGGGTTGTAAACCTCTTTCGCTCATGGTCAAGCCGCAACTGTGGGTTGTGGTGAGGGTAGTGGGGTAAAGAAGCGCCGGGCTAACTCCGTGCCAGCAGCCGCGGTAATGACTGCCAAGGG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", ">214270\n", 
"-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GAGTTTGATCCTGGCTCAGGACGAACGCTAGCGGCGTGTTTGAAACATGCAAGTCGAACGGTGGCGAACGGGTGCGTAACACGTAAAGAACCTACCCTTTAGCGGGGGATAGCCTTGGGAAACCGGGGGTAATACCGCATAAGGAAAGTCACTGAAGGAGGGCTTTGCTGCCTATCAGGTAGTTGGTGAGGTAAAGGCCCACCAAGCCTAAGACGGGTAGCTGGTCTGAGAGGATGGTCAGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGGGGCAGCAGTTTAGAATATTGTGCAATGGGCGAAAGCCTGACACAGCGACGCCGCGTGGGTGATGAAGGCCTTCGGGTCGTAAAGCCCTGTTGGTAGGGATGAGGACAGTACCTACCGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCTGCGGTAATACGTAGGGGGCGAGCGTTGTCCGAAGTTACTGGGCGTAAAGCGCACGTAGGCGGCATTGTAAGTTGCATTTGAAATTTGACGGCTCAACCGTCAAAAGTGGTGTAAGACTGCAAAGCTTGAGGCAATCAAAAGAATGTGGAACTCCGAGTGTAGCGGTGGAATGCGTAGAGATTCGGAAGAACACCCATGGCGAAGGCAGCATTCTGGGATTCCTGACGCTGAGGAGCGAAAGCCAGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCTGGCCGTAAACGATGGATACTAGGTGTAGGGGCCTCTGTGCCGAAGCTAACGCAATAAGTATCCCGCCTGGGGAGTACGGTCGCAAGGCTAAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGGTTTGACATCGACAGGTGCTGCATGGCTGTCGTCAGCTCGTATCGTGAGATGTCGGGTTAAGTCCCTTAACGAGCGCAACCCTTGTCCTTAGTTGCCATCATGGGCACTCTAAGGAGACCGCCGGTGTTAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGCCCTTATGTCCAGGGCTACACACGTGCTACAATGGCCGGCAAAGGGATGCTAACCTGTAAGGGGGAGCTAACCTCAACCGGTCTCAGTTCGGATGAAGTCTGCAATTCGACTTCAGAAGCTGGAATCGCTAGTAATCGCGGATCAGAACGCCGCGGTGAATACGTTCCCGGGCCTTGCACACAC\n", "\n" ] } ], "prompt_number": 68 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 61 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", 
"metadata": {}, "outputs": [], "prompt_number": 64 }, { "cell_type": "code", "collapsed": false, "input": [ "# `otus_85` is never assigned in this notebook; otus_85_ssu is the degapped\n", "# rep_set_aligned/85_otus.fasta object from the first cell.\n", "# NOTE(review): assumed to be what `otus_85` referred to -- confirm.\n", "print(len(otus_85_unaligned['214270']))\n", "print(len(otus_85_ssu['214270']))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1360\n", "1232\n" ] } ], "prompt_number": 67 }, { "cell_type": "code", "collapsed": false, "input": [ "# Resolve the path from $HOME, like the data paths in the first cell, instead of\n", "# hardcoding one user's home directory.\n", "seqs_515_fp = expandvars(\"$HOME/code/short-read-tax-assignment/data/simulated-community/B1-iter0/rep_set.fna\")\n", "seqs_515 = SequenceCollection.read(seqs_515_fp)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 89 }, { "cell_type": "code", "collapsed": false, "input": [ "# Ids shared by all three collections.\n", "print(set(seqs_515.ids()) & set(otus_85_ssu.ids()) & set(core_set.ids()))\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "set(['41280', '3490', '196844', '233513', '93487', '92430', '161484', '224506', '193107', '223031', '100246', '220728', '184873', '216235', '92321', '51154', '33606', '114578', '136119', '179300', '110393', '150680', '107637', '144458', '49231', '145173', '141088', '21', '32849', '220131', '142856', '220089', '223835', '158624', '159367', '225943', '138301', '142268', '132390', '32823', '145549'])\n" ] } ], "prompt_number": 97 }, { "cell_type": "code", "collapsed": false, "input": [ "print(global_pairwise_align_nucleotide(str(otus_85_ssu['41280']), str(seqs_515['41280'])).to_fasta())" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ">0\n", 
"ATCCTGGCTCAGGACGAACGCTGGCGGCGTGCCTAAACTATGCGAGTCGCGCGAGCGGCGAACGGGTGAGTAACGCGTGAGTAACCTGCCCCCCGGACCGGGATAACTCCGAGAAATCGGGGCTAATACCGGATATGGAAAGCCACCGAGGGAGGGGCTCGCGTCCTATCAGCTTGTTGGTGGGGTAAAGGCCTACCAAGGCTGCGACGGGTAGCCGGTCTGAGAGGATGGCCGGTCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGTAGCAAGTGGGAATCTTGCACAATGGACGAAAGTCTGATGCAGCGACGCCGCGTGAGGGATGAAGGCTCTCGGGTCGTAAACCTCTGTTGACAGGGAAGATGACGGTACCTGTCGAGGAAGCCTCGGCTAACTCCGTGCCAGCAGCCGCGGCAAGACGGGGGAGGCGAGCGTTGTCCGGAATGACTGGGCGTAAAGCGCGTGTAGGCGGGATTTTAAGTCAGCGGTGAAATCCCCTCGCTCAACGAGGGAACTGCCGCTGATACTGGGATTCTTGAGGGCAGGAGAGGAGAGCGGAATTGCCGGTGTAGCGGTAAAATGCGTAGAAATCGGCAAGAACACCCGTGGCGAAGGCGGCTCTCTGGACTG--CCTGACGCTGAGACGCGAAAGCTAGGGTAGCGAACGGGATTAGATACCCCGGTAGTCCTAGCCGTAAACGATGGTCACTAGGTGTCGGAACTTCGGCGCCGAAGTTAACGCATTAAGTGACCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGCTTAATTCGACGCAACGCGAAGAACCTTACCAGGGCTTGACTGTAACAGGTGGTGCATGGCCGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTGAGTCCCGCAACGAGCGCAACCCTCACTGTGTGTTGCCAGCGGCCGGGAACTCACACGGAACTGCCTCGGTAACGAGGAGGAAGGTGAGGATGACGTCAGGTCCGCATGCCCCTTACGCCCTGGGCTGCACACATGCTACAATGGAGAGCAATGGGTCGCAAAACCGCGAGGCCGAGCTAATCCCCCTCTCCCCAATTCGGATGTAGTCTGCAACTCGACTGCAGAAGTCGGAATCGCTAGTAACCGCAAATCAGCAACGTTGCGGTGAATACGTTCTCGGGCCTTGTACACACCGCCCGTCAAGCGATGAGAGTCAGGTGCACCTTAAGTCGCTGCCCAGGGAGGCGCCCACGGTGCGACTG\n", ">1\n", 
"---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GTGCCAGCAGCCGCGGCAAGACGGGGGAGGCGAGCGTTGTCCGGAATGACTGGGCGTAAAGCGCGTGTAGGCGGGATTTTAAGTCAGCGGTGAAATCCCCTCGCTCAACGAGGGAACTGCCGCTGATACTGGGATTCTTGAGGGCAGGAGAGGAGAGCGGAATTGCCGGTGTAGCGGTAAAATGCGTAGAAATCGGCAAGAACACCCGTGGCGAAGGCGGCTCTCTGGACTGCTCCTGACGCTGAGACGCGAAAGCTAGGGTAGCGAACGGGATTAGATACCCCGGTAGTC--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", "\n" ] } ], "prompt_number": 103 }, { "cell_type": "code", "collapsed": false, "input": [ "print global_pairwise_align_nucleotide(str(core_set['41280']).upper(), str(seqs_515['41280'])).to_fasta()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ">0\n", 
"AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCCTAAACTATGCGAGTCGCGCGGGGTTTTATGACAAACCGCAAGGCGAGTTGTAAAACTTAGCGGCGAACGGGTGAGTAACGCGTGAGTAACCTGCCCCCCGGACCGGGATAACTCCGAGAAATCGGGGCTAATACCGGATATGAAGTCCGCCCTCCTGGGCGGGCTTGGAAAGGAGCAATCCACCGAGGGAGGGGCTCGCGTCCTATCAGCTTGTTGGTGGGGTAAAGGCCTACCAAGGCTGCGACGGGTAGCCGGTCTGAGAGGATGGCCGGTCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGTAGCAAGTGGGAATCTTGCACAATGGACGAAAGTCTGATGCAGCGACGCCGCGTGAGGGATGAAGGCTCTCGGGTCGTAAACCTCTGTTGACAGGGAAGAATAACTGACGGTACCTGTCGAGGAAGCCTCGGCTAACTCCGTGCCAGCAGCCGCGGCAAGACGGGGGAGGCGAGCGTTGTCCGGAATGACTGGGCGTAAAGCGCGTGTAGGCGGGATTTTAAGTCAGCGGTGAAATCCCCTCGCTCAACGAGGGAACTGCCGCTGATACTGGGATTCTTGAGGGCAGGAGAGGAGAGCGGAATTGCCGGTGTAGCGGTAAAATGCGTAGAAATCGGCAAGAACACCCGTGGCGAAGGCGGCTCTCTGGACTGCTCCTGACGCTGAGACGCGAAAGCTAGGGTAGCGAACGGGATTAGATACCCCGGTAGTCCTAGCCGTAAACGATGGTCACTAGGTGTCGGAAGATCGACCCCTTCGGCGCCGAAGTTAACGCATTAAGTGACCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGCTTAATTCGACGCAACGCGAAGAACCTTACCAGGGCTTGACTGTTGCAGAATCCTGATGAAAGTCAGGAGTGCTCTTCGGAGAGCTGCAAAACAGGTGGTGCATGGCCGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTGAGTCCCGCAACGAGCGCAACCCTCACTGTGTGTTGCCAGCGGTTCGGCCGGGAACTCACACGGAACTGCCTCGGTAACGAGGAGGAAGGTGAGGATGACGTCAGGTCCGCATGCCCCTTACGCCCTGGGCTGCACACATGCTACAATGGAGAGTACAATGGGTCGCAAAACCGCGAGGCCGAGCTAATCCCCAAAACTCTCCCCAATTCGGATTGTAGTCTGCAACTCGACTGCATGAAGTCGGAATCGCTAGTAACCGCAAATCAGCAACGTTGCGGTGAATACGTTCTCGGGCCTTGTACACACCGCCCGTCAAGCGATGAGAGTCAGGTGCACCTTAAGTCGCTGGCCCAACCCGCAAGGGAGGGAGGCGCCCACGGTGCGACTGATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGCTGGATCACCTCCTT\n", ">1\n", 
"--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GTGCCAGCAGCCGCGGCAAGACGGGGGAGGCGAGCGTTGTCCGGAATGACTGGGCGTAAAGCGCGTGTAGGCGGGATTTTAAGTCAGCGGTGAAATCCCCTCGCTCAACGAGGGAACTGCCGCTGATACTGGGATTCTTGAGGGCAGGAGAGGAGAGCGGAATTGCCGGTGTAGCGGTAAAATGCGTAGAAATCGGCAAGAACACCCGTGGCGAAGGCGGCTCTCTGGACTGCTCCTGACGCTGAGACGCGAAAGCTAGGGTAGCGAACGGGATTAGATACCCCGGTAGTC-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", "\n" ] } ], "prompt_number": 105 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }