{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### ipyrad testing for pairddrad data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.1.37\n" ] } ], "source": [ "import ipyrad as ip ## for RADseq assembly\n", "print ip.__version__ ## print version\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## clear existing test_dir/\n", "import shutil\n", "import os\n", "if os.path.exists(\"./test_pairddrad/\"):\n", " shutil.rmtree(\"./test_pairddrad/\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " New Assembly: data1\n" ] } ], "source": [ "data1 = ip.Assembly('data1')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 assembly_name data1 \n", " 1 project_dir . \n", " 2 raw_fastq_path ./*.fastq \n", " 3 barcodes_path ./*.barcodes.txt \n", " 4 sorted_fastq_path \n", " 5 assembly_method denovo \n", " 6 reference_sequence \n", " 7 datatype rad \n", " 8 restriction_overhang ('TGCAG', '') \n", " 9 max_low_qual_bases 5 \n", " 10 phred_Qscore_offset 33 \n", " 11 mindepth_statistical 6 \n", " 12 mindepth_majrule 6 \n", " 13 maxdepth 1000 \n", " 14 clust_threshold 0.85 \n", " 15 max_barcode_mismatch 1 \n", " 16 filter_adapters 0 \n", " 17 filter_min_trim_len 35 \n", " 18 max_alleles_consens 2 \n", " 19 max_Ns_consens (5, 5) \n", " 20 max_Hs_consens (8, 8) \n", " 21 min_samples_locus 4 \n", " 22 max_SNPs_locus (100, 100) \n", " 23 max_Indels_locus (5, 99) \n", " 24 max_shared_Hs_locus 0.25 \n", " 25 edit_cutsites (0, 0) \n", " 26 trim_overhang (1, 2, 2, 1) \n", " 27 output_formats * \n", " 28 pop_assign_file \n", " 29 excludes \n", " 30 outgroups \n" ] } ], "source": [ "data1.get_params()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 assembly_name data1 \n", " 1 project_dir ./test_pairddrad \n", " 2 raw_fastq_path ./data/sim_pairddrad_*.gz \n", " 3 barcodes_path ./data/sim_pairddrad_barcodes.txt \n", " 4 sorted_fastq_path \n", " 5 assembly_method denovo \n", " 6 reference_sequence \n", " 7 datatype pairddrad \n", " 8 restriction_overhang ('TGCAG', 'AATT') \n", " 9 max_low_qual_bases 5 \n", " 10 phred_Qscore_offset 33 \n", " 11 mindepth_statistical 6 \n", " 12 mindepth_majrule 6 \n", " 13 maxdepth 1000 \n", " 14 clust_threshold 0.85 \n", " 15 max_barcode_mismatch 1 \n", " 16 filter_adapters 0 \n", " 17 filter_min_trim_len 35 \n", " 18 max_alleles_consens 2 \n", " 19 max_Ns_consens (5, 5) \n", " 20 max_Hs_consens (8, 8) \n", " 21 min_samples_locus 4 \n", " 22 max_SNPs_locus (100, 100) \n", " 23 max_Indels_locus (5, 99) \n", " 24 max_shared_Hs_locus 0.25 \n", " 25 edit_cutsites (0, 0) \n", " 26 trim_overhang (1, 2, 2, 1) \n", " 27 output_formats * \n", " 28 pop_assign_file \n", " 29 excludes \n", " 30 outgroups \n" ] } ], "source": [ "data1.set_params('project_dir', \"./test_pairddrad\")\n", "data1.set_params('raw_fastq_path', \"./data/sim_pairddrad_*.gz\")\n", "data1.set_params('barcodes_path', \"./data/sim_pairddrad_barcodes.txt\")\n", "data1.set_params('restriction_overhang', (\"TGCAG\", \"AATT\"))\n", "data1.set_params('datatype', 'pairddrad')\n", "#data1.set_params(17, 1)\n", "\n", 
"data1.get_params()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#data1.link_fastqs(path=\"test_pairddrad/test_pairddrad_fastqs/\", append=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Saving current assembly.\n", " state reads_raw\n", "1A0 1 20000\n", "1B0 1 20000\n", "1C0 1 20000\n", "1D0 1 20000\n", "2E0 1 20000\n", "2F0 1 20000\n", "2G0 1 20000\n", "2H0 1 20000\n", "3I0 1 20000\n", "3J0 1 20000\n", "3K0 1 20000\n", "3L0 1 20000\n" ] } ], "source": [ "data1.step1()\n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Saving current assembly.\n", " state reads_raw reads_filtered\n", "1A0 2 20000 20000\n", "1B0 2 20000 20000\n", "1C0 2 20000 20000\n", "1D0 2 20000 20000\n", "2E0 2 20000 20000\n", "2F0 2 20000 20000\n", "2G0 2 20000 20000\n", "2H0 2 20000 20000\n", "3I0 2 20000 20000\n", "3J0 2 20000 20000\n", "3K0 2 20000 20000\n", "3L0 2 20000 20000\n" ] } ], "source": [ "data1.step2()#[\"1B0\", \"2H0\", \"3J0\", \"3K0\"], force=True)\n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Saving current assembly.\n", " state reads_raw reads_filtered clusters_total clusters_hidepth\n", "1A0 3 20000 20000 1000 1000\n", "1B0 3 20000 20000 1000 1000\n", "1C0 3 20000 20000 1000 1000\n", "1D0 3 20000 20000 1000 1000\n", "2E0 3 20000 20000 1000 1000\n", "2F0 3 20000 20000 1000 1000\n", "2G0 3 20000 20000 1000 1000\n", "2H0 3 20000 20000 1000 1000\n", "3I0 3 20000 20000 1000 1000\n", "3J0 3 20000 20000 1000 1000\n", "3K0 3 20000 20000 1000 1000\n", "3L0 3 20000 20000 1000 1000\n" ] } ], "source": [ "# data1.step3() ## do all samples\n", "# data1.step3(\"1A0\") ## do one sample\n", "# data1.step3([\"1A0\", \"1B0\", \"1C0\"]) ## do list of samples\n", "data1.step3()#[\"1B0\", \"2H0\", \"3J0\", \"3K0\"], force=True) \n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " skipping 1B0; already estimated. Use force=True to overwrite.\n", " skipping 2H0; already estimated. Use force=True to overwrite.\n", " skipping 3J0; already estimated. Use force=True to overwrite.\n", " skipping 3K0; already estimated. 
Use force=True to overwrite.\n", " Saving current assembly.\n", " state reads_raw reads_filtered clusters_total clusters_hidepth \\\n", "1A0 4 20000 20000 1000 1000 \n", "1B0 4 20000 20000 1000 1000 \n", "1C0 4 20000 20000 1000 1000 \n", "1D0 4 20000 20000 1000 1000 \n", "2E0 4 20000 20000 1000 1000 \n", "2F0 4 20000 20000 1000 1000 \n", "2G0 4 20000 20000 1000 1000 \n", "2H0 4 20000 20000 1000 1000 \n", "3I0 4 20000 20000 1000 1000 \n", "3J0 4 20000 20000 1000 1000 \n", "3K0 4 20000 20000 1000 1000 \n", "3L0 4 20000 20000 1000 1000 \n", "\n", " hetero_est error_est \n", "1A0 0.001308 0.000488 \n", "1B0 0.001422 0.000494 \n", "1C0 0.001427 0.000482 \n", "1D0 0.001286 0.000497 \n", "2E0 0.001226 0.000491 \n", "2F0 0.001406 0.000479 \n", "2G0 0.001497 0.000506 \n", "2H0 0.001460 0.000502 \n", "3I0 0.001432 0.000504 \n", "3J0 0.001464 0.000483 \n", "3K0 0.001412 0.000502 \n", "3L0 0.001282 0.000470 \n", "CPU times: user 331 ms, sys: 4.28 ms, total: 336 ms\n", "Wall time: 7.07 s\n" ] } ], "source": [ "%%time\n", "data1.step4()#[\"1B0\", \"2H0\", \"3J0\", \"3K0\"], force=True) \n", "print data1.stats\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "DEBUG:ipyrad:H4CKERZ-mode: __loglevel__ = DEBUG\n", "INFO:ipyrad.core.parallel:Local connection to 4 engines [ipyrad-2480]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading Assembly: test_pairddrad [test_pairddrad/test_pairddrad.assembly]\n", "ipyparallel setup: Local connection to 4 engines\n", "\n" ] } ], "source": [ "#data1.save(\"upto2\")\n", "## still figuring out how best to save...\n", "import ipyrad as ip\n", "data1 = ip.load_assembly(\"test_pairddrad/test_pairddrad.assembly\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Saving current assembly.\n", " state reads_raw reads_filtered clusters_total clusters_hidepth \\\n", "1A0 5 20000 20000 1000 1000 \n", "1B0 5 20000 20000 1000 1000 \n", "1C0 5 20000 20000 1000 1000 \n", "1D0 5 20000 20000 1000 1000 \n", "2E0 5 20000 20000 1000 1000 \n", "2F0 5 20000 20000 1000 1000 \n", "2G0 5 20000 20000 1000 1000 \n", "2H0 5 20000 20000 1000 1000 \n", "3I0 5 20000 20000 1000 1000 \n", "3J0 5 20000 20000 1000 1000 \n", "3K0 5 20000 20000 1000 1000 \n", "3L0 5 20000 20000 1000 1000 \n", "\n", " hetero_est error_est reads_consens \n", "1A0 0.001308 0.000488 1000 \n", "1B0 0.001422 0.000494 1000 \n", "1C0 0.001427 0.000482 1000 \n", "1D0 0.001286 0.000497 1000 \n", "2E0 0.001226 0.000491 1000 \n", "2F0 0.001406 0.000479 1000 \n", "2G0 0.001497 0.000506 1000 \n", "2H0 0.001460 0.000502 1000 \n", "3I0 0.001432 0.000504 1000 \n", "3J0 0.001464 0.000483 1000 \n", "3K0 0.001412 0.000502 1000 \n", "3L0 0.001282 0.000470 1000 \n" ] } ], "source": [ "data1.step5()#[\"1B0\"], force=True) \n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "data1.step5([\"1B0\", \"2H0\", \"3J0\", \"3K0\"], force=True) \n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Saving current assembly.\n", " state reads_raw reads_filtered clusters_total clusters_hidepth \\\n", "1A0 6 20000 20000 1000 1000 \n", "1B0 6 20000 20000 1000 1000 \n", "1C0 6 
20000 20000 1000 1000 \n", "1D0 6 20000 20000 1000 1000 \n", "2E0 6 20000 20000 1000 1000 \n", "2F0 6 20000 20000 1000 1000 \n", "2G0 6 20000 20000 1000 1000 \n", "2H0 6 20000 20000 1000 1000 \n", "3I0 6 20000 20000 1000 1000 \n", "3J0 6 20000 20000 1000 1000 \n", "3K0 6 20000 20000 1000 1000 \n", "3L0 6 20000 20000 1000 1000 \n", "\n", " hetero_est error_est reads_consens \n", "1A0 0.001308 0.000488 1000 \n", "1B0 0.001422 0.000494 1000 \n", "1C0 0.001427 0.000482 1000 \n", "1D0 0.001286 0.000497 1000 \n", "2E0 0.001226 0.000491 1000 \n", "2F0 0.001406 0.000479 1000 \n", "2G0 0.001497 0.000506 1000 \n", "2H0 0.001460 0.000502 1000 \n", "3I0 0.001432 0.000504 1000 \n", "3J0 0.001464 0.000483 1000 \n", "3K0 0.001412 0.000502 1000 \n", "3L0 0.001282 0.000470 1000 \n" ] } ], "source": [ "data1.step6()#([\"1B0\", \"2H0\", \"3J0\", \"3K0\"], force=True) \n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "inc ['1B0', '2G0', '1C0', '1A0', '2H0', '2E0', '3L0', '3I0', '1D0', '2F0', '3J0', '3K0']\n", "sidx: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]\n", "finished filtering\n", " Saving current assembly.\n" ] } ], "source": [ "data1.step7()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'loci': '/home/deren/Documents/ipyrad/tests/test_pairddrad/data1_outfiles/data1.loci'}" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data1.outfiles\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [], "source": [ "less $data1.outfiles.loci" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "print pd.read_table('test_pairddrad/test_pairddrad_consens/s5_consens.txt', delim_whitespace=1, header=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "## how to merge Assembly objects\n", "data1.files.edits['1B0']\n", "data1.samples['1B0'].files" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "print data1.stats" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "for i in data1.log:\n", " print i" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "adds = np.ones([10, 4], dtype='int16')\n", "adds.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "np.array([30, 1], dtype='int16')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "longest_reads = 190\n", "nreads = int(1e5)\n", "\n", "## preallocate a (nreads x 200 x 4) base-count array and fill the top of the first slot\n", "arr = np.empty([nreads, 200, 4], dtype='int16')\n", "arr[0][0:adds.shape[0]] = adds\n", "arr[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import glob\n", "import os\n", "import numpy" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "catg = 
numpy.load(\"test_pairddrad/test_pairddrad_consens/1B0.catg\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "catg[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "cats1 = glob.glob(os.path.join(\n", " data1.dirs.consens,\n", " data1.samples['1B0'].name+\"_tmpcats.*\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "cats1.sort(key=lambda x: int(x.split(\".\")[-1]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "catg = numpy.load(cats1[0])\n", "lastg = numpy.load(cats1[-1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "catg[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "lastg.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Making alignment faster" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def alignfast(data, names, seqs):\n", " \"\"\" makes subprocess call to muscle \"\"\"\n", " inputstring = \"\\n\".join(\">\"+i+\"\\n\"+j for i, j in zip(names, seqs))\n", " cmd = \"/bin/echo '\"+inputstring+\"' | \"+data.muscle+\" -quiet -in -\"\n", " piped = subprocess.Popen(cmd, shell=True, \n", " stdin=subprocess.PIPE,\n", " stdout=subprocess.PIPE,\n", " stderr=subprocess.STDOUT,\n", " close_fds=True)\n", " _, fout = piped.stdin, piped.stdout\n", " return fout.read()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def alignfast(data, names, seqs):\n", " \"\"\" makes subprocess call to muscle \"\"\"\n", " inputstring = \"\\n\".join(\">\"+i+\"\\n\"+j for i, j in zip(names, seqs))\n", " cmd = \"/bin/echo '\"+inputstring+\"' | \"+data.muscle+\" -quiet -in -\"\n", " piped = subprocess.Popen(shlex.split(cmd), \n", " stdout=subprocess.PIPE)\n", " return piped.stdout.read()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%timeit\n", "out = alignfast(data1, names, seqs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "print out" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def newmuscle(data, names, seqs):\n", " inputstring = \"\\n\".join(\">\"+i+\"\\n\"+j for i, j in zip(names, seqs))\n", " return subprocess.Popen(data.muscle, \n", " stdin=subprocess.PIPE, \n", " stdout=subprocess.PIPE)\\\n", " .communicate(inputstring)[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%timeit \n", "newmuscle(data1, names, seqs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def alignfast(data, names, seqs):\n", " \"\"\" makes subprocess call to muscle \"\"\"\n", " inputstring = \"\\n\".join(\">\"+i+\"\\n\"+j for i, j in zip(names, seqs))\n", " cmd = \"/bin/echo '\"+inputstring+\"' | \"+data.muscle+\" -quiet -in -\"\n", " piped = subprocess.check_output(cmd, shell=True)\n", " return piped" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], 
"source": [ "out = alignfast(data1, names, seqs)\n", "print out" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%timeit\n", "out = alignfast(data1, names, seqs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "inputstring = \"\\n\".join(i+\"\\n\"+j for i, j in zip(names, seqs))\n", "print inputstring" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pipe = subprocess.Popen(shlex.split(cmd), \n", " stdin=subprocess.PIPE, \n", " stderr=subprocess.PIPE, stdout=subprocess.PIPE)\n", "pipe.stdin.write(\"muscle -h\")\n", "get = pipe.communicate(input=pipe)\n", "get" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def alignfast2(data, names, seqs):\n", " \"\"\" makes subprocess call to muscle \"\"\"\n", " inputstring = \"\\n\".join(i+\"\\n\"+j for i, j in zip(names, seqs))\n", " cmd = \"/bin/echo '\"+inputstring+\"' | \"+data.muscle+\" -quiet -in -\"\n", " \n", " piped = subprocess.Popen(shlex.split(cmd), \n", " stdin=subprocess.PIPE, \n", " stdout=subprocess.PIPE,\n", " stderr=subprocess.PIPE)\n", " stdin, stdout = piped.communicate(input=cmd)\n", " return stdin, stdout" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def alignfast3(data, names, seqs):\n", " \"\"\" makes subprocess call to muscle \"\"\"\n", " inputstring = \"\\n\".join(i+\"\\n\"+j for i, j in zip(names, seqs))\n", " cmd = \"/bin/echo '\"+inputstring+\"' | \"+data.muscle+\" -quiet -in -\"\n", " \n", " piped = subprocess.check_output(cmd,\n", " shell=True)\n", " return piped" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def sortalign(stringnames):\n", " \"\"\" parses muscle output from a string to two lists \"\"\"\n", " objs = stringnames.split(\"\\n>\")\n", " seqs = [i.split(\"\\n\")[0].replace(\">\", \"\")+\"\\n\"+\\\n", " \"\".join(i.split('\\n')[1:]) for i in objs]\n", " \n", " aligned = [i.split(\"\\n\") for i in seqs]\n", " newnames = [\">\"+i[0] for i in aligned]\n", " seqs = [i[1] for i in aligned] \n", " ## return in sorted order by names\n", " sortedtups = [(i, j) for i, j in zip(*sorted(zip(newnames, seqs), \n", " key=lambda pair: pair[0]))]\n", " return sortedtups\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### get clusters" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import gzip\n", "import subprocess\n", "import shlex\n", "infile = gzip.open(\"test_pairddrad/test_pairddrad_clust_0.85/1B0.clust.gz\")\n", "clusts = infile.read().split(\"//\\n//\\n\")[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "for clust in clusts:\n", " lines = clust.split(\"\\n\")\n", " names = lines[::2]\n", " seqs = lines[1::2]\n", " \n", "seqs[0] = list(seqs[0])\n", "seqs[0].insert(7, \"AA\")\n", "seqs[0] = \"\".join(seqs[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%timeit\n", "alignfast(data1, names, seqs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%timeit \n", "alignfast3(data1, names, seqs)" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "out = alignfast3(data1, names, seqs)\n", "#sorts = sortalign(out)\n", "print out" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "out" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def sortalign(stringnames):\n", " \"\"\" parses muscle output from a string to two lists \"\"\"\n", " objs = stringnames[1:].split(\"\\n>\")\n", " seqs = [i.split(\"\\n\")[0].replace(\">\", \"\")+\"\\n\"+\\\n", " \"\".join(i.split('\\n')[1:]) for i in objs]\n", " \n", " aligned = [i.split(\"\\n\") for i in seqs]\n", " newnames = [\">\"+i[0] for i in aligned]\n", " seqs = [i[1] for i in aligned] \n", " \n", " ## return in sorted order by names\n", " sortedtups = [(i, j) for i, j in zip(*sorted(zip(newnames, seqs), \n", " key=lambda pair: pair[0]))]\n", " return sortedtups\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def parsemuscle(out):\n", " \"\"\" parse muscle string output into two sorted lists \"\"\"\n", " lines = out[1:].split(\"\\n>\")\n", " names = [line.split(\"\\n\", 1)[0] for line in lines]\n", " seqs = [line.split(\"\\n\", 1)[1].replace(\"\\n\", \"\") for line in lines]\n", " tups = zip(names, seqs)\n", " anames, aseqs = zip(*sorted(tups, key=lambda x: int(x[0].split(\";\")[-1][1:])))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }