{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Or... A speed test" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "block_size = 50000\n", "hdf5_fname = '../raw/total-3L.h5'\n", "vcf_fname = '../raw/total-3L.vcf.gz'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import gzip\n", "\n", "import dask.array as da\n", "import numpy as np\n", "import tables\n", "\n", "import vcf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Sequential HDF5 vs Sequential VCF" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9643193 41956556\n" ] } ], "source": [ "#Preamble to get the maximum position\n", "store = tables.open_file(hdf5_fname, 'r')\n", "pos_array = store.get_node('/3L/variants/POS').iterrows()\n", "\n", "num_snps = pos_array.nrows\n", "max_pos = pos_array.read(num_snps - 1)[0] # [-1] notation not yet supported\n", "store.close()\n", "print(num_snps, max_pos)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#1-index vs 0-index\n", "def compute_bin_index(position):\n", " return (position - 1) // block_size" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## HDF5" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def compute_snps_per_bin_hdf5(in_memory=False):\n", " # lets save memory, not a lot in this case, but an example\n", " count_snps = np.zeros(1 + max_pos // block_size, dtype=np.uint16 if block_size < 65536 else np.uint32)\n", " store = tables.open_file(hdf5_fname, 'r')\n", " if in_memory:\n", " pos_iter = store.get_node('/3L/variants/POS').read()\n", " else:\n", " pos_iter = store.get_node('/3L/variants/POS').iterrows()\n", " for pos in pos_iter:\n", " count_snps[compute_bin_index(pos)] += 1\n", " store.close()\n", " return count_snps" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 35.8 s, sys: 1.86 s, total: 37.6 s\n", "Wall time: 37.7 s\n" ] }, { "data": { "text/plain": [ "array([ 44, 922, 947, 1508, 16, 367, 1235, 1570, 193,\n", " 255, 471, 0, 608, 844, 1056, 1619, 1869, 2161,\n", " 513, 790, 746, 122, 1363, 560, 1118, 2746, 964,\n", " 606, 0, 796, 556, 2965, 3118, 1341, 2240, 2875,\n", " 2028, 3171, 2948, 4471, 5361, 4225, 4150, 4391, 4521,\n", " 4125, 1693, 3975, 3986, 3742, 2866, 3822, 4708, 3372,\n", " 2529, 3106, 4640, 4710, 6672, 2866, 3729, 4116, 3987,\n", " 4380, 4710, 3915, 4001, 4106, 5719, 5161, 5011, 5496,\n", " 4526, 4518, 4488, 3837, 4584, 4038, 5364, 2917, 3237,\n", " 3837, 3691, 2930, 3083, 917, 368, 156, 705, 83,\n", " 2, 112, 126, 0, 37, 223, 3992, 3508, 2452,\n", " 405, 3070, 3722, 4343, 5154, 4086, 3949, 5600, 5617,\n", " 5373, 5003, 4531, 3858, 5463, 5895, 5544, 4980, 5603,\n", " 3133, 4859, 6090, 4047, 5047, 3224, 3732, 4010, 6464,\n", " 3667, 4161, 5077, 3353, 4211, 6107, 5768, 7598, 7645,\n", " 6855, 5855, 3962, 6124, 5011, 4262, 5818, 1912, 2457,\n", " 139, 75, 2721, 2273, 802, 2750, 2063, 3023, 7881,\n", " 4446, 0, 718, 4897, 6491, 5247, 4265, 5307, 6874,\n", " 2528, 3260, 7526, 5011, 8233, 5258, 6112, 9027, 10422,\n", " 6377, 7475, 6492, 6977, 6474, 7203, 5535, 4862, 5044,\n", " 7313, 4583, 5018, 5578, 487, 3944, 8671, 9137, 1323,\n", " 7948, 3542, 2495, 7390, 6856, 7613, 5310, 5593, 2672,\n", " 94, 0, 6377, 9638, 1869, 5084, 10877, 5454, 5577,\n", " 7232, 9003, 11156, 8447, 7614, 8652, 7402, 7646, 8516,\n", " 4900, 5634, 4821, 6556, 6860, 8818, 8058, 613, 3253,\n", " 5546, 10322, 11512, 8572, 6927, 5496, 2345, 3954, 9128,\n", " 9448, 12448, 15173, 9542, 9089, 11119, 8993, 8994, 9812,\n", " 10215, 11149, 6595, 8018, 8130, 10845, 11007, 15774, 14550,\n", " 7671, 10244, 12786, 11360, 9341, 12174, 5104, 10359, 11389,\n", " 12209, 9287, 9675, 11509, 11266, 14995, 11674, 13059, 13882,\n", " 9820, 10316, 12637, 13539, 2254, 5762, 10234, 468, 1052,\n", " 9829, 14275, 14710, 14613, 18258, 17301, 13921, 11845, 12823,\n", " 8641, 8454, 7292, 8200, 7988, 15067, 16474, 15719, 7810,\n", " 17140, 14419, 14896, 18316, 12084, 16779, 12500, 15195, 8956,\n", " 11441, 17814, 18211, 8627, 11326, 2006, 3239, 12783, 8305,\n", " 11581, 13666, 11973, 2310, 4870, 8834, 5568, 3455, 518,\n", " 4658, 8508, 14035, 16461, 16358, 12676, 13480, 14489, 19479,\n", " 16009, 18529, 16847, 17870, 16099, 17109, 18108, 16293, 16167,\n", " 16217, 13641, 16917, 20694, 15810, 16748, 13495, 13814, 13936,\n", " 16500, 13654, 13691, 12080, 14253, 12127, 9302, 5118, 5593,\n", " 18194, 15659, 12246, 12029, 13922, 13134, 16186, 18298, 20525,\n", " 18659, 21030, 18924, 17696, 16517, 18570, 14149, 18232, 13296,\n", " 15713, 11750, 18306, 16713, 12147, 17511, 16945, 12972, 13601,\n", " 17045, 15020, 14985, 15522, 14177, 15084, 14432, 18245, 17672,\n", " 14977, 11461, 9786, 12393, 13718, 10137, 13898, 17502, 15773,\n", " 3541, 17, 83, 14821, 10063, 13753, 12751, 14524, 13997,\n", " 13351, 13838, 12543, 5876, 13796, 12067, 12039, 9658, 10416,\n", " 17690, 16934, 14585, 19616, 12132, 10731, 12070, 12407, 13273,\n", " 12940, 10325, 14044, 17366, 11488, 159, 12850, 11440, 14626,\n", " 17196, 13620, 18362, 14321, 16261, 19155, 16360, 10809, 13839,\n", " 19611, 19180, 17469, 18089, 11198, 14285, 17635, 20275, 18957,\n", " 15682, 13970, 8300, 11441, 9392, 8534, 3784, 0, 0,\n", " 2897, 5830, 3534, 12097, 13585, 16870, 15340, 13788, 10533,\n", " 4649, 6595, 12812, 15825, 15143, 18161, 12345, 14831, 20867,\n", " 15622, 15926, 9297, 12006, 827, 12112, 4436, 5412, 12243,\n", " 13814, 19302, 13905, 14789, 15232, 16068, 16951, 14979, 16634,\n", " 16891, 14953, 9272, 16003, 10372, 8836, 4931, 19684, 14789,\n", " 11601, 14881, 12228, 14918, 18051, 19081, 16351, 16290, 19718,\n", " 18345, 17682, 13877, 12350, 15571, 19650, 13450, 12475, 13394,\n", " 10194, 10142, 17092, 19755, 11751, 12403, 7420, 2170, 10262,\n", " 12772, 13331, 16900, 16210, 13294, 14439, 20928, 21125, 19971,\n", " 17322, 20123, 23387, 17079, 13429, 15201, 16550, 20723, 13883,\n", " 18487, 15762, 16634, 18440, 12863, 19423, 18392, 17595, 14690,\n", " 15024, 14074, 7844, 15101, 15403, 10952, 12973, 16704, 12864,\n", " 13769, 16925, 18148, 19011, 16384, 6266, 17466, 14969, 16772,\n", " 12345, 17831, 17754, 19683, 18952, 22256, 22189, 19185, 18692,\n", " 23531, 22969, 20267, 20132, 16693, 16970, 13396, 13317, 14341,\n", " 15761, 17150, 20039, 14563, 19437, 16707, 13422, 16409, 16343,\n", " 15070, 17827, 20098, 14659, 14133, 19803, 17216, 14998, 9952,\n", " 13632, 6683, 9092, 9794, 15253, 20426, 19026, 19931, 12619,\n", " 15700, 19188, 19448, 14252, 15184, 14658, 12583, 16365, 19098,\n", " 16086, 12608, 12744, 12112, 15531, 15659, 16561, 15900, 18745,\n", " 15044, 17080, 16244, 21544, 13571, 20607, 23138, 17525, 9832,\n", " 13346, 14747, 12534, 15450, 15450, 10981, 14293, 16707, 11216,\n", " 12352, 14434, 16374, 16622, 17817, 13938, 17086, 16703, 14063,\n", " 17986, 15860, 15884, 16079, 18990, 22215, 11866, 12072, 21262,\n", " 18214, 18930, 19353, 18313, 18534, 20240, 19587, 17230, 14985,\n", " 14481, 18932, 19152, 15008, 14858, 18255, 17377, 18675, 19624,\n", " 15350, 13101, 14924, 14745, 18368, 17607, 18308, 15751, 16676,\n", " 17483, 12450, 13574, 14681, 20729, 8802, 11164, 19105, 20751,\n", " 14946, 14812, 16508, 13045, 13598, 1662, 9985, 1732, 2541,\n", " 0, 5170, 13475, 20586, 6070, 18577, 18802, 11117, 17653,\n", " 17283, 17812, 16100, 21801, 20480, 11822, 16371, 4693, 3153,\n", " 10670, 18295, 9412, 10875, 8988, 13147, 17466, 18072, 14239,\n", " 18637, 11022, 12864, 13788, 13225, 12474, 16018, 11694, 11701,\n", " 14004, 8149, 17778, 18699, 11032, 17928, 18699, 9506, 1009,\n", " 13449, 19832, 17837, 21597, 17062, 15945, 17439, 10506, 12991,\n", " 16155, 17368, 19337, 19747, 21282, 19063, 23277, 19012, 17843,\n", " 12667, 13727, 15519, 16703, 13610, 19547, 21413, 18396, 21770,\n", " 20833, 16803, 23080, 23685, 23059, 24809, 15926, 23384, 26079,\n", " 24982, 23776, 24136, 23378, 19434, 15781, 17058, 15817, 21908,\n", " 21144, 11910, 5914, 13589, 18370, 20585, 22493, 20552, 18456,\n", " 20849, 19781, 16845, 17556, 17573, 16246, 9085, 9751, 5038,\n", " 4399, 4222, 586], dtype=uint16)" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#We can do in-memory load\n", "%time compute_snps_per_bin_hdf5(True)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 49.3 s, sys: 24 ms, total: 49.3 s\n", "Wall time: 49.3 s\n" ] }, { "data": { "text/plain": [ "array([ 44, 922, 947, 1508, 16, 367, 1235, 1570, 193,\n", " 255, 471, 0, 608, 844, 1056, 1619, 1869, 2161,\n", " 513, 790, 746, 122, 1363, 560, 1118, 2746, 964,\n", " 606, 0, 796, 556, 2965, 3118, 1341, 2240, 2875,\n", " 2028, 3171, 2948, 4471, 5361, 4225, 4150, 4391, 4521,\n", " 4125, 1693, 3975, 3986, 3742, 2866, 3822, 4708, 3372,\n", " 2529, 3106, 4640, 4710, 6672, 2866, 3729, 4116, 3987,\n", " 4380, 4710, 3915, 4001, 4106, 5719, 5161, 5011, 5496,\n", " 4526, 4518, 4488, 3837, 4584, 4038, 5364, 2917, 3237,\n", " 3837, 3691, 2930, 3083, 917, 368, 156, 705, 83,\n", " 2, 112, 126, 0, 37, 223, 3992, 3508, 2452,\n", " 405, 3070, 3722, 4343, 5154, 4086, 3949, 5600, 5617,\n", " 5373, 5003, 4531, 3858, 5463, 5895, 5544, 4980, 5603,\n", " 3133, 4859, 6090, 4047, 5047, 3224, 3732, 4010, 6464,\n", " 3667, 4161, 5077, 3353, 4211, 6107, 5768, 7598, 7645,\n", " 6855, 5855, 3962, 6124, 5011, 4262, 5818, 1912, 2457,\n", " 139, 75, 2721, 2273, 802, 2750, 2063, 3023, 7881,\n", " 4446, 0, 718, 4897, 6491, 5247, 4265, 5307, 6874,\n", " 2528, 3260, 7526, 5011, 8233, 5258, 6112, 9027, 10422,\n", " 6377, 7475, 6492, 6977, 6474, 7203, 5535, 4862, 5044,\n", " 7313, 4583, 5018, 5578, 487, 3944, 8671, 9137, 1323,\n", " 7948, 3542, 2495, 7390, 6856, 7613, 5310, 5593, 2672,\n", " 94, 0, 6377, 9638, 1869, 5084, 10877, 5454, 5577,\n", " 7232, 9003, 11156, 8447, 7614, 8652, 7402, 7646, 8516,\n", " 4900, 5634, 4821, 6556, 6860, 8818, 8058, 613, 3253,\n", " 5546, 10322, 11512, 8572, 6927, 5496, 2345, 3954, 9128,\n", " 9448, 12448, 15173, 9542, 9089, 11119, 8993, 8994, 9812,\n", " 10215, 11149, 6595, 8018, 8130, 10845, 11007, 15774, 14550,\n", " 7671, 10244, 12786, 11360, 9341, 12174, 5104, 10359, 11389,\n", " 12209, 9287, 9675, 11509, 11266, 14995, 11674, 13059, 13882,\n", " 9820, 10316, 12637, 13539, 2254, 5762, 10234, 468, 1052,\n", " 9829, 14275, 14710, 14613, 18258, 17301, 13921, 11845, 12823,\n", " 8641, 8454, 7292, 8200, 7988, 15067, 16474, 15719, 7810,\n", " 17140, 14419, 14896, 18316, 12084, 16779, 12500, 15195, 8956,\n", " 11441, 17814, 18211, 8627, 11326, 2006, 3239, 12783, 8305,\n", " 11581, 13666, 11973, 2310, 4870, 8834, 5568, 3455, 518,\n", " 4658, 8508, 14035, 16461, 16358, 12676, 13480, 14489, 19479,\n", " 16009, 18529, 16847, 17870, 16099, 17109, 18108, 16293, 16167,\n", " 16217, 13641, 16917, 20694, 15810, 16748, 13495, 13814, 13936,\n", " 16500, 13654, 13691, 12080, 14253, 12127, 9302, 5118, 5593,\n", " 18194, 15659, 12246, 12029, 13922, 13134, 16186, 18298, 20525,\n", " 18659, 21030, 18924, 17696, 16517, 18570, 14149, 18232, 13296,\n", " 15713, 11750, 18306, 16713, 12147, 17511, 16945, 12972, 13601,\n", " 17045, 15020, 14985, 15522, 14177, 15084, 14432, 18245, 17672,\n", " 14977, 11461, 9786, 12393, 13718, 10137, 13898, 17502, 15773,\n", " 3541, 17, 83, 14821, 10063, 13753, 12751, 14524, 13997,\n", " 13351, 13838, 12543, 5876, 13796, 12067, 12039, 9658, 10416,\n", " 17690, 16934, 14585, 19616, 12132, 10731, 12070, 12407, 13273,\n", " 12940, 10325, 14044, 17366, 11488, 159, 12850, 11440, 14626,\n", " 17196, 13620, 18362, 14321, 16261, 19155, 16360, 10809, 13839,\n", " 19611, 19180, 17469, 18089, 11198, 14285, 17635, 20275, 18957,\n", " 15682, 13970, 8300, 11441, 9392, 8534, 3784, 0, 0,\n", " 2897, 5830, 3534, 12097, 13585, 16870, 15340, 13788, 10533,\n", " 4649, 6595, 12812, 15825, 15143, 18161, 12345, 14831, 20867,\n", " 15622, 15926, 9297, 12006, 827, 12112, 4436, 5412, 12243,\n", " 13814, 19302, 13905, 14789, 15232, 16068, 16951, 14979, 16634,\n", " 16891, 14953, 9272, 16003, 10372, 8836, 4931, 19684, 14789,\n", " 11601, 14881, 12228, 14918, 18051, 19081, 16351, 16290, 19718,\n", " 18345, 17682, 13877, 12350, 15571, 19650, 13450, 12475, 13394,\n", " 10194, 10142, 17092, 19755, 11751, 12403, 7420, 2170, 10262,\n", " 12772, 13331, 16900, 16210, 13294, 14439, 20928, 21125, 19971,\n", " 17322, 20123, 23387, 17079, 13429, 15201, 16550, 20723, 13883,\n", " 18487, 15762, 16634, 18440, 12863, 19423, 18392, 17595, 14690,\n", " 15024, 14074, 7844, 15101, 15403, 10952, 12973, 16704, 12864,\n", " 13769, 16925, 18148, 19011, 16384, 6266, 17466, 14969, 16772,\n", " 12345, 17831, 17754, 19683, 18952, 22256, 22189, 19185, 18692,\n", " 23531, 22969, 20267, 20132, 16693, 16970, 13396, 13317, 14341,\n", " 15761, 17150, 20039, 14563, 19437, 16707, 13422, 16409, 16343,\n", " 15070, 17827, 20098, 14659, 14133, 19803, 17216, 14998, 9952,\n", " 13632, 6683, 9092, 9794, 15253, 20426, 19026, 19931, 12619,\n", " 15700, 19188, 19448, 14252, 15184, 14658, 12583, 16365, 19098,\n", " 16086, 12608, 12744, 12112, 15531, 15659, 16561, 15900, 18745,\n", " 15044, 17080, 16244, 21544, 13571, 20607, 23138, 17525, 9832,\n", " 13346, 14747, 12534, 15450, 15450, 10981, 14293, 16707, 11216,\n", " 12352, 14434, 16374, 16622, 17817, 13938, 17086, 16703, 14063,\n", " 17986, 15860, 15884, 16079, 18990, 22215, 11866, 12072, 21262,\n", " 18214, 18930, 19353, 18313, 18534, 20240, 19587, 17230, 14985,\n", " 14481, 18932, 19152, 15008, 14858, 18255, 17377, 18675, 19624,\n", " 15350, 13101, 14924, 14745, 18368, 17607, 18308, 15751, 16676,\n", " 17483, 12450, 13574, 14681, 20729, 8802, 11164, 19105, 20751,\n", " 14946, 14812, 16508, 13045, 13598, 1662, 9985, 1732, 2541,\n", " 0, 5170, 13475, 20586, 6070, 18577, 18802, 11117, 17653,\n", " 17283, 17812, 16100, 21801, 20480, 11822, 16371, 4693, 3153,\n", " 10670, 18295, 9412, 10875, 8988, 13147, 17466, 18072, 14239,\n", " 18637, 11022, 12864, 13788, 13225, 12474, 16018, 11694, 11701,\n", " 14004, 8149, 17778, 18699, 11032, 17928, 18699, 9506, 1009,\n", " 13449, 19832, 17837, 21597, 17062, 15945, 17439, 10506, 12991,\n", " 16155, 17368, 19337, 19747, 21282, 19063, 23277, 19012, 17843,\n", " 12667, 13727, 15519, 16703, 13610, 19547, 21413, 18396, 21770,\n", " 20833, 16803, 23080, 23685, 23059, 24809, 15926, 23384, 26079,\n", " 24982, 23776, 24136, 23378, 19434, 15781, 17058, 15817, 21908,\n", " 21144, 11910, 5914, 13589, 18370, 20585, 22493, 20552, 18456,\n", " 20849, 19781, 16845, 17556, 17573, 16246, 9085, 9751, 5038,\n", " 4399, 4222, 586], dtype=uint16)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%time compute_snps_per_bin_hdf5(False)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def compute_snps_per_bin_vcf():\n", " # lets save memory, not a lot in this case, but an example\n", " count_snps = np.zeros(1 + max_pos // block_size, dtype=np.uint16 if block_size < 65536 else np.uint32)\n", " vcf_file = vcf.Reader(filename=vcf_fname)\n", " for rec in vcf_file:\n", " count_snps[compute_bin_index(rec.POS)] += 1\n", " store.close()\n", " return count_snps" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#This will take days!\n", "#%time compute_snps_per_bin_vcf()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#TODO: Do partial function application of the above" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Researching speed problems" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def read_lines(name, num):\n", " with gzip.open(name) as f:\n", " for i, l in enumerate(f):\n", " if i == num:\n", " break\n", "\n", "def parse_lines(name, num):\n", " f = vcf.Reader(filename=name)\n", " for i, rec in enumerate(f):\n", " if i == num:\n", " break\n", "\n", "num_lines = 10000" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.92 s, sys: 16 ms, total: 1.94 s\n", "Wall time: 1.94 s\n" ] } ], "source": [ "%time read_lines(vcf_fname, num_lines)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3min 28s, sys: 1.07 s, total: 3min 29s\n", "Wall time: 3min 30s\n" ] } ], "source": [ "%time parse_lines(vcf_fname, num_lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Converting with vcfnp" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }