{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Note: Please start IPython parallel before running this" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/core/software/conda/lib/python3.5/site-packages/rpy2/robjects/robject.py:6: UserWarning: During startup - \n", " rpy2.rinterface.initr()\n", "/core/software/conda/lib/python3.5/site-packages/rpy2/robjects/robject.py:6: UserWarning: Warning messages:\n", "\n", " rpy2.rinterface.initr()\n", "/core/software/conda/lib/python3.5/site-packages/rpy2/robjects/robject.py:6: UserWarning: 1: Setting LC_CTYPE failed, using \"C\" \n", "\n", " rpy2.rinterface.initr()\n", "/core/software/conda/lib/python3.5/site-packages/rpy2/robjects/robject.py:6: UserWarning: 2: Setting LC_COLLATE failed, using \"C\" \n", "\n", " rpy2.rinterface.initr()\n", "/core/software/conda/lib/python3.5/site-packages/rpy2/robjects/robject.py:6: UserWarning: 3: Setting LC_TIME failed, using \"C\" \n", "\n", " rpy2.rinterface.initr()\n", "/core/software/conda/lib/python3.5/site-packages/rpy2/robjects/robject.py:6: UserWarning: 4: Setting LC_MESSAGES failed, using \"C\" \n", "\n", " rpy2.rinterface.initr()\n", "/core/software/conda/lib/python3.5/site-packages/rpy2/robjects/robject.py:6: UserWarning: 5: Setting LC_MONETARY failed, using \"C\" \n", "\n", " rpy2.rinterface.initr()\n", "/core/software/conda/lib/python3.5/site-packages/rpy2/robjects/robject.py:6: UserWarning: 6: Setting LC_PAPER failed, using \"C\" \n", "\n", " rpy2.rinterface.initr()\n", "/core/software/conda/lib/python3.5/site-packages/rpy2/robjects/robject.py:6: UserWarning: 7: Setting LC_MEASUREMENT failed, using \"C\" \n", "\n", " rpy2.rinterface.initr()\n" ] } ], "source": [ "import numpy as np\n", "import tables\n", "import vcf" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "max_pos = 41956556 # This should not be manually input, but we will indulge\n", "chrom = '3L' # ditto\n", "vcf_fname = '../raw/total-3L.vcf.gz'\n", "block_size = 250000\n", "out_dir = '../raw'\n", "out_hdf5_fname = '../raw/example.h5'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import dask\n", "import dask.multiprocessing\n", "from dask import delayed\n", "#from multiprocessing.pool import ThreadPool\n", "dask.set_options(get=dask.multiprocessing.get)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "@delayed(pure=True)\n", "def get_pos_dp(out_dir, fname, block_size, block):\n", " vcf_file = vcf.Reader(filename=vcf_fname)\n", " start = block * block_size\n", " temp_block_fname = '%s/block_%06d.h5' % (out_dir, block)\n", " store = tables.open_file(temp_block_fname, 'w')\n", " group = store.create_group('/', 'DATA', 'Temporary data')\n", " positions = []\n", " annotations = []\n", " #We are assuming that we have memory for loading\n", " # that should be fine if careful with granularity\n", " for rec in vcf_file.fetch(chrom, start, start + block_size):\n", " #Note that fetch(x, 0, 10) does 1 to 10, compare with Python range\n", " positions.append(rec.POS)\n", " genomic_region = rec.INFO['ANN'][0].split('|')[1]\n", " annotations.append(genomic_region.encode('ascii'))\n", " pos_array = np.array(positions, dtype=np.uint32)\n", " ann_array = np.array(annotations, dtype=np.dtype('a'))\n", " \n", " # annotations could be compressed with enum\n", " store.create_array(group, 'positions', pos_array, 'Positions')\n", " store.create_array(group, 'annotations', ann_array, 'Annotations')\n", " store.close()\n", " return temp_block_fname, len(pos_array)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "class PositionType(tables.IsDescription):\n", " name = tables.StringCol(32)\n", " bit = tables.Int32Col()\n", " \n", "\n", "@delayed(pure=True)\n", "def consolidate_blocks(out_hdf5_name, block_meta):\n", " def get_pos_annotations(str_ann):\n", " ann_pos = str_ann.decode('ascii')\n", " return ann_pos.split('&')\n", " store = tables.open_file(out_hdf5_name, 'w')\n", " group = store.create_group('/', 'DATA', 'All data')\n", "\n", " #lets find all genomic position types and encode them for compression\n", " position_types = set ()\n", " for block_fname, num_snp in block_meta:\n", " block_hdf5 = tables.open_file(block_fname, 'r')\n", " annotations = block_hdf5.root.DATA.annotations.read()\n", " for ann_pos in annotations:\n", " for ann in get_pos_annotations(ann_pos):\n", " position_types.add(ann)\n", " block_hdf5.close()\n", " position_type_table = store.create_table(group, 'position_type', PositionType, 'Key to solve position type')\n", " position_row = position_type_table.row\n", " name_to_bit = {}\n", " for bit_pos, position_type in enumerate(position_types):\n", " #position_row['name'] = position_type\n", " #position_row['bit'] = bit_pos\n", " #position_row.append()\n", " name_to_bit[position_type] = bit_pos\n", " \n", " total_size = sum([size for name, size in block_meta])\n", "\n", " \n", " positions = np.empty(total_size, dtype=np.uint32)\n", " #empty - talk about it...\n", " snp_type = np.empty(total_size, dtype=np.uint32)\n", " curr_pos = 0\n", " for block_fname, num_snp in block_meta:\n", " block_hdf5 = tables.open_file(block_fname, 'r')\n", " for pos, annotations in zip(block_hdf5.root.DATA.positions,\n", " block_hdf5.root.DATA.annotations):\n", " positions[curr_pos] = pos\n", " encode_type = 0\n", " for position_type in get_pos_annotations(annotations):\n", " encode_type += 2**name_to_bit[position_type]\n", " snp_type[curr_pos] = encode_type\n", " curr_pos += 1\n", " \n", " block_hdf5.close()\n", " #remember to delete\n", " store.create_array(group, 'positions', positions, 'SNP positions')\n", " store.create_array(group, 'snp_type', snp_type, 'Type per SNP')\n", " store.close()\n", " return block_meta" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "blocks = range(1 + max_pos // block_size)\n", "block_meta = []\n", "for block in blocks:\n", " block_meta.append(get_pos_dp(out_dir, vcf_fname, block_size, block))\n", "consolidate = consolidate_blocks(out_hdf5_fname, block_meta)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "block_run = consolidate.compute()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#import pickle\n", "#\n", "#pickle_w = open('block_run.pickle', 'wb')\n", "#pickle.dump(block_run, pickle_w)\n", "#pickle_w.close()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('../raw/block_000000.h5', 3437),\n", " ('../raw/block_000001.h5', 3620),\n", " ('../raw/block_000002.h5', 2979),\n", " ('../raw/block_000003.h5', 6952),\n", " ('../raw/block_000004.h5', 3909),\n", " ('../raw/block_000005.h5', 5112),\n", " ('../raw/block_000006.h5', 10220),\n", " ('../raw/block_000007.h5', 15493),\n", " ('../raw/block_000008.h5', 22648),\n", " ('../raw/block_000009.h5', 17521),\n", " ('../raw/block_000010.h5', 17297),\n", " ('../raw/block_000011.h5', 21994),\n", " ('../raw/block_000012.h5', 20922),\n", " ('../raw/block_000013.h5', 22902),\n", " ('../raw/block_000014.h5', 24039),\n", " ('../raw/block_000015.h5', 20740),\n", " ('../raw/block_000016.h5', 16778),\n", " ('../raw/block_000017.h5', 2229),\n", " ('../raw/block_000018.h5', 277),\n", " ('../raw/block_000019.h5', 10580),\n", " ('../raw/block_000020.h5', 20375),\n", " ('../raw/block_000021.h5', 25542),\n", " ('../raw/block_000022.h5', 25291),\n", " ('../raw/block_000023.h5', 24665),\n", " ('../raw/block_000024.h5', 20060),\n", " ('../raw/block_000025.h5', 22722),\n", " ('../raw/block_000026.h5', 31329),\n", " ('../raw/block_000027.h5', 27807),\n", " ('../raw/block_000028.h5', 14588),\n", " ('../raw/block_000029.h5', 8621),\n", " ('../raw/block_000030.h5', 17413),\n", " ('../raw/block_000031.h5', 21618),\n", " ('../raw/block_000032.h5', 25495),\n", " ('../raw/block_000033.h5', 33641),\n", " ('../raw/block_000034.h5', 37743),\n", " ('../raw/block_000035.h5', 29118),\n", " ('../raw/block_000036.h5', 22979),\n", " ('../raw/block_000037.h5', 31023),\n", " ('../raw/block_000038.h5', 27896),\n", " ('../raw/block_000039.h5', 13669),\n", " ('../raw/block_000040.h5', 33845),\n", " ('../raw/block_000041.h5', 38422),\n", " ('../raw/block_000042.h5', 39761),\n", " ('../raw/block_000043.h5', 30427),\n", " ('../raw/block_000044.h5', 27602),\n", " ('../raw/block_000045.h5', 42879),\n", " ('../raw/block_000046.h5', 30371),\n", " ('../raw/block_000047.h5', 57371),\n", " ('../raw/block_000048.h5', 49163),\n", " ('../raw/block_000049.h5', 44595),\n", " ('../raw/block_000050.h5', 61025),\n", " ('../raw/block_000051.h5', 48338),\n", " ('../raw/block_000052.h5', 54069),\n", " ('../raw/block_000053.h5', 64876),\n", " ('../raw/block_000054.h5', 48566),\n", " ('../raw/block_000055.h5', 27345),\n", " ('../raw/block_000056.h5', 79157),\n", " ('../raw/block_000057.h5', 55684),\n", " ('../raw/block_000058.h5', 55021),\n", " ('../raw/block_000059.h5', 69984),\n", " ('../raw/block_000060.h5', 74874),\n", " ('../raw/block_000061.h5', 65049),\n", " ('../raw/block_000062.h5', 37659),\n", " ('../raw/block_000063.h5', 44400),\n", " ('../raw/block_000064.h5', 23033),\n", " ('../raw/block_000065.h5', 68038),\n", " ('../raw/block_000066.h5', 81986),\n", " ('../raw/block_000067.h5', 86033),\n", " ('../raw/block_000068.h5', 79235),\n", " ('../raw/block_000069.h5', 80561),\n", " ('../raw/block_000070.h5', 69861),\n", " ('../raw/block_000071.h5', 46393),\n", " ('../raw/block_000072.h5', 72050),\n", " ('../raw/block_000073.h5', 86802),\n", " ('../raw/block_000074.h5', 92737),\n", " ('../raw/block_000075.h5', 73140),\n", " ('../raw/block_000076.h5', 81622),\n", " ('../raw/block_000077.h5', 73623),\n", " ('../raw/block_000078.h5', 77460),\n", " ('../raw/block_000079.h5', 66289),\n", " ('../raw/block_000080.h5', 71028),\n", " ('../raw/block_000081.h5', 28525),\n", " ('../raw/block_000082.h5', 68376),\n", " ('../raw/block_000083.h5', 58120),\n", " ('../raw/block_000084.h5', 66737),\n", " ('../raw/block_000085.h5', 69134),\n", " ('../raw/block_000086.h5', 62989),\n", " ('../raw/block_000087.h5', 53303),\n", " ('../raw/block_000088.h5', 78125),\n", " ('../raw/block_000089.h5', 76424),\n", " ('../raw/block_000090.h5', 85547),\n", " ('../raw/block_000091.h5', 86834),\n", " ('../raw/block_000092.h5', 51637),\n", " ('../raw/block_000093.h5', 12511),\n", " ('../raw/block_000094.h5', 61426),\n", " ('../raw/block_000095.h5', 48377),\n", " ('../raw/block_000096.h5', 76305),\n", " ('../raw/block_000097.h5', 73718),\n", " ('../raw/block_000098.h5', 35030),\n", " ('../raw/block_000099.h5', 77042),\n", " ('../raw/block_000100.h5', 81523),\n", " ('../raw/block_000101.h5', 59436),\n", " ('../raw/block_000102.h5', 65886),\n", " ('../raw/block_000103.h5', 80629),\n", " ('../raw/block_000104.h5', 85912),\n", " ('../raw/block_000105.h5', 73496),\n", " ('../raw/block_000106.h5', 70577),\n", " ('../raw/block_000107.h5', 44006),\n", " ('../raw/block_000108.h5', 72507),\n", " ('../raw/block_000109.h5', 93785),\n", " ('../raw/block_000110.h5', 89219),\n", " ('../raw/block_000111.h5', 85405),\n", " ('../raw/block_000112.h5', 85752),\n", " ('../raw/block_000113.h5', 69227),\n", " ('../raw/block_000114.h5', 71133),\n", " ('../raw/block_000115.h5', 80717),\n", " ('../raw/block_000116.h5', 71857),\n", " ('../raw/block_000117.h5', 86565),\n", " ('../raw/block_000118.h5', 105853),\n", " ('../raw/block_000119.h5', 97031),\n", " ('../raw/block_000120.h5', 73965),\n", " ('../raw/block_000121.h5', 84168),\n", " ('../raw/block_000122.h5', 85747),\n", " ('../raw/block_000123.h5', 80809),\n", " ('../raw/block_000124.h5', 49153),\n", " ('../raw/block_000125.h5', 87255),\n", " ('../raw/block_000126.h5', 83772),\n", " ('../raw/block_000127.h5', 78790),\n", " ('../raw/block_000128.h5', 68654),\n", " ('../raw/block_000129.h5', 83330),\n", " ('../raw/block_000130.h5', 95104),\n", " ('../raw/block_000131.h5', 67984),\n", " ('../raw/block_000132.h5', 72881),\n", " ('../raw/block_000133.h5', 70998),\n", " ('../raw/block_000134.h5', 79607),\n", " ('../raw/block_000135.h5', 84799),\n", " ('../raw/block_000136.h5', 85629),\n", " ('../raw/block_000137.h5', 95370),\n", " ('../raw/block_000138.h5', 85215),\n", " ('../raw/block_000139.h5', 84650),\n", " ('../raw/block_000140.h5', 81674),\n", " ('../raw/block_000141.h5', 84779),\n", " ('../raw/block_000142.h5', 74864),\n", " ('../raw/block_000143.h5', 80551),\n", " ('../raw/block_000144.h5', 72909),\n", " ('../raw/block_000145.h5', 15920),\n", " ('../raw/block_000146.h5', 63878),\n", " ('../raw/block_000147.h5', 82667),\n", " ('../raw/block_000148.h5', 86574),\n", " ('../raw/block_000149.h5', 46223),\n", " ('../raw/block_000150.h5', 68548),\n", " ('../raw/block_000151.h5', 70550),\n", " ('../raw/block_000152.h5', 65112),\n", " ('../raw/block_000153.h5', 69662),\n", " ('../raw/block_000154.h5', 60591),\n", " ('../raw/block_000155.h5', 92273),\n", " ('../raw/block_000156.h5', 74459),\n", " ('../raw/block_000157.h5', 102706),\n", " ('../raw/block_000158.h5', 78768),\n", " ('../raw/block_000159.h5', 89669),\n", " ('../raw/block_000160.h5', 106171),\n", " ('../raw/block_000161.h5', 113257),\n", " ('../raw/block_000162.h5', 115706),\n", " ('../raw/block_000163.h5', 91708),\n", " ('../raw/block_000164.h5', 70368),\n", " ('../raw/block_000165.h5', 102131),\n", " ('../raw/block_000166.h5', 77305),\n", " ('../raw/block_000167.h5', 23996)]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#import pickle\n", "\n", "#pickle_f = open('block_run.pickle', 'rb')\n", "#block_run = pickle.load(pickle_f)\n", "#pickle_f.close()\n", "#consolidate_blocks(out_hdf5_fname, block_run)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#If we have 20 CPUS, why not 20 processes with equal number of blocks --> SNP density." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }