{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from __future__ import print_function, division\n", "\n", "from collections import defaultdict\n", "import os\n", "import pickle\n", "\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data download" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2015-06-26 14:52:43-- http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2\n", "Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113\n", "Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 10590722 (10M) [application/x-bzip2]\n", "Saving to: ‘hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2.1’\n", "\n", "hapmap3_r2_b36_fwd. 100%[=====================>] 10.10M 3.74MB/s in 2.7s \n", "\n", "2015-06-26 14:52:46 (3.74 MB/s) - ‘hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2.1’ saved [10590722/10590722]\n", "\n", "--2015-06-26 14:52:46-- http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2\n", "Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113\n", "Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 757962593 (723M) [application/x-bzip2]\n", "Saving to: ‘hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2.1’\n", "\n", "hapmap3_r2_b36_fwd. 100%[=====================>] 722.85M 8.51MB/s in 3m 58s \n", "\n", "2015-06-26 14:56:44 (3.03 MB/s) - ‘hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2.1’ saved [757962593/757962593]\n", "\n", "--2015-06-26 14:56:45-- http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/relationships_w_pops_121708.txt\n", "Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113\n", "Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 36765 (36K) [text/plain]\n", "Saving to: ‘relationships_w_pops_121708.txt.4’\n", "\n", "relationships_w_pop 100%[=====================>] 35.90K 174KB/s in 0.2s \n", "\n", "2015-06-26 14:56:45 (174 KB/s) - ‘relationships_w_pops_121708.txt.4’ saved [36765/36765]\n", "\n" ] } ], "source": [ "!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2\n", "!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2\n", "\n", "!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/relationships_w_pops_121708.txt" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bunzip2: Output file hapmap3_r2_b36_fwd.consensus.qc.poly.map already exists.\n", "bunzip2: Output file hapmap3_r2_b36_fwd.consensus.qc.poly.ped already exists.\n" ] } ], "source": [ "!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2\n", "!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data preparation" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "tsi = open('tsi.ind', 'w')\n", "f = open('relationships_w_pops_121708.txt')\n", "\n", "for l in f:\n", " toks = l.rstrip().split('\\t')\n", " fam_id = toks[0]\n", " ind_id = toks[1]\n", " mom = toks[2]\n", " dad = toks[3]\n", " pop = toks[-1]\n", " if pop != 'TSI' or mom != '0' or dad != '0':\n", " continue\n", " tsi.write('%s\\t%s\\n' % (fam_id, ind_id))\n", "f.close()\n", "tsi.close()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.system('plink --file hapmap3_r2_b36_fwd.consensus.qc.poly --maf 0.001 --keep tsi.ind --make-bed --out tsi')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "max_chro_pos = defaultdict(int)\n", "f = open('tsi.bim')\n", "for l in f:\n", " toks = l.rstrip().split('\\t')\n", " chrom = int(toks[0])\n", " if chrom > 22:\n", " continue\n", " pos = int(toks[3])\n", " if pos > max_chro_pos[chrom]:\n", " max_chro_pos[chrom] = pos\n", "f.close()\n", "w = open('max_chro_pos.pickle', 'w')\n", "pickle.dump(max_chro_pos, w)\n", "w.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sequencial run" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#os.system('plink --bfile tsi --freq --out tsi')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "window_size = 2000000" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n", "Wall time: 1.68 s\n" ] }, { "data": { "text/plain": [ "0" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%time os.system('plink --bfile tsi --freq --out tsi')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def traverse_genome(traverse_fun, state=None):\n", " if state is None:\n", " state = {}\n", " for chrom, max_pos in max_chro_pos.items():\n", " num_bin = (max_pos + 1) // window_size\n", " for my_bin in range(num_bin):\n", " start_pos = my_bin * window_size + 1 # 1-start\n", " end_pos = start_pos + window_size\n", " traverse_fun(state, chrom, start_pos, end_pos)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 192 ms, sys: 1.06 s, total: 1.25 s\n", "Wall time: 4min 47s\n" ] } ], "source": [ "def compute_MAF(state, chrom, start_pos, end_pos):\n", " os.system('plink --bfile tsi --freq --out tsi-%d-%d --chr %d --from-bp %d --to-bp %d' %\n", " (chrom, start_pos, chrom, start_pos, end_pos))\n", " os.remove('tsi-%d-%d.log' % (chrom, start_pos))\n", " \n", "%time traverse_genome(compute_MAF)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def gather_statistics(state, chrom, start_pos, end_pos):\n", " try:\n", " f = open('tsi-%d-%d.frq' % (chrom, start_pos))\n", " except:\n", " # empty block\n", " state['block_mafs'][(chrom, start_pos)] = []\n", " return\n", " f.readline()\n", " for cnt, l in enumerate(f):\n", " toks = [tok for tok in l.rstrip().split(' ') if tok != '']\n", " maf = float(toks[-2])\n", " state['snp_cnt'] += 1\n", " state['sum_maf'] += maf\n", " state['block_mafs'][(chrom, start_pos)].append(maf)\n", " f.close()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "stats = {'snp_cnt': 0, 'sum_maf': 0.0, 'block_mafs': defaultdict(list)}\n", "traverse_genome(gather_statistics, state=stats)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1222126 0.23159641651\n" ] } ], "source": [ "print(stats['snp_cnt'], stats['sum_maf'] / stats['snp_cnt'])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "all_mafs = []\n", "for mafs in stats['block_mafs'].values():\n", " all_mafs.extend(mafs)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 84 ms, sys: 4 ms, total: 88 ms\n", "Wall time: 132 ms\n" ] }, { "data": { "text/plain": [ "0.22159999999999999" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%time np.median(all_mafs)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.2216\n" ] } ], "source": [ "all_mafs.sort()\n", "middle = len(all_mafs) // 2\n", "#array of even size\n", "print((all_mafs[middle] + all_mafs[middle + 1]) / 2)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['block_mafs', 'snp_cnt', 'sum_maf']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats.keys()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import functools\n", "\n", "def collect_mafs(state, chrom, start_pos, end_pos, block_mafs):\n", " state[chrom] += len(block_mafs[(chrom, start_pos)])\n", "\n", "chrom_cnts = defaultdict(int)\n", "traverse_genome(functools.partial(collect_mafs, block_mafs=stats['block_mafs']),\n", " state=chrom_cnts)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1\t100780\n", " 2\t102377\n", " 3\t 84921\n", " 4\t 75819\n", " 5\t 78013\n", " 6\t 82169\n", " 7\t 67218\n", " 8\t 66803\n", " 9\t 56921\n", "10\t 65166\n", "11\t 62182\n", "12\t 60381\n", "13\t 46354\n", "14\t 40525\n", "15\t 37217\n", "16\t 38443\n", "17\t 33175\n", "18\t 36144\n", "19\t 21749\n", "20\t 31872\n", "21\t 16978\n", "22\t 16919\n" ] } ], "source": [ "for chrom in range(1, 23):\n", " print('%2d\\t%6d' % (chrom, chrom_cnts[chrom]))\n", "#block printing on the next chapter" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }