{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function, division\n",
    "\n",
    "from collections import defaultdict\n",
    "import os\n",
    "import pickle\n",
    "\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2015-06-26 14:52:43--  http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2\n",
      "Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113\n",
      "Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 10590722 (10M) [application/x-bzip2]\n",
      "Saving to: ‘hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2.1’\n",
      "\n",
      "hapmap3_r2_b36_fwd. 100%[=====================>]  10.10M  3.74MB/s   in 2.7s   \n",
      "\n",
      "2015-06-26 14:52:46 (3.74 MB/s) - ‘hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2.1’ saved [10590722/10590722]\n",
      "\n",
      "--2015-06-26 14:52:46--  http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2\n",
      "Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113\n",
      "Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 757962593 (723M) [application/x-bzip2]\n",
      "Saving to: ‘hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2.1’\n",
      "\n",
      "hapmap3_r2_b36_fwd. 100%[=====================>] 722.85M  8.51MB/s   in 3m 58s \n",
      "\n",
      "2015-06-26 14:56:44 (3.03 MB/s) - ‘hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2.1’ saved [757962593/757962593]\n",
      "\n",
      "--2015-06-26 14:56:45--  http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/relationships_w_pops_121708.txt\n",
      "Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113\n",
      "Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 36765 (36K) [text/plain]\n",
      "Saving to: ‘relationships_w_pops_121708.txt.4’\n",
      "\n",
      "relationships_w_pop 100%[=====================>]  35.90K   174KB/s   in 0.2s   \n",
      "\n",
      "2015-06-26 14:56:45 (174 KB/s) - ‘relationships_w_pops_121708.txt.4’ saved [36765/36765]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2\n",
    "!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2\n",
    "\n",
    "!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/relationships_w_pops_121708.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bunzip2: Output file hapmap3_r2_b36_fwd.consensus.qc.poly.map already exists.\n",
      "bunzip2: Output file hapmap3_r2_b36_fwd.consensus.qc.poly.ped already exists.\n"
     ]
    }
   ],
   "source": [
    "!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2\n",
    "!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tsi = open('tsi.ind', 'w')\n",
    "f = open('relationships_w_pops_121708.txt')\n",
    "\n",
    "for l in f:\n",
    "    toks = l.rstrip().split('\\t')\n",
    "    fam_id = toks[0]\n",
    "    ind_id = toks[1]\n",
    "    mom = toks[2]\n",
    "    dad = toks[3]\n",
    "    pop = toks[-1]\n",
    "    if pop != 'TSI' or mom != '0' or dad != '0':\n",
    "        continue\n",
    "    tsi.write('%s\\t%s\\n' % (fam_id, ind_id))\n",
    "f.close()\n",
    "tsi.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.system('plink --file hapmap3_r2_b36_fwd.consensus.qc.poly --maf 0.001 --keep tsi.ind --make-bed --out tsi')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "max_chro_pos = defaultdict(int)\n",
    "f = open('tsi.bim')\n",
    "for l in f:\n",
    "    toks = l.rstrip().split('\\t')\n",
    "    chrom = int(toks[0])\n",
    "    if chrom > 22:\n",
    "        continue\n",
    "    pos = int(toks[3])\n",
    "    if pos > max_chro_pos[chrom]:\n",
    "        max_chro_pos[chrom] = pos\n",
    "f.close()\n",
    "w = open('max_chro_pos.pickle', 'w')\n",
    "pickle.dump(max_chro_pos, w)\n",
    "w.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sequencial run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#os.system('plink --bfile tsi --freq --out tsi')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "window_size = 2000000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 1.68 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time os.system('plink --bfile tsi --freq --out tsi')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def traverse_genome(traverse_fun, state=None):\n",
    "    if state is None:\n",
    "        state = {}\n",
    "    for chrom, max_pos in max_chro_pos.items():\n",
    "        num_bin = (max_pos + 1) // window_size\n",
    "        for my_bin in range(num_bin):\n",
    "            start_pos = my_bin * window_size + 1  # 1-start\n",
    "            end_pos = start_pos + window_size\n",
    "            traverse_fun(state, chrom, start_pos, end_pos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 192 ms, sys: 1.06 s, total: 1.25 s\n",
      "Wall time: 4min 47s\n"
     ]
    }
   ],
   "source": [
    "def compute_MAF(state, chrom, start_pos, end_pos):\n",
    "    os.system('plink --bfile tsi --freq --out tsi-%d-%d --chr %d --from-bp %d --to-bp %d' %\n",
    "                  (chrom, start_pos, chrom, start_pos, end_pos))\n",
    "    os.remove('tsi-%d-%d.log' % (chrom, start_pos))\n",
    "    \n",
    "%time traverse_genome(compute_MAF)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def gather_statistics(state, chrom, start_pos, end_pos):\n",
    "    try:\n",
    "        f = open('tsi-%d-%d.frq' % (chrom, start_pos))\n",
    "    except:\n",
    "        # empty block\n",
    "        state['block_mafs'][(chrom, start_pos)] = []\n",
    "        return\n",
    "    f.readline()\n",
    "    for cnt, l in enumerate(f):\n",
    "        toks = [tok for tok in l.rstrip().split(' ') if tok != '']\n",
    "        maf = float(toks[-2])\n",
    "        state['snp_cnt'] += 1\n",
    "        state['sum_maf'] += maf\n",
    "        state['block_mafs'][(chrom, start_pos)].append(maf)\n",
    "    f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "stats = {'snp_cnt': 0, 'sum_maf': 0.0, 'block_mafs': defaultdict(list)}\n",
    "traverse_genome(gather_statistics, state=stats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1222126 0.23159641651\n"
     ]
    }
   ],
   "source": [
    "print(stats['snp_cnt'], stats['sum_maf'] / stats['snp_cnt'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_mafs = []\n",
    "for mafs in stats['block_mafs'].values():\n",
    "    all_mafs.extend(mafs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 84 ms, sys: 4 ms, total: 88 ms\n",
      "Wall time: 132 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.22159999999999999"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time np.median(all_mafs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.2216\n"
     ]
    }
   ],
   "source": [
    "all_mafs.sort()\n",
    "middle = len(all_mafs) // 2\n",
    "#array of even size\n",
    "print((all_mafs[middle] + all_mafs[middle + 1]) / 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['block_mafs', 'snp_cnt', 'sum_maf']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stats.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import functools\n",
    "\n",
    "def collect_mafs(state, chrom, start_pos, end_pos, block_mafs):\n",
    "    state[chrom] += len(block_mafs[(chrom, start_pos)])\n",
    "\n",
    "chrom_cnts = defaultdict(int)\n",
    "traverse_genome(functools.partial(collect_mafs, block_mafs=stats['block_mafs']),\n",
    "                state=chrom_cnts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 1\t100780\n",
      " 2\t102377\n",
      " 3\t 84921\n",
      " 4\t 75819\n",
      " 5\t 78013\n",
      " 6\t 82169\n",
      " 7\t 67218\n",
      " 8\t 66803\n",
      " 9\t 56921\n",
      "10\t 65166\n",
      "11\t 62182\n",
      "12\t 60381\n",
      "13\t 46354\n",
      "14\t 40525\n",
      "15\t 37217\n",
      "16\t 38443\n",
      "17\t 33175\n",
      "18\t 36144\n",
      "19\t 21749\n",
      "20\t 31872\n",
      "21\t 16978\n",
      "22\t 16919\n"
     ]
    }
   ],
   "source": [
    "for chrom in range(1, 23):\n",
    "    print('%2d\\t%6d' % (chrom, chrom_cnts[chrom]))\n",
    "#block printing on the next chapter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}