{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "from Bio.PopGen.GenePop import read\n",
    "from Bio.PopGen.GenePop.LargeFileParser import read as read_large\n",
    "\n",
    "from genomics.popgen.plink.convert import to_genepop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "f = open('relationships_w_pops_121708.txt')\n",
    "pop_ind = defaultdict(list)\n",
    "f.readline()  # header\n",
    "for line in f:\n",
    "    toks = line.rstrip().split('\\t')\n",
    "    fam_id = toks[0]\n",
    "    ind_id = toks[1]\n",
    "    pop = toks[-1]\n",
    "    pop_ind[pop].append((fam_id, ind_id))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check for consistency issues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Problems with 2469/NA20281\n"
     ]
    }
   ],
   "source": [
    "all_inds = []\n",
    "for inds in pop_ind.values():\n",
    "    all_inds.extend(inds)\n",
    "for line in open('hapmap1.ped'):\n",
    "    toks = line.rstrip().replace(' ', '\\t').split('\\t')\n",
    "    fam = toks[0]\n",
    "    ind = toks[1]\n",
    "    if (fam, ind) not in all_inds:\n",
    "        print('Problems with %s/%s' % (fam, ind))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert from PLINK to Genepop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "to_genepop('hapmap1_auto', 'hapmap1_auto', pop_ind)\n",
    "to_genepop('hapmap10', 'hapmap10', pop_ind)\n",
    "to_genepop('hapmap10_auto', 'hapmap10_auto', pop_ind)\n",
    "to_genepop('hapmap10_auto_noofs_ld', 'hapmap10_auto_noofs_ld', pop_ind)\n",
    "to_genepop('hapmap10_auto_noofs_2', 'hapmap10_auto_noofs_2', pop_ind)\n",
    "#slow\n",
    "#talk about coding: ACGT - 1234\n",
    "#talk about pop Pop names"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using the in-memory parser"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Careful: this will severely increase your memory usage, consider it optional"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "rec = read(open('hapmap1_auto.gp'))\n",
    "print('Header: %s' % len(rec.comment_line))\n",
    "print('Number of loci %d' % len(rec.loci_list))\n",
    "print('Number of populations %d' % len(rec.pop_list))\n",
    "print('Population names: %s' % ', '.join(rec.pop_list))\n",
    "print('Individuals per population %s' % ', '.join([str(len(inds)) for inds in rec.populations]))\n",
    "ind = rec.populations[1][0]\n",
    "print('Individual %s, SNP %s, alleles: %d %d' % (ind[0], rec.loci_list[0], ind[1][0][0], ind[1][0][1]))\n",
    "del rec\n",
    "#talk about population names"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using the Large File Parser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def count_individuals(fname):\n",
    "    rec = read_large(open(fname))\n",
    "    pop_sizes = []\n",
    "    for line in rec.data_generator():\n",
    "        if line == ():\n",
    "            pop_sizes.append(0)\n",
    "        else:\n",
    "            pop_sizes[-1] += 1\n",
    "    return pop_sizes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Individuals per population 82, 165, 84, 85, 88, 86, 90, 77, 171, 88, 167\n"
     ]
    }
   ],
   "source": [
    "print('Individuals per population %s' % ', '.join([str(ninds) for ninds in count_individuals('hapmap1_auto.gp')]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "143809\n",
      "138694\n",
      "55217\n"
     ]
    }
   ],
   "source": [
    "print(len(read_large(open('hapmap10.gp')).loci_list))\n",
    "print(len(read_large(open('hapmap10_auto.gp')).loci_list))\n",
    "print(len(read_large(open('hapmap10_auto_noofs_ld.gp')).loci_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}