{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from collections import defaultdict\n", "\n", "from Bio.PopGen.GenePop import read\n", "from Bio.PopGen.GenePop.LargeFileParser import read as read_large\n", "\n", "from genomics.popgen.plink.convert import to_genepop" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "f = open('relationships_w_pops_121708.txt')\n", "pop_ind = defaultdict(list)\n", "f.readline() # header\n", "for line in f:\n", " toks = line.rstrip().split('\\t')\n", " fam_id = toks[0]\n", " ind_id = toks[1]\n", " pop = toks[-1]\n", " pop_ind[pop].append((fam_id, ind_id))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Check for consistency issues" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Problems with 2469/NA20281\n" ] } ], "source": [ "all_inds = []\n", "for inds in pop_ind.values():\n", " all_inds.extend(inds)\n", "for line in open('hapmap1.ped'):\n", " toks = line.rstrip().replace(' ', '\\t').split('\\t')\n", " fam = toks[0]\n", " ind = toks[1]\n", " if (fam, ind) not in all_inds:\n", " print('Problems with %s/%s' % (fam, ind))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert from PLINK to Genepop" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "to_genepop('hapmap1_auto', 'hapmap1_auto', pop_ind)\n", "to_genepop('hapmap10', 'hapmap10', pop_ind)\n", "to_genepop('hapmap10_auto', 'hapmap10_auto', pop_ind)\n", "to_genepop('hapmap10_auto_noofs_ld', 'hapmap10_auto_noofs_ld', pop_ind)\n", "to_genepop('hapmap10_auto_noofs_2', 'hapmap10_auto_noofs_2', pop_ind)\n", "#slow\n", "#talk about coding: ACGT - 1234\n", "#talk about pop Pop names" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using the in-memory parser" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Careful: this will severely increase your memory usage, consider it optional" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "rec = read(open('hapmap1_auto.gp'))\n", "print('Header: %s' % len(rec.comment_line))\n", "print('Number of loci %d' % len(rec.loci_list))\n", "print('Number of populations %d' % len(rec.pop_list))\n", "print('Population names: %s' % ', '.join(rec.pop_list))\n", "print('Individuals per population %s' % ', '.join([str(len(inds)) for inds in rec.populations]))\n", "ind = rec.populations[1][0]\n", "print('Individual %s, SNP %s, alleles: %d %d' % (ind[0], rec.loci_list[0], ind[1][0][0], ind[1][0][1]))\n", "del rec\n", "#talk about population names" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using the Large File Parser" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def count_individuals(fname):\n", " rec = read_large(open(fname))\n", " pop_sizes = []\n", " for line in rec.data_generator():\n", " if line == ():\n", " pop_sizes.append(0)\n", " else:\n", " pop_sizes[-1] += 1\n", " return pop_sizes" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Individuals per population 82, 165, 84, 85, 88, 86, 90, 77, 171, 88, 167\n" ] } ], "source": [ "print('Individuals per population %s' % ', '.join([str(ninds) for ninds in count_individuals('hapmap1_auto.gp')]))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "143809\n", "138694\n", "55217\n" ] } ], "source": [ "print(len(read_large(open('hapmap10.gp')).loci_list))\n", "print(len(read_large(open('hapmap10_auto.gp')).loci_list))\n", "print(len(read_large(open('hapmap10_auto_noofs_ld.gp')).loci_list))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }