{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Calculate Fst and other popgen statistics" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import ipyrad.analysis as ipa\n", "import numpy as np\n", "import pandas as pd\n", "import itertools " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# get a sequence alignment from the HDF5" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "database = \"/home/deren/Downloads/spp30.seqs.hdf5\"\n", "idx=0\n", "wmin=0\n", "wmax=100000" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "parse_phystring" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# init a popgen analysis object\n", "pop = ipa.popgen(\n", " name=\"ped\", \n", " workdir=\"analysis-popgen\", \n", " data=\"./pedicularis/clust85_min12_outfiles/clust85_min12.loci\",\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'dict' object has no attribute 'key'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fst\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/Documents/ipyrad/ipyrad/analysis/popgen.py\u001b[0m in \u001b[0;36m_fst\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 114\u001b[0m ) \n\u001b[1;32m 115\u001b[0m \u001b[0md\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnpops\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 116\u001b[0;31m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 117\u001b[0m \u001b[0;31m# iterate over pairs of pops and fill Fst values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0mpairs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mitertools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcombinations\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpopdict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mAttributeError\u001b[0m: 'dict' object has no attribute 'key'" ] } ], "source": [ "pop._fst()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Update `loci_to_arr()`\n", "\n", "... Now I'm thinking maybe I really should make a HDF5 format as the default for downstream analyses..., it can have SNPs, mapfile, and whole sequences with locimap. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def _loci_to_arr(self):\n", " \"\"\"\n", " return a frequency array from a loci file for all loci with taxa from \n", " taxdict and min coverage from mindict. \n", " \"\"\"\n", "\n", " # \n", " loci = infile.read().strip().split(\"|\\n\")\n", " \n", " ## make the array (4 or 5) and a mask array to remove loci without cov\n", " nloci = len(loci)\n", " keep = np.zeros(nloci, dtype=np.bool_)\n", " arr = np.zeros((nloci, 4, 300), dtype=np.float64)\n", "\n", " ## six rows b/c one for each p3, and for the fused p3 ancestor\n", " if len(taxdict) == 5:\n", " arr = np.zeros((nloci, 6, 300), dtype=np.float64)\n", "\n", " ## if not mindict, make one that requires 1 in each taxon\n", " if isinstance(mindict, int):\n", " mindict = {i: mindict for i in taxdict}\n", " elif isinstance(mindict, dict):\n", " mindict = {i: mindict[i] for i in taxdict}\n", " else:\n", " mindict = {i: 1 for i in taxdict}\n", "\n", " ## raise error if names are not 'p[int]' \n", " allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']\n", " if any([i not in allowed_names for i in taxdict]):\n", " raise IPyradError(\\\n", " \"keys in taxdict must be named 'p1' through 'p4' or 'p5'\")\n", "\n", " ## parse key names\n", " keys = sorted([i for i in taxdict.keys() if i[0] == 'p'])\n", " outg = keys[-1]\n", "\n", " ## grab seqs just for the good guys\n", " for loc in xrange(nloci):\n", "\n", " ## parse the locus\n", " lines = loci[loc].split(\"\\n\")[:-1]\n", " names = [i.split()[0] for i in lines]\n", " seqs = np.array([list(i.split()[1]) for i in lines])\n", "\n", " ## check that names cover the taxdict (still need to check by site)\n", " covs = [sum([j in names for j in taxdict[tax]]) >= mindict[tax] \\\n", " for tax in taxdict]\n", "\n", " ## keep locus\n", " if all(covs):\n", " keep[loc] = True\n", "\n", " ## get the refseq\n", " refidx = np.where([i in taxdict[outg] for i in names])[0]\n", " refseq = seqs[refidx].view(np.uint8)\n", " ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])\n", "\n", " ## freq of ref in outgroup\n", " iseq = _reffreq2(ancestral, refseq, GETCONS2)\n", " arr[loc, -1, :iseq.shape[1]] = iseq \n", "\n", " ## enter 4-taxon freqs\n", " if len(taxdict) == 4:\n", " for tidx, key in enumerate(keys[:-1]):\n", "\n", " ## get idx of names in test tax\n", " nidx = np.where([i in taxdict[key] for i in names])[0]\n", " sidx = seqs[nidx].view(np.uint8)\n", " \n", " ## get freq of sidx\n", " iseq = _reffreq2(ancestral, sidx, GETCONS2)\n", " \n", " ## fill it in \n", " arr[loc, tidx, :iseq.shape[1]] = iseq\n", "\n", " else:\n", "\n", " ## entere p5; and fill it in\n", " iseq = _reffreq2(ancestral, refseq, GETCONS2) \n", " arr[loc, -1, :iseq.shape[1]] = iseq \n", " \n", " ## enter p1\n", " nidx = np.where([i in taxdict['p1'] for i in names])[0]\n", " sidx = seqs[nidx].view(np.uint8)\n", " iseq = _reffreq2(ancestral, sidx, GETCONS2)\n", " arr[loc, 0, :iseq.shape[1]] = iseq\n", " \n", " ## enter p2\n", " nidx = np.where([i in taxdict['p2'] for i in names])[0]\n", " sidx = seqs[nidx].view(np.uint8)\n", " iseq = _reffreq2(ancestral, sidx, GETCONS2)\n", " arr[loc, 1, :iseq.shape[1]] = iseq\n", " \n", " ## enter p3 with p4 masked, and p4 with p3 masked\n", " nidx = np.where([i in taxdict['p3'] for i in names])[0]\n", " nidy = np.where([i in taxdict['p4'] for i in names])[0]\n", " sidx = seqs[nidx].view(np.uint8)\n", " sidy = seqs[nidy].view(np.uint8)\n", " xseq = _reffreq2(ancestral, sidx, GETCONS2)\n", " yseq = _reffreq2(ancestral, sidy, GETCONS2)\n", " mask3 = xseq != 0\n", " mask4 = yseq != 0\n", " xseq[mask4] = 0\n", " yseq[mask3] = 0\n", " arr[loc, 2, :xseq.shape[1]] = xseq\n", " arr[loc, 3, :yseq.shape[1]] = yseq\n", " \n", " ## enter p34 \n", " nidx = nidx.tolist() + nidy.tolist()\n", " sidx = seqs[nidx].view(np.uint8)\n", " iseq = _reffreq2(ancestral, sidx, GETCONS2)\n", " arr[loc, 4, :iseq.shape[1]] = iseq\n", "\n", "\n", " ## size-down array to the number of loci that have taxa for the test\n", " arr = arr[keep, :, :]\n", "\n", " ## size-down sites to \n", " arr = masknulls(arr)\n", "\n", " return arr, keep\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calculations\n", "\n", "Fst can be calculated per SNP, or averaged over sliding windows of SNPs of some size if reference information is present. Or over loci if " ] }, { "cell_type": "code", "execution_count": 309, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "fst Empty DataFrame\n", "Columns: []\n", "Index: []\n", "pi Empty DataFrame\n", "Columns: [0]\n", "Index: []\n", "theta Empty DataFrame\n", "Columns: [0]\n", "Index: []" ] }, "execution_count": 309, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop.results" ] }, { "cell_type": "code", "execution_count": 527, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'i': 1, 'j': 1}" ] }, "execution_count": 527, "metadata": {}, "output_type": "execute_result" } ], "source": [ "popdict = {'i': [2, 3], 'j': [2, 4]}\n", "mindict = {}\n", "(mindict if mindict else {j:1 for j in popdict})" ] }, { "cell_type": "code", "execution_count": 528, "metadata": {}, "outputs": [], "source": [ "testdata1 = np.array([\n", " [0, 0, 0, 0, 0],\n", " [1, 0, 0, 0, 0],\n", " [1, 1, 1, 0, 0], \n", " [1, 1, 1, 1, 0],\n", " [1, 1, 1, 1, 0], \n", " [1, 1, 1, 1, 1],\n", "])\n", "\n", "testdata2 = np.random.binomial(1, 0.01, (6, 100))" ] }, { "cell_type": "code", "execution_count": 532, "metadata": {}, "outputs": [], "source": [ "class Self:\n", " def __init__(self, data, popdict={}, mindict={}, mapfile=False):\n", " self.data = data\n", " self.popdict = popdict\n", " self.mindict = mindict\n", " self._parsedata()\n", " self._checkdicts()\n", " self._filterdata()\n", " \n", " def _parsedata(self):\n", " \"\"\"\n", " parse data as an ndarray, vcfile, or locifile and store \n", " in ... arrays, one with all sites, and mapfile... \n", " \"\"\"\n", " pass\n", " \n", " \n", " def _checkdicts(self): \n", " #assert len(popdict) > 2, \"must enter at least two populations\"\n", " self.mindict = (mindict if mindict else {j:1 for j in popdict})\n", " \n", " \n", " def _filterdata(self):\n", " pass\n", " \n", " \n", " def fst_H(self, pop1idx, pop2idx):\n", " ii = list(itertools.combinations(pop1idx, 2))\n", " jj = list(itertools.combinations(pop2idx, 2))\n", " kk = itertools.combinations(pop1idx + pop2idx, 2)\n", " between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk) \n", "\n", " diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]\n", " sums = np.sum(diff, axis=0)\n", " a = sums / sums.shape[0]\n", "\n", " diff = [self.data[i] != self.data[j] for (i, j) in between]\n", " sums = np.sum(diff, axis=0)\n", " b = sums / sums.shape[0]\n", " return abs(1 - (a / b))\n", " \n", " \n", " def theta_W(self):\n", " \"return Watternson's theta for each population\"\n", " df = pd.DataFrame(\n", " data=np.zeros(len(self.popdict)),\n", " columns=[\"theta\"],\n", " index=list(self.popdict.keys())\n", " )\n", " for row in self.popdict:\n", " idat = self.data[self.popdict[row], :]\n", " segregating = np.invert(np.all(idat == idat[0], axis=0)).sum()\n", " denom = np.sum((1 / i for i in range(1, idat.shape[1] - 1)))\n", " theta = segregating / denom\n", " df.loc[row] = theta\n", " return df\n", " \n", " \n", " def pi(self, pop1idx, pop2idx):\n", " pass" ] }, { "cell_type": "code", "execution_count": 533, "metadata": {}, "outputs": [], "source": [ "popdict = {'A': [0, 1], 'B': [2, 3, 4, 5]}\n", "mindict = {'A': 1, 'B': 2}\n", "self = Self(testdata2, popdict, mindict)" ] }, { "cell_type": "code", "execution_count": 535, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
theta
A0.387
B0.581
\n", "
" ], "text/plain": [ " theta\n", "A 0.387\n", "B 0.581" ] }, "execution_count": 535, "metadata": {}, "output_type": "execute_result" } ], "source": [ "self.theta_W()" ] }, { "cell_type": "code", "execution_count": 537, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]\n", " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]\n", "\n", "Fst per SNP: [ nan nan nan nan nan nan nan nan nan nan nan nan nan nan\n", " nan nan 0.75 nan nan nan nan nan nan nan nan nan nan nan\n", " nan nan nan nan nan nan nan nan nan 0.5 nan nan nan nan\n", " nan nan nan nan nan nan nan nan nan nan nan nan nan nan\n", " nan nan nan nan 0.5 nan nan nan nan nan nan nan nan nan\n", " nan nan nan nan nan nan nan nan nan nan nan nan nan 0.75\n", " nan nan nan nan nan nan nan nan nan nan nan nan nan 0.5\n", " nan nan]\n", "\n", "Fst mean: nan\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/deren/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:40: RuntimeWarning: invalid value encountered in true_divide\n" ] }, { "ename": "TypeError", "evalue": "theta_W() takes 1 positional argument but 3 were given", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\nFst per SNP:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfst_H\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\nFst mean:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfst_H\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\ntheta per individual:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtheta_W\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\ntheta per population:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtheta_W\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: theta_W() takes 1 positional argument but 3 were given" ] } ], "source": [ "print(self.data)\n", "print(\"\\nFst per SNP:\", self.fst_H([0, 1], [2, 3, 4, 5]))\n", "print(\"\\nFst mean:\", np.mean(self.fst_H([0, 1], [2, 3, 4, 5])))\n", "print(\"\\ntheta per individual:\", self.theta_W([0, 1], [2, 3, 4, 5]))\n", "print(\"\\ntheta per population:\", self.theta_W([0, 1], [2, 3, 4, 5]))" ] }, { "cell_type": "code", "execution_count": 555, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
aaaaaa0.00.0
bbbbbbbbbbb0.00.0
\n", "
" ], "text/plain": [ " 0 1\n", "aaaaaa 0.0 0.0\n", "bbbbbbbbbbb 0.0 0.0" ] }, "execution_count": 555, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(\n", " data=np.zeros((2, 2)),\n", " index=['aaaaaa', 'bbbbbbbbbbb'],\n", ")" ] }, { "cell_type": "code", "execution_count": 242, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([0.2, 0. , 0. , 0.6, 0.6]), array([0.2, 0. , 0. , 0.6, 0.6]))" ] }, "execution_count": 242, "metadata": {}, "output_type": "execute_result" } ], "source": [ "self.fst([0, 1], [2, 3, 4, 5])" ] }, { "cell_type": "code", "execution_count": 235, "metadata": {}, "outputs": [], "source": [ "pop1idx = [0, 1]\n", "pop2idx = [2, 3, 4, 5]\n", "pop1 = self.data[pop1idx, :]\n", "pop2 = self.data[pop2idx, :] \n", "popa = self.data[pop1idx + pop2idx, :]" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ilist = itertools.combinations(range(4), 2)\n", "list(ilist)" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ilist = itertools.combinations(range(pop2.shape[0]), 2)\n", "list(ilist)" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "def diffs(pop):\n", " ilist = itertools.combinations(range(pop.shape[0]), 2)\n", " diff = [pop[i] == pop[j] for (i, j) in ilist]\n", " sums = np.sum(diff, axis=0)\n", " return sums/sums.shape[0]" ] }, { "cell_type": "code", "execution_count": 231, "metadata": {}, "outputs": [], "source": [ "def fst(pop1idx, pop2idx):\n", " ii = list(itertools.combinations(pop1idx, 2))\n", " jj = list(itertools.combinations(pop2idx, 2))\n", " kk = itertools.combinations(pop1idx + pop2idx, 2)\n", " between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk) \n", "\n", " diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]\n", " sums = np.sum(diff, axis=0)\n", " a = sums / sums.shape[0]\n", " \n", " diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]\n", " sums = np.sum(diff, axis=0)\n", " b = sums / sums.shape[0]\n", " \n", " return abs(1 - (a / b))" ] }, { "cell_type": "code", "execution_count": 175, "metadata": {}, "outputs": [], "source": [ "#fst([0, 1], [2, 3, 4, 5])" ] }, { "cell_type": "code", "execution_count": 245, "metadata": {}, "outputs": [], "source": [ "ii = list(itertools.combinations(pop1idx, 2))\n", "jj = list(itertools.combinations(pop2idx, 2))\n", "within = ii + jj\n", "kk = itertools.combinations(pop1idx + pop2idx, 2)\n", "between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)" ] }, { "cell_type": "code", "execution_count": 246, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 1), (2, 3), (2, 4), (2, 5), (3, 4), (3, 5), (4, 5)]" ] }, "execution_count": 246, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ii + jj" ] }, { "cell_type": "code", "execution_count": 247, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 247, "metadata": {}, "output_type": "execute_result" } ], "source": [ "between" ] }, { "cell_type": "code", "execution_count": 244, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, 0, 0],\n", " [1, 0, 0, 0, 0],\n", " [1, 1, 1, 0, 0],\n", " [1, 1, 1, 1, 0],\n", " [1, 1, 1, 1, 0],\n", " [1, 1, 1, 1, 1]])" ] }, "execution_count": 244, "metadata": {}, "output_type": "execute_result" } ], "source": [ "self.data" ] }, { "cell_type": "code", "execution_count": 209, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.2, 0. , 0. , 0.6, 0.6])" ] }, "execution_count": 209, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]\n", "sums = np.sum(diff, axis=0)\n", "a = sums / sums.shape[0]\n", "\n", "a" ] }, { "cell_type": "code", "execution_count": 243, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.75, 1. , 1. , 0.5 , 0.5 ])" ] }, "execution_count": 243, "metadata": {}, "output_type": "execute_result" } ], "source": [ "abs(1 - (a / b))" ] }, { "cell_type": "code", "execution_count": 210, "metadata": {}, "outputs": [], "source": [ "diff = [self.data[i] != self.data[j] for (i, j) in between]\n", "sums = np.sum(diff, axis=0)\n", "b = sums / sums.shape[0]" ] }, { "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0.75, 1. , 1. , 0.5 , -0.5 ])" ] }, "execution_count": 195, "metadata": {}, "output_type": "execute_result" } ], "source": [ "1 - (a / b)" ] }, { "cell_type": "code", "execution_count": 196, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, 0, 0],\n", " [1, 0, 0, 0, 0],\n", " [1, 1, 1, 0, 0],\n", " [1, 1, 1, 1, 0],\n", " [1, 1, 1, 1, 0],\n", " [1, 1, 1, 1, 1]])" ] }, "execution_count": 196, "metadata": {}, "output_type": "execute_result" } ], "source": [ "self.data" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 2), (0, 3), (0, 4), (0, 5), (1, 2), (1, 3), (1, 4), (1, 5)]" ] }, "execution_count": 171, "metadata": {}, "output_type": "execute_result" } ], "source": [ "between" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.6 , 0.85714286, 0.85714286, 0.5 , 0.3 ])" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diffs(pop2) / diffs(popa)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 0\n", "0 1\n", "1 0\n", "1 1\n" ] } ], "source": [ "for i in range(pop1.shape[0]):\n", " for j in range(pop1.shape[0]):\n", " print(i,j)" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [], "source": [ "def _fstH(self, pop1idx, pop2idx):\n", " \"Hudson's Fst estimator\"\n", " pop1 = self.data[pop1idx, :]\n", " pop2 = self.data[pop2idx, :] \n", " pop1freq = np.mean(pop1, axis=0)\n", " pop2freq = np.mean(pop2, axis=0)\n", " a = (pop1freq - pop2freq) ** 2\n", " b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)\n", " c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)\n", " return pd.Series(a - b - c).round(5)" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [], "source": [ "def _fstWC(self, pop1idx, pop2idx):\n", " \"Wier and Cockerham (1984) Fst estimator\"\n", " pop1 = self.data[pop1idx, :]\n", " pop2 = self.data[pop2idx, :] \n", " popa = self.data[pop1idx + pop2idx, :]\n", " pop1freq = np.mean(pop1, axis=0)\n", " pop2freq = np.mean(pop2, axis=0)\n", " popafreq = np.mean(popa, axis=0)\n", " nbar = pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.\n", " s2a = (\n", " (1. / ((popa.shape[0] - 1) * nbar))\n", " * ((pop1.shape[0] * 1))\n", " )\n", " nc = (\n", " (1. / (npops - 1.)) \n", " * (popa.shape[0] \n", " - ((pop1.shape[0] ** 2 + pop2.shape[0] ** 2) / popa.shape[0])\n", " )\n", " )\n", " #t1 = \n", " #t2 = \n", " return t1, t2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fstWC(self, pop1idx, pop2idx):\n", " pop1 = self.data[pop1idx, :]\n", " pop2 = self.data[pop2idx, :] \n", " popa = self.data[pop1idx + pop2idx, :]\n", " \n", " pop1freq = np.mean(pop1, axis=0)\n", " pop2freq = np.mean(pop2, axis=0)\n", " popafreq = np.mean(popa, axis=0)\n", " \n", " # frequecy of allele A in the sample of size ni from population i\n", " p = ...\n", " # observed proportion of individuals heterozygous for allele A\n", " hbar = \n", " # the average sample size\n", " nbar = \n", " nc = " ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2. , 1.5, 1.5, 1. , 0.5])" ] }, "execution_count": 138, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.\n" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.5" ] }, "execution_count": 140, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop1.shape[0] / 2. + pop2.shape[0] / 2.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\t \n", "\n", "\t npops= 2.0\n", "\t nsamples = float(NpopA + NpopB)\n", "\t n_bar= (NpopA / npops) + (NpopB / npops)\n", "\t samplefreq = ( (popAcount+popBcount) / (NpopA + NpopB) )\n", "\t pop1freq = popAcount / float(NpopA )\n", "\t pop2freq = popBcount / float(NpopB )\n", "\t Npop1 = NpopA\n", "\t Npop2 = NpopB\n", "\t S2A= (1/ ( (npops-1.0) * n_bar) ) * \n", " ( ( (Npop1)* ((pop1freq-samplefreq)**2) ) + \n", " ( (Npop2)*((pop2freq-samplefreq)**2) ) )\n", "\t nc = 1.0/(npops-1.0) * ( (Npop1+Npop2) - (((Npop1**2)+(Npop2**2)) / (Npop1+Npop2)) )\n", "\t T_1 = S2A -( ( 1/(n_bar-1) ) * ( (samplefreq * (1-samplefreq)) - ((npops-1)/npops)* S2A ) )\n", "\t T_2 = (( (nc-1) / (n_bar-1) ) * samplefreq *(1-samplefreq) ) + (1.0 + (((npops-1)*(n_bar-nc)) / (n_bar-1))) * (S2A/npops)" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0.000\n", "1 1.000\n", "2 1.000\n", "3 0.333\n", "4 -0.000\n", "dtype: float64" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop1idx = [0, 1]\n", "pop2idx = [2, 3, 4]\n", "_fst(self, pop1idx, pop2idx)" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0.000\n", "1 -0.333\n", "2 -0.333\n", "3 -0.333\n", "4 0.000\n", "dtype: float64" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop1idx = [0, 4]\n", "pop2idx = [1, 2, 3]\n", "_fst(self, pop1idx, pop2idx)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0 0 0 0 0]\n", " [1 0 0 0 0]]\n", "[[1 1 1 0 0]\n", " [1 1 1 1 0]\n", " [1 1 1 1 1]]\n" ] } ], "source": [ "pop1 = self.data[pop1idx, :]\n", "pop2 = self.data[pop2idx, :]\n", "popa = self.data[pop1idx + pop2idx, :]\n", "\n", "print(pop1)\n", "print(pop2)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "pop1freq = np.mean(pop1, axis=0)\n", "pop2freq = np.mean(pop2, axis=0)\n", "popafreq = np.mean(popa, axis=0)" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.25 , 1. , 1. , 0.44444444, 0.11111111])" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a = (pop1freq - pop2freq) ** 2\n", "a" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.25, 0. , 0. , 0. , 0. ])" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)\n", "b" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0. , 0. , 0. , 0.11111111, 0.11111111])" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)\n", "c" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0.000\n", "1 1.000\n", "2 1.000\n", "3 0.333\n", "4 -0.000\n", "dtype: float64" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.Series(a - b - c).round(3)" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0.000\n", "1 1.000\n", "2 1.000\n", "3 0.333\n", "4 -0.000\n", "dtype: float64" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.Series(a - b - c).round(3)" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0., 0., 0., 0., 0.])" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d = (pop1freq * (1.0 - pop2freq)) + (pop1freq * (1.0 - pop2freq))\n", "d" ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-0.25 , -0.66666667, -0.66666667, -0.66666667, -0.66666667])" ] }, "execution_count": 151, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(np.mean(popa, axis=0) - np.mean(pop2, axis=0)) / (np.mean(popa, axis=0))" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, 0, 0],\n", " [1, 0, 0, 0, 0],\n", " [1, 1, 1, 0, 0],\n", " [1, 1, 1, 1, 0],\n", " [1, 1, 1, 1, 1]])" ] }, "execution_count": 152, "metadata": {}, "output_type": "execute_result" } ], "source": [ "popa" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([3, 3, 3, 2, 1])" ] }, "execution_count": 153, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.count_nonzero(pop2, axis=0)" ] }, { "cell_type": "code", "execution_count": 169, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.2, 0.4, 0.4, 0.4, 0.2])" ] }, "execution_count": 169, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a = np.mean(popa, axis=0)\n", "mask = a > 0.5\n", "a[a > 0.5] = abs(a[a > 0.5] - 1)\n", "a\n", "\n" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0. , 0. , 0. , 0.33333333, 0.33333333])" ] }, "execution_count": 172, "metadata": {}, "output_type": "execute_result" } ], "source": [ "b = np.mean(pop2, axis=0)\n", "mask = b > 0.5\n", "b[b > 0.5] = abs(b[b > 0.5] - 1)\n", "b\n" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1. , 1. , 1. , 0.16666667, -0.66666667])" ] }, "execution_count": 171, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(a - b) / a" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }