{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate Fst and other popgen statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ipyrad.analysis as ipa\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import itertools "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get a sequence alignment from the HDF5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "database = \"/home/deren/Downloads/spp30.seqs.hdf5\"\n",
    "idx=0\n",
    "wmin=0\n",
    "wmax=100000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "parse_phystring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# init a popgen analysis object\n",
    "pop = ipa.popgen(\n",
    "    name=\"ped\", \n",
    "    workdir=\"analysis-popgen\", \n",
    "    data=\"./pedicularis/clust85_min12_outfiles/clust85_min12.loci\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'dict' object has no attribute 'key'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-11-0dc99510b3b6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fst\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/Documents/ipyrad/ipyrad/analysis/popgen.py\u001b[0m in \u001b[0;36m_fst\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    114\u001b[0m         ) \n\u001b[1;32m    115\u001b[0m         \u001b[0md\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnpops\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 116\u001b[0;31m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    117\u001b[0m         \u001b[0;31m# iterate over pairs of pops and fill Fst values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    118\u001b[0m         \u001b[0mpairs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mitertools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcombinations\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpopdict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'dict' object has no attribute 'key'"
     ]
    }
   ],
   "source": [
    "pop._fst()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Update `loci_to_arr()`\n",
    "\n",
    "... Now I'm thinking maybe I really should make a HDF5 format as the default for downstream analyses..., it can have SNPs, mapfile, and whole sequences with locimap. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _loci_to_arr(self):\n",
    "    \"\"\"\n",
    "    return a frequency array from a loci file for all loci with taxa from \n",
    "    taxdict and min coverage from mindict. \n",
    "    \"\"\"\n",
    "\n",
    "    # \n",
    "    loci = infile.read().strip().split(\"|\\n\")\n",
    "    \n",
    "    ## make the array (4 or 5) and a mask array to remove loci without cov\n",
    "    nloci = len(loci)\n",
    "    keep = np.zeros(nloci, dtype=np.bool_)\n",
    "    arr = np.zeros((nloci, 4, 300), dtype=np.float64)\n",
    "\n",
    "    ## six rows b/c one for each p3, and for the fused p3 ancestor\n",
    "    if len(taxdict) == 5:\n",
    "        arr = np.zeros((nloci, 6, 300), dtype=np.float64)\n",
    "\n",
    "    ## if not mindict, make one that requires 1 in each taxon\n",
    "    if isinstance(mindict, int):\n",
    "        mindict = {i: mindict for i in taxdict}\n",
    "    elif isinstance(mindict, dict):\n",
    "        mindict = {i: mindict[i] for i in taxdict}\n",
    "    else:\n",
    "        mindict = {i: 1 for i in taxdict}\n",
    "\n",
    "    ## raise error if names are not 'p[int]' \n",
    "    allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']\n",
    "    if any([i not in allowed_names for i in taxdict]):\n",
    "        raise IPyradError(\\\n",
    "            \"keys in taxdict must be named 'p1' through 'p4' or 'p5'\")\n",
    "\n",
    "    ## parse key names\n",
    "    keys = sorted([i for i in taxdict.keys() if i[0] == 'p'])\n",
    "    outg = keys[-1]\n",
    "\n",
    "    ## grab seqs just for the good guys\n",
    "    for loc in xrange(nloci):\n",
    "\n",
    "        ## parse the locus\n",
    "        lines = loci[loc].split(\"\\n\")[:-1]\n",
    "        names = [i.split()[0] for i in lines]\n",
    "        seqs = np.array([list(i.split()[1]) for i in lines])\n",
    "\n",
    "        ## check that names cover the taxdict (still need to check by site)\n",
    "        covs = [sum([j in names for j in taxdict[tax]]) >= mindict[tax] \\\n",
    "                for tax in taxdict]\n",
    "\n",
    "        ## keep locus\n",
    "        if all(covs):\n",
    "            keep[loc] = True\n",
    "\n",
    "            ## get the refseq\n",
    "            refidx = np.where([i in taxdict[outg] for i in names])[0]\n",
    "            refseq = seqs[refidx].view(np.uint8)\n",
    "            ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])\n",
    "\n",
    "            ## freq of ref in outgroup\n",
    "            iseq = _reffreq2(ancestral, refseq, GETCONS2)\n",
    "            arr[loc, -1, :iseq.shape[1]] = iseq \n",
    "\n",
    "            ## enter 4-taxon freqs\n",
    "            if len(taxdict) == 4:\n",
    "                for tidx, key in enumerate(keys[:-1]):\n",
    "\n",
    "                    ## get idx of names in test tax\n",
    "                    nidx = np.where([i in taxdict[key] for i in names])[0]\n",
    "                    sidx = seqs[nidx].view(np.uint8)\n",
    "                   \n",
    "                    ## get freq of sidx\n",
    "                    iseq = _reffreq2(ancestral, sidx, GETCONS2)\n",
    "                   \n",
    "                    ## fill it in \n",
    "                    arr[loc, tidx, :iseq.shape[1]] = iseq\n",
    "\n",
    "            else:\n",
    "\n",
    "                ## entere p5; and fill it in\n",
    "                iseq = _reffreq2(ancestral, refseq, GETCONS2) \n",
    "                arr[loc, -1, :iseq.shape[1]] = iseq \n",
    "                \n",
    "                ## enter p1\n",
    "                nidx = np.where([i in taxdict['p1'] for i in names])[0]\n",
    "                sidx = seqs[nidx].view(np.uint8)\n",
    "                iseq = _reffreq2(ancestral, sidx, GETCONS2)\n",
    "                arr[loc, 0, :iseq.shape[1]] = iseq\n",
    "                \n",
    "                ## enter p2\n",
    "                nidx = np.where([i in taxdict['p2'] for i in names])[0]\n",
    "                sidx = seqs[nidx].view(np.uint8)\n",
    "                iseq = _reffreq2(ancestral, sidx, GETCONS2)\n",
    "                arr[loc, 1, :iseq.shape[1]] = iseq\n",
    "                \n",
    "                ## enter p3 with p4 masked, and p4 with p3 masked\n",
    "                nidx = np.where([i in taxdict['p3'] for i in names])[0]\n",
    "                nidy = np.where([i in taxdict['p4'] for i in names])[0]\n",
    "                sidx = seqs[nidx].view(np.uint8)\n",
    "                sidy = seqs[nidy].view(np.uint8)\n",
    "                xseq = _reffreq2(ancestral, sidx, GETCONS2)\n",
    "                yseq = _reffreq2(ancestral, sidy, GETCONS2)\n",
    "                mask3 = xseq != 0\n",
    "                mask4 = yseq != 0\n",
    "                xseq[mask4] = 0\n",
    "                yseq[mask3] = 0\n",
    "                arr[loc, 2, :xseq.shape[1]] = xseq\n",
    "                arr[loc, 3, :yseq.shape[1]] = yseq\n",
    "                \n",
    "                ## enter p34 \n",
    "                nidx = nidx.tolist() + nidy.tolist()\n",
    "                sidx = seqs[nidx].view(np.uint8)\n",
    "                iseq = _reffreq2(ancestral, sidx, GETCONS2)\n",
    "                arr[loc, 4, :iseq.shape[1]] = iseq\n",
    "\n",
    "\n",
    "    ## size-down array to the number of loci that have taxa for the test\n",
    "    arr = arr[keep, :, :]\n",
    "\n",
    "    ## size-down sites to \n",
    "    arr = masknulls(arr)\n",
    "\n",
    "    return arr, keep\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculations\n",
    "\n",
    "Fst can be calculated per SNP, or averaged over sliding windows of SNPs of some size if reference information is present. Or over loci if "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 309,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "fst     Empty DataFrame\n",
       "Columns: []\n",
       "Index: []\n",
       "pi      Empty DataFrame\n",
       "Columns: [0]\n",
       "Index: []\n",
       "theta   Empty DataFrame\n",
       "Columns: [0]\n",
       "Index: []"
      ]
     },
     "execution_count": 309,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pop.results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 527,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'i': 1, 'j': 1}"
      ]
     },
     "execution_count": 527,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "popdict = {'i': [2, 3], 'j': [2, 4]}\n",
    "mindict = {}\n",
    "(mindict if mindict else {j:1 for j in popdict})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 528,
   "metadata": {},
   "outputs": [],
   "source": [
    "testdata1 = np.array([\n",
    "    [0, 0, 0, 0, 0],\n",
    "    [1, 0, 0, 0, 0],\n",
    "    [1, 1, 1, 0, 0], \n",
    "    [1, 1, 1, 1, 0],\n",
    "    [1, 1, 1, 1, 0], \n",
    "    [1, 1, 1, 1, 1],\n",
    "])\n",
    "\n",
    "testdata2 = np.random.binomial(1, 0.01, (6, 100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 532,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Self:\n",
    "    def __init__(self, data, popdict={}, mindict={}, mapfile=False):\n",
    "        self.data = data\n",
    "        self.popdict = popdict\n",
    "        self.mindict = mindict\n",
    "        self._parsedata()\n",
    "        self._checkdicts()\n",
    "        self._filterdata()\n",
    "        \n",
    "    def _parsedata(self):\n",
    "        \"\"\"\n",
    "        parse data as an ndarray, vcfile, or locifile and store \n",
    "        in ... arrays, one with all sites, and mapfile... \n",
    "        \"\"\"\n",
    "        pass\n",
    "    \n",
    "    \n",
    "    def _checkdicts(self):       \n",
    "        #assert len(popdict) > 2, \"must enter at least two populations\"\n",
    "        self.mindict = (mindict if mindict else {j:1 for j in popdict})\n",
    "        \n",
    "        \n",
    "    def _filterdata(self):\n",
    "        pass\n",
    "        \n",
    "        \n",
    "    def fst_H(self, pop1idx, pop2idx):\n",
    "        ii = list(itertools.combinations(pop1idx, 2))\n",
    "        jj = list(itertools.combinations(pop2idx, 2))\n",
    "        kk = itertools.combinations(pop1idx + pop2idx, 2)\n",
    "        between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)    \n",
    "\n",
    "        diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]\n",
    "        sums = np.sum(diff, axis=0)\n",
    "        a = sums / sums.shape[0]\n",
    "\n",
    "        diff = [self.data[i] != self.data[j] for (i, j) in between]\n",
    "        sums = np.sum(diff, axis=0)\n",
    "        b = sums / sums.shape[0]\n",
    "        return abs(1 - (a / b))\n",
    "    \n",
    "    \n",
    "    def theta_W(self):\n",
    "        \"return Watternson's theta for each population\"\n",
    "        df = pd.DataFrame(\n",
    "            data=np.zeros(len(self.popdict)),\n",
    "            columns=[\"theta\"],\n",
    "            index=list(self.popdict.keys())\n",
    "        )\n",
    "        for row in self.popdict:\n",
    "            idat = self.data[self.popdict[row], :]\n",
    "            segregating = np.invert(np.all(idat == idat[0], axis=0)).sum()\n",
    "            denom = np.sum((1 / i for i in range(1, idat.shape[1] - 1)))\n",
    "            theta = segregating / denom\n",
    "            df.loc[row] = theta\n",
    "        return df\n",
    "            \n",
    "        \n",
    "    def pi(self, pop1idx, pop2idx):\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 533,
   "metadata": {},
   "outputs": [],
   "source": [
    "popdict = {'A': [0, 1], 'B': [2, 3, 4, 5]}\n",
    "mindict = {'A': 1, 'B': 2}\n",
    "self = Self(testdata2, popdict, mindict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 535,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>theta</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A</th>\n",
       "      <td>0.387</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>B</th>\n",
       "      <td>0.581</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   theta\n",
       "A  0.387\n",
       "B  0.581"
      ]
     },
     "execution_count": 535,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "self.theta_W()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 537,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]\n",
      "\n",
      "Fst per SNP: [ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan\n",
      "  nan  nan 0.75  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan\n",
      "  nan  nan  nan  nan  nan  nan  nan  nan  nan 0.5   nan  nan  nan  nan\n",
      "  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan\n",
      "  nan  nan  nan  nan 0.5   nan  nan  nan  nan  nan  nan  nan  nan  nan\n",
      "  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan 0.75\n",
      "  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan 0.5\n",
      "  nan  nan]\n",
      "\n",
      "Fst mean: nan\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/deren/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:40: RuntimeWarning: invalid value encountered in true_divide\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "theta_W() takes 1 positional argument but 3 were given",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-537-e9d9e53cd653>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\nFst per SNP:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfst_H\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\nFst mean:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfst_H\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\ntheta per individual:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtheta_W\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\ntheta per population:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtheta_W\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: theta_W() takes 1 positional argument but 3 were given"
     ]
    }
   ],
   "source": [
    "print(self.data)\n",
    "print(\"\\nFst per SNP:\", self.fst_H([0, 1], [2, 3, 4, 5]))\n",
    "print(\"\\nFst mean:\", np.mean(self.fst_H([0, 1], [2, 3, 4, 5])))\n",
    "print(\"\\ntheta per individual:\", self.theta_W([0, 1], [2, 3, 4, 5]))\n",
    "print(\"\\ntheta per population:\", self.theta_W([0, 1], [2, 3, 4, 5]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 555,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>aaaaaa</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bbbbbbbbbbb</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               0    1\n",
       "aaaaaa       0.0  0.0\n",
       "bbbbbbbbbbb  0.0  0.0"
      ]
     },
     "execution_count": 555,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(\n",
    "    data=np.zeros((2, 2)),\n",
    "    index=['aaaaaa', 'bbbbbbbbbbb'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([0.2, 0. , 0. , 0.6, 0.6]), array([0.2, 0. , 0. , 0.6, 0.6]))"
      ]
     },
     "execution_count": 242,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "self.fst([0, 1], [2, 3, 4, 5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {},
   "outputs": [],
   "source": [
    "pop1idx = [0, 1]\n",
    "pop2idx = [2, 3, 4, 5]\n",
    "pop1 = self.data[pop1idx, :]\n",
    "pop2 = self.data[pop2idx, :]  \n",
    "popa = self.data[pop1idx + pop2idx, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ilist = itertools.combinations(range(4), 2)\n",
    "list(ilist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ilist = itertools.combinations(range(pop2.shape[0]), 2)\n",
    "list(ilist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "def diffs(pop):\n",
    "    ilist = itertools.combinations(range(pop.shape[0]), 2)\n",
    "    diff = [pop[i] == pop[j] for (i, j) in ilist]\n",
    "    sums = np.sum(diff, axis=0)\n",
    "    return sums/sums.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 231,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fst(pop1idx, pop2idx):\n",
    "    ii = list(itertools.combinations(pop1idx, 2))\n",
    "    jj = list(itertools.combinations(pop2idx, 2))\n",
    "    kk = itertools.combinations(pop1idx + pop2idx, 2)\n",
    "    between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)    \n",
    "\n",
    "    diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]\n",
    "    sums = np.sum(diff, axis=0)\n",
    "    a = sums / sums.shape[0]\n",
    "    \n",
    "    diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]\n",
    "    sums = np.sum(diff, axis=0)\n",
    "    b = sums / sums.shape[0]\n",
    "    \n",
    "    return abs(1 - (a / b))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [],
   "source": [
    "#fst([0, 1], [2, 3, 4, 5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 245,
   "metadata": {},
   "outputs": [],
   "source": [
    "ii = list(itertools.combinations(pop1idx, 2))\n",
    "jj = list(itertools.combinations(pop2idx, 2))\n",
    "within = ii + jj\n",
    "kk = itertools.combinations(pop1idx + pop2idx, 2)\n",
    "between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 246,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, 1), (2, 3), (2, 4), (2, 5), (3, 4), (3, 5), (4, 5)]"
      ]
     },
     "execution_count": 246,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ii + jj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<itertools.filterfalse at 0x7faa876c4eb8>"
      ]
     },
     "execution_count": 247,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "between"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 244,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, 0, 0],\n",
       "       [1, 0, 0, 0, 0],\n",
       "       [1, 1, 1, 0, 0],\n",
       "       [1, 1, 1, 1, 0],\n",
       "       [1, 1, 1, 1, 0],\n",
       "       [1, 1, 1, 1, 1]])"
      ]
     },
     "execution_count": 244,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "self.data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.2, 0. , 0. , 0.6, 0.6])"
      ]
     },
     "execution_count": 209,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]\n",
    "sums = np.sum(diff, axis=0)\n",
    "a = sums / sums.shape[0]\n",
    "\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 243,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.75, 1.  , 1.  , 0.5 , 0.5 ])"
      ]
     },
     "execution_count": 243,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "abs(1 - (a / b))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {},
   "outputs": [],
   "source": [
    "diff = [self.data[i] != self.data[j] for (i, j) in between]\n",
    "sums = np.sum(diff, axis=0)\n",
    "b = sums / sums.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.75,  1.  ,  1.  ,  0.5 , -0.5 ])"
      ]
     },
     "execution_count": 195,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "1 - (a / b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, 0, 0],\n",
       "       [1, 0, 0, 0, 0],\n",
       "       [1, 1, 1, 0, 0],\n",
       "       [1, 1, 1, 1, 0],\n",
       "       [1, 1, 1, 1, 0],\n",
       "       [1, 1, 1, 1, 1]])"
      ]
     },
     "execution_count": 196,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "self.data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, 2), (0, 3), (0, 4), (0, 5), (1, 2), (1, 3), (1, 4), (1, 5)]"
      ]
     },
     "execution_count": 171,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "between"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.6       , 0.85714286, 0.85714286, 0.5       , 0.3       ])"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diffs(pop2) / diffs(popa)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 0\n",
      "0 1\n",
      "1 0\n",
      "1 1\n"
     ]
    }
   ],
   "source": [
    "for i in range(pop1.shape[0]):\n",
    "    for j in range(pop1.shape[0]):\n",
    "        print(i,j)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _fstH(self, pop1idx, pop2idx):\n",
    "    \"Hudson's Fst estimator\"\n",
    "    pop1 = self.data[pop1idx, :]\n",
    "    pop2 = self.data[pop2idx, :]  \n",
    "    pop1freq = np.mean(pop1, axis=0)\n",
    "    pop2freq = np.mean(pop2, axis=0)\n",
    "    a = (pop1freq - pop2freq) ** 2\n",
    "    b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)\n",
    "    c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)\n",
    "    return pd.Series(a - b - c).round(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _fstWC(self, pop1idx, pop2idx):\n",
    "    \"Wier and Cockerham (1984) Fst estimator\"\n",
    "    pop1 = self.data[pop1idx, :]\n",
    "    pop2 = self.data[pop2idx, :]  \n",
    "    popa = self.data[pop1idx + pop2idx, :]\n",
    "    pop1freq = np.mean(pop1, axis=0)\n",
    "    pop2freq = np.mean(pop2, axis=0)\n",
    "    popafreq = np.mean(popa, axis=0)\n",
    "    nbar = pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.\n",
    "    s2a = (\n",
    "        (1. / ((popa.shape[0] - 1) * nbar))\n",
    "        * ((pop1.shape[0] * 1))\n",
    "    )\n",
    "    nc = (\n",
    "        (1. / (npops - 1.)) \n",
    "         * (popa.shape[0] \n",
    "            - ((pop1.shape[0] ** 2 + pop2.shape[0] ** 2) / popa.shape[0])\n",
    "           )\n",
    "    )\n",
    "    #t1 = \n",
    "    #t2 = \n",
    "    return t1, t2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fstWC(self, pop1idx, pop2idx):\n",
    "    pop1 = self.data[pop1idx, :]\n",
    "    pop2 = self.data[pop2idx, :]  \n",
    "    popa = self.data[pop1idx + pop2idx, :]\n",
    "    \n",
    "    pop1freq = np.mean(pop1, axis=0)\n",
    "    pop2freq = np.mean(pop2, axis=0)\n",
    "    popafreq = np.mean(popa, axis=0)\n",
    "    \n",
    "    # frequecy of allele A in the sample of size ni from population i\n",
    "    p = ...\n",
    "    # observed proportion of individuals heterozygous for allele A\n",
    "    hbar = \n",
    "    # the average sample size\n",
    "    nbar = \n",
    "    nc = "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2. , 1.5, 1.5, 1. , 0.5])"
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2.5"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pop1.shape[0] / 2. + pop2.shape[0] / 2.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\t  \n",
    "\n",
    "\t  npops= 2.0\n",
    "\t  nsamples = float(NpopA + NpopB)\n",
    "\t  n_bar= (NpopA / npops) + (NpopB / npops)\n",
    "\t  samplefreq = ( (popAcount+popBcount) / (NpopA + NpopB) )\n",
    "\t  pop1freq = popAcount / float(NpopA )\n",
    "\t  pop2freq = popBcount / float(NpopB )\n",
    "\t  Npop1 = NpopA\n",
    "\t  Npop2 = NpopB\n",
    "\t  S2A= (1/ ( (npops-1.0) * n_bar) ) * \n",
    "        ( ( (Npop1)* ((pop1freq-samplefreq)**2) ) + \n",
    "         ( (Npop2)*((pop2freq-samplefreq)**2) ) )\n",
    "\t  nc = 1.0/(npops-1.0) * ( (Npop1+Npop2) - (((Npop1**2)+(Npop2**2)) / (Npop1+Npop2)) )\n",
    "\t  T_1 = S2A -( ( 1/(n_bar-1) ) * ( (samplefreq * (1-samplefreq)) -  ((npops-1)/npops)* S2A ) )\n",
    "\t  T_2 = (( (nc-1) / (n_bar-1) ) * samplefreq *(1-samplefreq) )   +  (1.0 +   (((npops-1)*(n_bar-nc))  / (n_bar-1)))       * (S2A/npops)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.000\n",
       "1    1.000\n",
       "2    1.000\n",
       "3    0.333\n",
       "4   -0.000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pop1idx = [0, 1]\n",
    "pop2idx = [2, 3, 4]\n",
    "_fst(self, pop1idx, pop2idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.000\n",
       "1   -0.333\n",
       "2   -0.333\n",
       "3   -0.333\n",
       "4    0.000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pop1idx = [0, 4]\n",
    "pop2idx = [1, 2, 3]\n",
    "_fst(self, pop1idx, pop2idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0 0 0 0 0]\n",
      " [1 0 0 0 0]]\n",
      "[[1 1 1 0 0]\n",
      " [1 1 1 1 0]\n",
      " [1 1 1 1 1]]\n"
     ]
    }
   ],
   "source": [
    "pop1 = self.data[pop1idx, :]\n",
    "pop2 = self.data[pop2idx, :]\n",
    "popa = self.data[pop1idx + pop2idx, :]\n",
    "\n",
    "print(pop1)\n",
    "print(pop2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "pop1freq = np.mean(pop1, axis=0)\n",
    "pop2freq = np.mean(pop2, axis=0)\n",
    "popafreq = np.mean(popa, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.25      , 1.        , 1.        , 0.44444444, 0.11111111])"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = (pop1freq - pop2freq) ** 2\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.25, 0.  , 0.  , 0.  , 0.  ])"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)\n",
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.        , 0.        , 0.        , 0.11111111, 0.11111111])"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)\n",
    "c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.000\n",
       "1    1.000\n",
       "2    1.000\n",
       "3    0.333\n",
       "4   -0.000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(a - b - c).round(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.000\n",
       "1    1.000\n",
       "2    1.000\n",
       "3    0.333\n",
       "4   -0.000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(a - b - c).round(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0., 0., 0., 0., 0.])"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d = (pop1freq * (1.0 - pop2freq)) + (pop1freq * (1.0 - pop2freq))\n",
    "d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-0.25      , -0.66666667, -0.66666667, -0.66666667, -0.66666667])"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(np.mean(popa, axis=0) - np.mean(pop2, axis=0)) / (np.mean(popa, axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, 0, 0],\n",
       "       [1, 0, 0, 0, 0],\n",
       "       [1, 1, 1, 0, 0],\n",
       "       [1, 1, 1, 1, 0],\n",
       "       [1, 1, 1, 1, 1]])"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "popa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3, 3, 3, 2, 1])"
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.count_nonzero(pop2, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.2, 0.4, 0.4, 0.4, 0.2])"
      ]
     },
     "execution_count": 169,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = np.mean(popa, axis=0)\n",
    "mask = a > 0.5\n",
    "a[a > 0.5] = abs(a[a > 0.5] - 1)\n",
    "a\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.        , 0.        , 0.        , 0.33333333, 0.33333333])"
      ]
     },
     "execution_count": 172,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b = np.mean(pop2, axis=0)\n",
    "mask = b > 0.5\n",
    "b[b > 0.5] = abs(b[b > 0.5] - 1)\n",
    "b\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1.        ,  1.        ,  1.        ,  0.16666667, -0.66666667])"
      ]
     },
     "execution_count": 171,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(a - b) / a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}