{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "Click **Run All** to execute this Jupyter notebook. If you run into any errors, please email btsui@eng.ucsd.edu."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### download example data\n",
    "\n",
    "\n",
    "Change syn15659419 to syn15624400 if you want to download the entire variant dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "# Install the Synapse command-line client and bring pandas up to date.\n",
    "pip install synapseclient\n",
    "pip install pandas --upgrade\n",
    "# Only one file is downloaded for now.\n",
    "# -p keeps this cell re-runnable when tmp_data/ already exists.\n",
    "mkdir -p tmp_data/\n",
    "# Stop if the directory cannot be entered, so the download never lands somewhere else.\n",
    "cd ./tmp_data/ || exit 1\n",
    "# A dummy account was created so that anyone can download without registering; please don't do anything crazy with the account.\n",
    "synapse -u synapse.skymap.download -p QtL-E2g-hzz-N4k get syn15659419\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# input \n",
    "1. `base_mergedBySRR_dir`: set this to the directory holding your local copy of mergedBySRR\n",
    "2. `query_SRR`: the Sequence Read Archive (SRA) run ID to look up, e.g. ERR126304\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder containing the mergedBySRR pickle files; point it at a full local copy\n",
    "# such as '~/Data/merged/snp/hg38/mergedBySRR/' when you have one.\n",
    "base_mergedBySRR_dir = './tmp_data/'\n",
    "# Sequence Read Archive run accession to look up.\n",
    "query_SRR = 'ERR126304'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# slicing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 25.5 s, sys: 3.14 s, total: 28.7 s\n",
      "Wall time: 28.7 s\n",
      "CPU times: user 32.6 s, sys: 30 s, total: 1min 2s\n",
      "Wall time: 36.8 s\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import re \n",
    "# Split the accession (e.g. 'ERR126304') into its letter prefix and its numeric part.\n",
    "query_Run_digits=int(re.search(r\"\\d+\", query_SRR).group(0))\n",
    "query_Run_db=re.search(r\"\\wRR+\", query_SRR).group(0)\n",
    "#currently fixed, just use this\n",
    "chunkSize=int(10**5)\n",
    "# NOTE: unpickling can execute arbitrary code; only load files from a source you trust.\n",
    "%time tmpDf=pd.read_pickle('{}/{}.pickle.gz'.format(base_mergedBySRR_dir,chunkSize))\n",
    "# Use a tuple: pandas reads a tuple as ONE multi-level key, whereas a list is read as\n",
    "# several first-level labels and would return every run sharing the prefix.\n",
    "# drop_level=False keeps all index levels in the result.\n",
    "%time hitDf=tmpDf.xs((query_Run_db,query_Run_digits),level=[0,1],drop_level=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# output \n",
    "Voilà, in about a minute you have the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>features</th>\n",
       "      <th>ReadDepth</th>\n",
       "      <th>AverageBaseQuality</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Run_db</th>\n",
       "      <th>Run_digits</th>\n",
       "      <th>Chr</th>\n",
       "      <th>Pos</th>\n",
       "      <th>base</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"30\" valign=\"top\">ERR</th>\n",
       "      <th rowspan=\"30\" valign=\"top\">187270</th>\n",
       "      <th rowspan=\"30\" valign=\"top\">1</th>\n",
       "      <th rowspan=\"2\" valign=\"top\">14727</th>\n",
       "      <th>A</th>\n",
       "      <td>2</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>G</th>\n",
       "      <td>8</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>630825</th>\n",
       "      <th>T</th>\n",
       "      <td>70</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">630833</th>\n",
       "      <th>C</th>\n",
       "      <td>75</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>T</th>\n",
       "      <td>1</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>833068</th>\n",
       "      <th>G</th>\n",
       "      <td>1</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>842133</th>\n",
       "      <th>G</th>\n",
       "      <td>5</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>843942</th>\n",
       "      <th>G</th>\n",
       "      <td>1</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>850609</th>\n",
       "      <th>T</th>\n",
       "      <td>4</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>948136</th>\n",
       "      <th>G</th>\n",
       "      <td>3</td>\n",
       "      <td>34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>955964</th>\n",
       "      <th>G</th>\n",
       "      <td>1</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>970788</th>\n",
       "      <th>G</th>\n",
       "      <td>2</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1013541</th>\n",
       "      <th>C</th>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1014143</th>\n",
       "      <th>C</th>\n",
       "      <td>2</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">1014228</th>\n",
       "      <th>A</th>\n",
       "      <td>2</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>G</th>\n",
       "      <td>1</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1014316</th>\n",
       "      <th>C</th>\n",
       "      <td>2</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1014359</th>\n",
       "      <th>G</th>\n",
       "      <td>4</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1020239</th>\n",
       "      <th>G</th>\n",
       "      <td>1</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1022188</th>\n",
       "      <th>A</th>\n",
       "      <td>2</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1022225</th>\n",
       "      <th>G</th>\n",
       "      <td>2</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1022260</th>\n",
       "      <th>C</th>\n",
       "      <td>3</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1022313</th>\n",
       "      <th>A</th>\n",
       "      <td>1</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1042136</th>\n",
       "      <th>T</th>\n",
       "      <td>2</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">1042190</th>\n",
       "      <th>A</th>\n",
       "      <td>1</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>G</th>\n",
       "      <td>2</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1043223</th>\n",
       "      <th>C</th>\n",
       "      <td>3</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1043248</th>\n",
       "      <th>C</th>\n",
       "      <td>3</td>\n",
       "      <td>34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1043288</th>\n",
       "      <th>G</th>\n",
       "      <td>3</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1043382</th>\n",
       "      <th>G</th>\n",
       "      <td>1</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "features                            ReadDepth  AverageBaseQuality\n",
       "Run_db Run_digits Chr Pos     base                               \n",
       "ERR    187270     1   14727   A             2                  31\n",
       "                              G             8                  37\n",
       "                      630825  T            70                  36\n",
       "                      630833  C            75                  35\n",
       "                              T             1                  37\n",
       "                      833068  G             1                  32\n",
       "                      842133  G             5                  37\n",
       "                      843942  G             1                  40\n",
       "                      850609  T             4                  38\n",
       "                      948136  G             3                  34\n",
       "                      955964  G             1                  38\n",
       "                      970788  G             2                  37\n",
       "                      1013541 C             1                  17\n",
       "                      1014143 C             2                  24\n",
       "                      1014228 A             2                  37\n",
       "                              G             1                  37\n",
       "                      1014316 C             2                  37\n",
       "                      1014359 G             4                  38\n",
       "                      1020239 G             1                  36\n",
       "                      1022188 A             2                  35\n",
       "                      1022225 G             2                  30\n",
       "                      1022260 C             3                  39\n",
       "                      1022313 A             1                  38\n",
       "                      1042136 T             2                  36\n",
       "                      1042190 A             1                  36\n",
       "                              G             2                  36\n",
       "                      1043223 C             3                  35\n",
       "                      1043248 C             3                  34\n",
       "                      1043288 G             3                  37\n",
       "                      1043382 G             1                  39"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first 30 rows of the sliced result.\n",
    "hitDf.head(30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Diagnosis\n",
    "\n",
    "If you run into memory issues, I highly recommend trying out [AWS](https://aws.amazon.com/) machines. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "memory_usageS=tmpDf.memory_usage()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.791737056 GB of RAM was for storing the pickle\n"
     ]
    }
   ],
   "source": [
    "# Report the summed byte count in gigabytes (10**9 bytes).\n",
    "print('{} GB of RAM was for storing the pickle'.format(\n",
    "    memory_usageS.sum() / (10**9)\n",
    "))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}