{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "Click run all for this jupyter notebook, please send an email to btsui@eng.ucsd.edu if there is any error. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### download example data\n", "\n", "\n", "Change syn15659419 to syn15624400 if you want to download the entire vairant dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "pip install synapseclient \n", "pip install pandas --upgrade\n", "####only download one file for now\n", "#\n", "mkdir tmp_data/\n", "cd ./tmp_data/\n", "#created a dummy accounts so that any one can download without registering, please don't do anything crazy with the account\n", "synapse -u synapse.skymap.download -p QtL-E2g-hzz-N4k get syn15659419 \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# input \n", "1. configure base_mergedBySRR_dir to your local copy of mergedBySRR\n", "2. query_SRR, Sequence read archive SRR ID\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "base_mergedBySRR_dir='./tmp_data/'#'~/Data/merged/snp/hg38/mergedBySRR/'\n", "query_SRR='ERR126304'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# slicing" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 25.5 s, sys: 3.14 s, total: 28.7 s\n", "Wall time: 28.7 s\n", "CPU times: user 32.6 s, sys: 30 s, total: 1min 2s\n", "Wall time: 36.8 s\n" ] } ], "source": [ "import pandas as pd\n", "import re \n", "query_Run_digits=int(re.search(r\"\\d+\", query_SRR).group(0))\n", "query_Run_db=re.search(r\"\\wRR+\", query_SRR).group(0)\n", "#currently fixed, just use this\n", "chunkSize=int(10**5)\n", "%time tmpDf=pd.read_pickle('{}/{}.pickle.gz'.format(base_mergedBySRR_dir,chunkSize))\n", "%time hitDf=tmpDf.loc[[query_Run_db,query_Run_digits]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# output \n", "viola, in a minute, you got the data" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
featuresReadDepthAverageBaseQuality
Run_dbRun_digitsChrPosbase
ERR187270114727A231
G837
630825T7036
630833C7535
T137
833068G132
842133G537
843942G140
850609T438
948136G334
955964G138
970788G237
1013541C117
1014143C224
1014228A237
G137
1014316C237
1014359G438
1020239G136
1022188A235
1022225G230
1022260C339
1022313A138
1042136T236
1042190A136
G236
1043223C335
1043248C334
1043288G337
1043382G139
\n", "
" ], "text/plain": [ "features ReadDepth AverageBaseQuality\n", "Run_db Run_digits Chr Pos base \n", "ERR 187270 1 14727 A 2 31\n", " G 8 37\n", " 630825 T 70 36\n", " 630833 C 75 35\n", " T 1 37\n", " 833068 G 1 32\n", " 842133 G 5 37\n", " 843942 G 1 40\n", " 850609 T 4 38\n", " 948136 G 3 34\n", " 955964 G 1 38\n", " 970788 G 2 37\n", " 1013541 C 1 17\n", " 1014143 C 2 24\n", " 1014228 A 2 37\n", " G 1 37\n", " 1014316 C 2 37\n", " 1014359 G 4 38\n", " 1020239 G 1 36\n", " 1022188 A 2 35\n", " 1022225 G 2 30\n", " 1022260 C 3 39\n", " 1022313 A 1 38\n", " 1042136 T 2 36\n", " 1042190 A 1 36\n", " G 2 36\n", " 1043223 C 3 35\n", " 1043248 C 3 34\n", " 1043288 G 3 37\n", " 1043382 G 1 39" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hitDf.head(n=30)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Diagnosis\n", "\n", "if you run into memory issue, I highly recommend trying out [AWS](https://aws.amazon.com/) machines. " ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "memory_usageS=tmpDf.memory_usage()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4.791737056 GB of RAM was for storing the pickle\n" ] } ], "source": [ "print ('{} GB of RAM was for storing the pickle'.format(memory_usageS.sum()/(10**9)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }