{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "Click run all for this jupyter notebook, please send an email to btsui@eng.ucsd.edu if there is any error. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### download example data\n", "\n", "\n", "Change syn15659419 to syn15624400 if you want to download the entire variant dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "pip install synapseclient \n", "pip install pandas --upgrade\n", "####only download one file for now\n", "#\n", "# -p keeps this cell re-runnable when tmp_data/ already exists\n", "mkdir -p tmp_data/\n", "cd ./tmp_data/\n", "#created a dummy accounts so that any one can download without registering, please don't do anything crazy with the account\n", "synapse -u synapse.skymap.download -p QtL-E2g-hzz-N4k get syn15659419 \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# input \n", "1. configure base_mergedBySRR_dir to your local copy of mergedBySRR\n", "2. query_SRR, Sequence read archive SRR ID\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "base_mergedBySRR_dir='./tmp_data/'#'~/Data/merged/snp/hg38/mergedBySRR/'\n", "query_SRR='ERR126304'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# slicing" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 25.5 s, sys: 3.14 s, total: 28.7 s\n", "Wall time: 28.7 s\n", "CPU times: user 32.6 s, sys: 30 s, total: 1min 2s\n", "Wall time: 36.8 s\n" ] } ], "source": [ "import pandas as pd\n", "import re \n", "# split the accession into its numeric part and its archive prefix (e.g. 'ERR126304' -> 126304, 'ERR')\n", "query_Run_digits=int(re.search(r\"\\d+\", query_SRR).group(0))\n", "query_Run_db=re.search(r\"\\wRR+\", query_SRR).group(0)\n", "#currently fixed, just use this\n", "chunkSize=int(10**5)\n", "%time tmpDf=pd.read_pickle('{}/{}.pickle.gz'.format(base_mergedBySRR_dir,chunkSize))\n", "# select one run by its (Run_db, Run_digits) key on the first two index levels;\n", "# a flat list [db, digits] would be read as two separate level-0 labels instead.\n", "# drop_level=False keeps the full row index on the result.\n", "%time hitDf=tmpDf.xs((query_Run_db,query_Run_digits),level=[0,1],drop_level=False)" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "# output \n", "viola, in a minute, you got the data" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | \n", " | \n", " | \n", " | features | \n", "ReadDepth | \n", "AverageBaseQuality | \n", "
---|---|---|---|---|---|---|
Run_db | \n", "Run_digits | \n", "Chr | \n", "Pos | \n", "base | \n", "\n", " | \n", " |
ERR | \n", "187270 | \n", "1 | \n", "14727 | \n", "A | \n", "2 | \n", "31 | \n", "
G | \n", "8 | \n", "37 | \n", "||||
630825 | \n", "T | \n", "70 | \n", "36 | \n", "|||
630833 | \n", "C | \n", "75 | \n", "35 | \n", "|||
T | \n", "1 | \n", "37 | \n", "||||
833068 | \n", "G | \n", "1 | \n", "32 | \n", "|||
842133 | \n", "G | \n", "5 | \n", "37 | \n", "|||
843942 | \n", "G | \n", "1 | \n", "40 | \n", "|||
850609 | \n", "T | \n", "4 | \n", "38 | \n", "|||
948136 | \n", "G | \n", "3 | \n", "34 | \n", "|||
955964 | \n", "G | \n", "1 | \n", "38 | \n", "|||
970788 | \n", "G | \n", "2 | \n", "37 | \n", "|||
1013541 | \n", "C | \n", "1 | \n", "17 | \n", "|||
1014143 | \n", "C | \n", "2 | \n", "24 | \n", "|||
1014228 | \n", "A | \n", "2 | \n", "37 | \n", "|||
G | \n", "1 | \n", "37 | \n", "||||
1014316 | \n", "C | \n", "2 | \n", "37 | \n", "|||
1014359 | \n", "G | \n", "4 | \n", "38 | \n", "|||
1020239 | \n", "G | \n", "1 | \n", "36 | \n", "|||
1022188 | \n", "A | \n", "2 | \n", "35 | \n", "|||
1022225 | \n", "G | \n", "2 | \n", "30 | \n", "|||
1022260 | \n", "C | \n", "3 | \n", "39 | \n", "|||
1022313 | \n", "A | \n", "1 | \n", "38 | \n", "|||
1042136 | \n", "T | \n", "2 | \n", "36 | \n", "|||
1042190 | \n", "A | \n", "1 | \n", "36 | \n", "|||
G | \n", "2 | \n", "36 | \n", "||||
1043223 | \n", "C | \n", "3 | \n", "35 | \n", "|||
1043248 | \n", "C | \n", "3 | \n", "34 | \n", "|||
1043288 | \n", "G | \n", "3 | \n", "37 | \n", "|||
1043382 | \n", "G | \n", "1 | \n", "39 | \n", "