{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Purpose of this script:\n", "\n", "\n", "dbSNP use the reference strand to annnotate. \n", "\n", "\n", "Map genomic corrdinates with known human variants to other species (Like mouse)\n", "\n", "Input: Variant table" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "import gzip\n", "import re\n", "from tqdm import tqdm\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] } ], "source": [ "pattern='VP=0x\\d{4}(\\d{2})'\n", "prog = re.compile(pattern)\n", "inDbDir='/data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz'\n", "outDbDir=inDbDir.replace('.vcf.gz','.f1_byte2_not_00.vcf.gz')\n", "humanDf=pd.read_csv(outDbDir,sep='\\t',header=None)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "#!gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz |head -n 20" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "humanDf.columns=['Chr','Loc','rs','REF','ALT','','','Annot']" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "## pos strand" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "#humanDf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### identify regions in human to LiftOVer ( mapping homologous) to mouse" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hg38ToDanRer10.over.chain.gz\r\n", "hg38ToDanRer11.over.chain.gz\r\n" ] } ], "source": [ "!ls /cellar/users/btsui/Data/ucsc_chains/ | grep Dan" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "inputDir='/cellar/users/btsui/Data/ucsc_chains/'\n", "fileNameInFolderS=pd.Series(os.listdir(inputDir))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "143 hg38ToMm10.over.chain.gz\n", "dtype: object" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fileNameInFolderS[fileNameInFolderS.str.contains('mm10',case=False)]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "chainFiles=fileNameInFolderS[fileNameInFolderS.str.contains('.chain.gz$')]\n", "chainFiles=pd.concat([chainFiles[chainFiles.str.contains('mm10|DanRer',case=False)],\n", " chainFiles[~chainFiles.str.contains('mm10|DanRer',case=False)]])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "intervalLength=1\n", "refOffset=1\n", "humanDf['Chr_uscs']='chr'+humanDf['Chr'].astype(str)\n", "\n", "humanDf['Loc_+1']=humanDf['Loc']+intervalLength-refOffset\n", "\n", "humanDf['Loc_-1']=humanDf['Loc']-refOffset\n", "humanDf['Chr_uscs']=humanDf['Chr_uscs'].str.replace('chrMT','chrM')\n", "humanDf['Score']='.'\n", "humanDf['Strand']='+'\n", "humanDf[['Chr_uscs','Loc_-1','Loc_+1','rs','Score','Strand']\n", " ].to_csv('tmp.human.ucsc.bed',sep='\\t',header=None,index=None)\n", "#humanDf[['Chr','Loc_-1','Loc_+1','rs']].to_csv('tmp.human.ensembl.bed',sep='\\t',header=None,index=None)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "#!head tmp.human.ucsc.bed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### make chains" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 1%| | 1/157 [00:01<03:20, 1.29s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 1%|▏ | 2/157 [00:03<04:30, 1.74s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 2%|▏ | 3/157 [08:08<6:58:07, 162.91s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 3%|▎ | 4/157 [08:10<5:12:26, 122.52s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 3%|▎ | 5/157 [08:19<4:13:16, 99.98s/it] " ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 4%|▍ | 6/157 [08:21<3:30:23, 83.60s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 4%|▍ | 7/157 [08:34<3:03:46, 73.51s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 5%|▌ | 8/157 [09:09<2:50:31, 68.67s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 6%|▌ | 9/157 [16:18<4:28:06, 108.69s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 6%|▋ | 10/157 [16:30<4:02:46, 99.09s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 7%|▋ | 11/157 [29:17<6:28:48, 159.78s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 8%|▊ | 12/157 [46:33<9:22:34, 232.79s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] } ], "source": [ "for chainFile in tqdm(chainFiles):\n", " outDir='/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/{}'.format(chainFile+'.human_homo.bed') \n", " unmappedOutDir='/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/{}'.format(chainFile+'.human_homo.unmapped.bed') \n", " liftOverCmd=\"liftOver {} {} {} {}\".format('tmp.human.ucsc.bed',\n", " inputDir+chainFile,\n", " outDir,unmappedOutDir)\n", " print (os.system(liftOverCmd))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#!mkdir /cellar/users/btsui/Data/ucsc_chains_human_homo_snps/" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "asdasd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#os.system(liftOverCmd)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Scratch" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### check SNP strand infos" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "humanDf[['Chr','Loc_-1','Loc','rs']].to_csv('tmp.human.ensembl.bed',sep='\\t',header=None,index=None)\n", "\n", "!bedtools getfasta -fi /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa -bed tmp.human.ensembl.bed -fo tmp.human.ucsc.bed.fasta" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "tmpS=pd.read_csv('tmp.human.ucsc.bed.fasta',header=None)[0]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "extractedDf=pd.DataFrame({'Loc':tmpS[(tmpS.index%2)==0].values,\n", " 'Nucleotide':tmpS[(tmpS.index%2)==1].values})" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": true }, "outputs": [], "source": [ "extractedDf['Chr']=extractedDf.Loc.str.extract('(\\w+):')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "extractedDf['Site']=extractedDf.Loc.str.extract('\\w+:\\d+-(\\d+)')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "scrolled": true }, "outputs": [], "source": [ "extractedDf['Chr-Site']=extractedDf['Chr']+'-'+extractedDf['Site'].astype(str)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "scrolled": true }, "outputs": [], "source": [ "chrToPosN=extractedDf.set_index(['Chr-Site'])['Nucleotide']" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "chrToPosN=chrToPosN.groupby(chrToPosN.index).first()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Chr | \n", "Loc | \n", "rs | \n", "REF | \n", "ALT | \n", "\n", " | \n", " | Annot | \n", "Chr_uscs | \n", "Loc_+1 | \n", "Loc_-1 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "14727 | \n", "rs1045587 | \n", "G | \n", "A | \n", ". | \n", ". | \n", "RS=1045587;RSPOS=14727;RV;dbSNPBuildID=117;SSR... | \n", "chr1 | \n", "14727 | \n", "14726 | \n", "
1 | \n", "1 | \n", "630825 | \n", "rs9783068 | \n", "T | \n", "C | \n", ". | \n", ". | \n", "RS=9783068;RSPOS=630825;dbSNPBuildID=119;SSR=1... | \n", "chr1 | \n", "630825 | \n", "630824 | \n", "
2 | \n", "1 | \n", "630833 | \n", "rs9701099 | \n", "C | \n", "T | \n", ". | \n", ". | \n", "RS=9701099;RSPOS=630833;dbSNPBuildID=119;SSR=1... | \n", "chr1 | \n", "630833 | \n", "630832 | \n", "
3 | \n", "1 | \n", "817186 | \n", "rs3094315 | \n", "G | \n", "A | \n", ". | \n", ". | \n", "RS=3094315;RSPOS=817186;RV;dbSNPBuildID=103;SS... | \n", "chr1 | \n", "817186 | \n", "817185 | \n", "
4 | \n", "1 | \n", "833068 | \n", "rs12562034 | \n", "G | \n", "A | \n", ". | \n", ". | \n", "RS=12562034;RSPOS=833068;dbSNPBuildID=120;SSR=... | \n", "chr1 | \n", "833068 | \n", "833067 | \n", "