{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Purpose of this script:\n", "\n", "\n", "dbSNP use the reference strand to annnotate. \n", "\n", "\n", "Map genomic corrdinates with known human variants to other species (Like mouse)\n", "\n", "Input: Variant table" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "import gzip\n", "import re\n", "from tqdm import tqdm\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] } ], "source": [ "pattern='VP=0x\\d{4}(\\d{2})'\n", "prog = re.compile(pattern)\n", "inDbDir='/data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz'\n", "outDbDir=inDbDir.replace('.vcf.gz','.f1_byte2_not_00.vcf.gz')\n", "humanDf=pd.read_csv(outDbDir,sep='\\t',header=None)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "#!gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz |head -n 20" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "humanDf.columns=['Chr','Loc','rs','REF','ALT','','','Annot']" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "## pos strand" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "#humanDf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### identify regions in human to LiftOVer ( mapping homologous) to mouse" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hg38ToDanRer10.over.chain.gz\r\n", "hg38ToDanRer11.over.chain.gz\r\n" ] } ], "source": [ "!ls /cellar/users/btsui/Data/ucsc_chains/ | grep Dan" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "inputDir='/cellar/users/btsui/Data/ucsc_chains/'\n", "fileNameInFolderS=pd.Series(os.listdir(inputDir))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "143 hg38ToMm10.over.chain.gz\n", "dtype: object" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fileNameInFolderS[fileNameInFolderS.str.contains('mm10',case=False)]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "chainFiles=fileNameInFolderS[fileNameInFolderS.str.contains('.chain.gz$')]\n", "chainFiles=pd.concat([chainFiles[chainFiles.str.contains('mm10|DanRer',case=False)],\n", " chainFiles[~chainFiles.str.contains('mm10|DanRer',case=False)]])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "intervalLength=1\n", "refOffset=1\n", "humanDf['Chr_uscs']='chr'+humanDf['Chr'].astype(str)\n", "\n", "humanDf['Loc_+1']=humanDf['Loc']+intervalLength-refOffset\n", "\n", "humanDf['Loc_-1']=humanDf['Loc']-refOffset\n", "humanDf['Chr_uscs']=humanDf['Chr_uscs'].str.replace('chrMT','chrM')\n", "humanDf['Score']='.'\n", "humanDf['Strand']='+'\n", "humanDf[['Chr_uscs','Loc_-1','Loc_+1','rs','Score','Strand']\n", " ].to_csv('tmp.human.ucsc.bed',sep='\\t',header=None,index=None)\n", "#humanDf[['Chr','Loc_-1','Loc_+1','rs']].to_csv('tmp.human.ensembl.bed',sep='\\t',header=None,index=None)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "#!head tmp.human.ucsc.bed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### make chains" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 1%| | 1/157 [00:01<03:20, 1.29s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 1%|▏ | 2/157 [00:03<04:30, 1.74s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 2%|▏ | 3/157 [08:08<6:58:07, 162.91s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 3%|▎ | 4/157 [08:10<5:12:26, 122.52s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 3%|▎ | 5/157 [08:19<4:13:16, 99.98s/it] " ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 4%|▍ | 6/157 [08:21<3:30:23, 83.60s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 4%|▍ | 7/157 [08:34<3:03:46, 73.51s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 5%|▌ | 8/157 [09:09<2:50:31, 68.67s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 6%|▌ | 9/157 [16:18<4:28:06, 108.69s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 6%|▋ | 10/157 [16:30<4:02:46, 99.09s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 7%|▋ | 11/157 [29:17<6:28:48, 159.78s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", " 8%|▊ | 12/157 [46:33<9:22:34, 232.79s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] } ], "source": [ "for chainFile in tqdm(chainFiles):\n", " outDir='/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/{}'.format(chainFile+'.human_homo.bed') \n", " unmappedOutDir='/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/{}'.format(chainFile+'.human_homo.unmapped.bed') \n", " liftOverCmd=\"liftOver {} {} {} {}\".format('tmp.human.ucsc.bed',\n", " inputDir+chainFile,\n", " outDir,unmappedOutDir)\n", " print (os.system(liftOverCmd))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#!mkdir /cellar/users/btsui/Data/ucsc_chains_human_homo_snps/" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "asdasd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#os.system(liftOverCmd)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Scratch" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### check SNP strand infos" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "humanDf[['Chr','Loc_-1','Loc','rs']].to_csv('tmp.human.ensembl.bed',sep='\\t',header=None,index=None)\n", "\n", "!bedtools getfasta -fi /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa -bed tmp.human.ensembl.bed -fo tmp.human.ucsc.bed.fasta" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "tmpS=pd.read_csv('tmp.human.ucsc.bed.fasta',header=None)[0]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "extractedDf=pd.DataFrame({'Loc':tmpS[(tmpS.index%2)==0].values,\n", " 'Nucleotide':tmpS[(tmpS.index%2)==1].values})" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": true }, "outputs": [], "source": [ "extractedDf['Chr']=extractedDf.Loc.str.extract('(\\w+):')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "extractedDf['Site']=extractedDf.Loc.str.extract('\\w+:\\d+-(\\d+)')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "scrolled": true }, "outputs": [], "source": [ "extractedDf['Chr-Site']=extractedDf['Chr']+'-'+extractedDf['Site'].astype(str)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "scrolled": true }, "outputs": [], "source": [ "chrToPosN=extractedDf.set_index(['Chr-Site'])['Nucleotide']" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "chrToPosN=chrToPosN.groupby(chrToPosN.index).first()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ChrLocrsREFALTAnnotChr_uscsLoc_+1Loc_-1
0114727rs1045587GA..RS=1045587;RSPOS=14727;RV;dbSNPBuildID=117;SSR...chr11472714726
11630825rs9783068TC..RS=9783068;RSPOS=630825;dbSNPBuildID=119;SSR=1...chr1630825630824
21630833rs9701099CT..RS=9701099;RSPOS=630833;dbSNPBuildID=119;SSR=1...chr1630833630832
31817186rs3094315GA..RS=3094315;RSPOS=817186;RV;dbSNPBuildID=103;SS...chr1817186817185
41833068rs12562034GA..RS=12562034;RSPOS=833068;dbSNPBuildID=120;SSR=...chr1833068833067
\n", "
" ], "text/plain": [ " Chr Loc rs REF ALT \\\n", "0 1 14727 rs1045587 G A . . \n", "1 1 630825 rs9783068 T C . . \n", "2 1 630833 rs9701099 C T . . \n", "3 1 817186 rs3094315 G A . . \n", "4 1 833068 rs12562034 G A . . \n", "\n", " Annot Chr_uscs Loc_+1 Loc_-1 \n", "0 RS=1045587;RSPOS=14727;RV;dbSNPBuildID=117;SSR... chr1 14727 14726 \n", "1 RS=9783068;RSPOS=630825;dbSNPBuildID=119;SSR=1... chr1 630825 630824 \n", "2 RS=9701099;RSPOS=630833;dbSNPBuildID=119;SSR=1... chr1 630833 630832 \n", "3 RS=3094315;RSPOS=817186;RV;dbSNPBuildID=103;SS... chr1 817186 817185 \n", "4 RS=12562034;RSPOS=833068;dbSNPBuildID=120;SSR=... chr1 833068 833067 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "humanDf.head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "humanDf['Chr-Site']=humanDf['Chr'].astype(str)+'-'+humanDf['Loc'].astype(str)#chrToPosN[extractedDf['Chr-Site']].values" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "#posStrandDf=humanDf[~humanDf.Annot.str.contains(';RV;')]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 22, "metadata": { "scrolled": true }, "outputs": [], "source": [ "humanDf['Pos_N']=chrToPosN.loc[humanDf['Chr-Site']].values" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pos_N REF \n", "G GCCTCCAGCCCCAGCT 1\n", " GTGTAAACTCAGAAA 1\n", " GTGTACCCTT 1\n", " GTGTCAT 1\n", " GTGTGCCA 1\n", " GTGTGGGGCGGAGACTT 1\n", " GTGTGT 1\n", " GTGTGTCAGC 1\n", " GTGTTCCAAGGGGACAGTAGCCCC 1\n", " GTTA 1\n", " GTTAAA 1\n", " GTTAAATCCA 1\n", " GTTAAGGAA 1\n", " GTTACAAT 1\n", " GTTACAATTTACTGGCA 1\n", " GTTACAGGAACT 1\n", " GTTACC 1\n", " GTTAGATCC 1\n", " GTTCTCCTCATTGAAAAAGA 1\n", " GTTCTC 1\n", " GTTCTACAACAAGAGCGAGGAT 1\n", " GTTCCT 1\n", " GTTCCATA 1\n", " GTTCAGCT 1\n", " GTGTA 1\n", " GTTCACC 1\n", " GTTATTTTATT 1\n", " GTTATTT 1\n", " GTTATT 1\n", " GTTATC 1\n", " ... \n", "T TTC 103\n", " TCA 112\n", "A AAG 118\n", "C CAA 131\n", " CCT 154\n", " CAT 160\n", "A ACT 170\n", "C CTG 258\n", " CTT 263\n", " CAG 308\n", " CG 431\n", "G GT 487\n", "T TA 531\n", "A AT 637\n", "G GC 688\n", " GA 730\n", "A AC 817\n", "C CA 822\n", "T TC 833\n", " TG 883\n", "A AG 910\n", "C CT 941\n", "N A 14995\n", " T 15146\n", " G 18302\n", " C 18431\n", "T T 57432\n", "A A 57962\n", "G G 94487\n", "C C 94699\n", "Length: 5083, dtype: int64" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "humanDf.groupby(['Pos_N','REF']).size().sort_values()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#tmpS.str.extract('(\\d+):\\d+-(\\d+)',expand=True)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "#posStrandDf" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tmpS" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### map the human variants to mouse" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#!head tmp.mouse.bed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## remove 1000 when not testing\n", "liftOverBedDf=pd.read_csv('tmp.mouse.bed',sep='\\t',nrows=100000,header=None) \n", "\n", "liftOverBedDf.columns=['Chr_uscs','Start','End','rs']\n", "\n", "###slice out the region for mouse with name\n", "\n", "liftOverBedDf['Chr_ensmbl']=liftOverBedDf['Chr_uscs'].str.replace('^chr','')\n", "liftOverBedDf['Start']=liftOverBedDf['Start']\n", "liftOverBedDf['End']=liftOverBedDf['End']\n", "liftOverBedDf[['Chr_ensmbl','Start','End','rs']].to_csv('tmp.specie.ensembl.bed',sep='\\t',header=None,index=None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "specieFastaDir='/cellar/users/btsui/Data/ensembl/release/fasta/Mus_musculus.GRCm38.dna_rm.toplevel.fa'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bedInterCmd=\"bedtools getfasta -name -fi {} -bed tmp.specie.ensembl.bed > liftOver.fasta\".format(specieFastaDir)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#bedInterCmdHuman=\"bedtools getfasta -name -fi {} -bed tmp.human.ucsc.bed > liftOver.human.fasta\".format(humanFastaDir)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#!head liftOver.human.fasta" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#!bedtools getfasta -name -fi /cellar/users/btsui/Data/ensembl/release/fasta/Homo_sapiens.GRCh38.dna_rm.toplevel.fa -bed tmp.human.ucsc.bed > liftOver.human.fasta" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "os.system(bedInterCmdHuman)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!bedtools getfasta -name -fi /cellar/users/btsui/Data/ensembl/release/fasta/Mus_musculus.GRCm38.dna_rm.toplevel.fa -bed tmp.specie.ensembl.bed > liftOver.fasta" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "os.system(bedInterCmd)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#pd.read_csv('liftOver.human.fasta')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "myFastaS=pd.read_csv( 'liftOver.fasta',header=None)[0]\n", "\n", "rsToSeqDf=pd.DataFrame({'rs':myFastaS[(myFastaS.index%2)==0].str.replace('>','').values,\n", " 'Seq':myFastaS[(myFastaS.index%2)==1].values})\n", "\n", "rsToSeqSpecie=rsToSeqDf.set_index('rs')['Seq']\n", "\n", "revS=pd.Series({'A':'T','T':'A','C':'G','G':'C'})\n", "\n", "\n", "rsToSeqSpecie_corrected=pd.Series(data=revS[rsToSeqSpecie].values,index=rsToSeqSpecie.index)\n", "\n", "humanDf['ref_sepcie']=rsToSeqSpecie_corrected[humanDf['rs']].values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#withValidDf.Chr" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "withValidDf=humanDf[humanDf['ref_sepcie'].isin(['A','C','G','T'])&(humanDf.Chr==3)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "(withValidDf['ref_sepcie']==withValidDf['REF']).mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#withValidDf#.mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#(withValidDf['REF']==withValidDf['ref_sepcie'])[withValidDf['Loc']<100000000].astype(int).reset_index().plot(x='index',y=0,kind='scatter',alpha=0.5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#withValidDf.Chr" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#withValidDf[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "(withValidDf['REF']==withValidDf['ref_sepcie'])[withValidDf['Loc']<100000000].astype(int).reset_index().plot(x='Loc',y=0,kind='scatter',alpha=0.5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#(withValidDf['REF']==withValidDf['specie_ref_seq']).mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "withValidDf[(withValidDf['REF']==withValidDf['specie_ref_seq'])]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#withValidDf" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "revS" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "myDict={}\n", "for i in range(0,50,1):\n", " myDict[i]={'rev':(withValidDf['specie_ref_seq'].str[i]==revS[withValidDf['REF']].values).mean(),\n", " 'pos':(withValidDf['specie_ref_seq'].str[i]==withValidDf['REF'].values).mean()\n", " }" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "withValidDf['rev_ref']=revS[withValidDf['REF']].values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#withValidDf[withValidDf.specie_ref_seq.str[0]!=revS[withValidDf['REF']].values]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#withValidDf" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.DataFrame(myDict).T#.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "i=25\n", "#pd.Series(myDict)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tmpDf=pd.DataFrame( {'specie':withValidDf['specie_ref_seq'].str[i].values,'ref':withValidDf['REF'].values})\n", "tmpDf['rev_ref']=revS[tmpDf['ref']].values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "(tmpDf['rev_ref']==tmpDf['specie']).mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.Series(myDict\n", " ).plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\"\"\"myDict={}\n", "for i in range(0,100,1):\n", " myDict[i]=(withValidDf['specie_ref_seq'].str[i]).value_counts()\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#withValidDf['specie_ref_seq'].str.len()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.DataFrame(myDict).T.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.Series(myDict)#.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!head tmp.human.ensembl.bed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#!bedtools getfasta -fi /cellar/users/btsui/Data/ensembl/release/fasta/Mus_musculus.GRCm38.dna_rm.toplevel.fa -bed tmp.human.ensembl.bed > liftOver.bed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#!getFastaFromBed -fi " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#getFastaFromBed [OPTIONS] -fi -bed \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## get the fasta " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#liftOverBedDf" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!wc -l tmp.mouse.bed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!wc -l unMapped" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#!liftOver" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### check in fasta to make sure ref base is correct" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }