{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "import gzip\n", "import re\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "##fileformat=VCFv4.0\r\n", "##fileDate=20170710\r\n", "##source=dbSNP\r\n", "##dbSNP_BUILD_ID=150\r\n", "##reference=GRCh38.p7\r\n", "##phasing=partial\r\n", "##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf\t\r\n", "##INFO=\r\n", "##INFO=\r\n", "##INFO=\r\n", "##INFO=\r\n", "##INFO=\r\n", "##INFO=\r\n", "##INFO=\r\n", "##INFO=\r\n", "##INFO=\r\n", "##INFO=\r\n", "##INFO=\r\n", "##INFO=\r\n", "##INFO=\r\n", "\r\n", "gzip: stdout: Broken pipe\r\n" ] } ], "source": [ "!gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz| head -n 20\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "specie='Mus_musculus'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "#ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/mouse_10090/VCF/00-All.vcf.gz\n", "pattern='VP=\\d{2}(\\d{4})'\n", "#pattern2='VP=\\w{18}(\\d{2})'\n", "#pattern='VP=050000000005000000000100'\n", "prog2 = re.compile(pattern)\n", "inDbDir='/data/cellardata/users/btsui/dbsnp/'+specie+'/00-All.vcf.gz'\n", "outDbDir=inDbDir.replace('.vcf.gz','.f1_byte2_not_00.vcf.gz')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### identify snps with reference" ] }, { "cell_type": "code", "execution_count": 754, "metadata": {}, "outputs": [], "source": [ "#!gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz | head -n 100" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'0000'" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } 
], "source": [ "\"\"\"\n", "byte: 8 bit\n", "256\n", "#10 at F2\n", "\"\"\"\n", "f1_byte2" ] },
{ "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'1\\t3000020\\trs1133275841\\tT\\tA\\t.\\t.\\tRSPOS=3000020;dbSNPBuildID=150;SAO=0;VC=snp;VP=050000000005000000000100\\n'" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'1\\t3000020\\trs1133275841\\tT\\tA\\t.\\t.\\tRSPOS=3000020;dbSNPBuildID=150;SAO=0;VC=snp;VP=050000000005000000000100\\n'" ] },
{ "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 0\n", "1000000 621\n" ] } ], "source": [
 "\n",
 "###\n",
 "# Copy every VCF record whose captured VP digits are not '0000' from inDbDir to outDbDir.\n",
 "# Header lines (starting with '#') are skipped and never written.\n",
 "# prog2 is the compiled VP pattern from the configuration cell.\n",
 "# i = index of the last line read, j = number of records written so far.\n",
 "TEST=True\n",
 "j=0\n",
 "with gzip.open(inDbDir, 'rb') as f:\n",
 "    with gzip.open(outDbDir,'wb') as wf:\n",
 "        for i,l in enumerate(f):\n",
 "            if l[0]!='#':\n",
 "                f1_byte2=prog2.findall(l)[0]\n",
 "                if f1_byte2!='0000':\n",
 "                    j+=1\n",
 "                    wf.write(l)\n",
 "            # TEST mode stops after roughly the first million lines\n",
 "            if TEST and (i>10**6):\n",
 "                break\n",
 "            if (i%(10**6))==0:\n",
 "                print i,j" ] },
{ "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [] },
{ "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "161747\n" ] } ], "source": [] },
{ "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "81432284\n" ] } ], "source": [ "print i" ] },
{ "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.001986276106414994" ] }, "execution_count": 144, "metadata": {}, "output_type": "execute_result" } ], "source": [ "161747/81432284.0" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### identify snp window" ] },
{ "cell_type": "code", "execution_count": 185, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\n", "tmpDf=pd.read_csv(outDbDir,sep='\\t',header=None)\n",
"tmpDf.columns=['Chr','Loc','rs','REF','ALT','','','Annot']" ] }, { "cell_type": "code", "execution_count": 267, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "#65158616-65186500\n", "#chr1:65170978-65170978\n", "#tmpDf[(tmpDf['Chr']=='1')&((tmpDf['Start']>65158616)&(tmpDf['End']<65186500))]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Chr1:65158616" ] }, { "cell_type": "code", "execution_count": 186, "metadata": { "collapsed": true }, "outputs": [], "source": [ "### take only the non-\n", "#give it 1000\n", "window_size=1000\n", "tmpDf['Start']=tmpDf['Loc']-window_size\n", "tmpDf.loc[(tmpDf['Start']<0),'Start']=0\n", "tmpDf['End']=tmpDf['Loc']+window_size\n" ] }, { "cell_type": "code", "execution_count": 198, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 198, "metadata": {}, "output_type": "execute_result" } ], "source": [ "myCleanFaDir='/cellar/users/btsui/Data/ensembl/clean/'+specie+'.fa'\n", "os.system(' samtools faidx '+ myCleanFaDir)" ] }, { "cell_type": "code", "execution_count": 200, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#bedtools complement -i -g \n", "myFai=pd.read_csv(myCleanFaDir+'.fai',sep='\\t',header=None)" ] }, { "cell_type": "code", "execution_count": 205, "metadata": { "collapsed": true }, "outputs": [], "source": [ "FaOrder=myFai[0].unique()" ] }, { "cell_type": "code", "execution_count": 212, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tmpDf['Chr']=tmpDf['Chr'].astype(dtype=\"category\",categories=FaOrder,ordered=True)\n", "\n", "tmpDf2=tmpDf[['Chr','Start','End']].sort_values(['Chr','Start','End'])\n", "tmpDf2.to_csv('extracting_region.bed',sep='\\t',header=None,index=None)" ] }, { "cell_type": "code", "execution_count": 215, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#tmpDf2.Chr.unique()" ] }, { "cell_type": "code", "execution_count": 
216, "metadata": { "collapsed": true }, "outputs": [], "source": [ "myFai[[0,1]].to_csv('genome',sep='\\t',header=None,index=None)" ] }, { "cell_type": "code", "execution_count": 217, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\t195471971\t59\t60\t61\r\n", "2\t182113224\t198729956\t60\t61\r\n", "3\t160039680\t383878460\t60\t61\r\n", "4\t156508116\t546585527\t60\t61\r\n", "5\t151834684\t705702171\t60\t61\r\n", "6\t149736546\t860067493\t60\t61\r\n", "7\t145441459\t1012299708\t60\t61\r\n", "8\t129401213\t1160165251\t60\t61\r\n", "9\t124595110\t1291723210\t60\t61\r\n", "10\t130694993\t1418394967\t60\t61\r\n" ] } ], "source": [ "!head /cellar/users/btsui/Data/ensembl/clean/Mus_musculus.fa.fai" ] }, { "cell_type": "code", "execution_count": 218, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!bedtools complement --help" ] }, { "cell_type": "code", "execution_count": 219, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!bedtools complement -i extracting_region.bed -g genome > complement.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### mask out the rest of the genome" ] }, { "cell_type": "code", "execution_count": 220, "metadata": { "collapsed": true }, "outputs": [], "source": [ "complementDf=pd.read_csv('complement.txt',sep='\\t',header=None)" ] }, { "cell_type": "code", "execution_count": 557, "metadata": {}, "outputs": [], "source": [ "#complementDf.shape" ] }, { "cell_type": "code", "execution_count": 227, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tmpDf[['Chr','Loc','Loc']].to_csv('/data/cellardata/users/btsui/dbsnp/snp_beds/'+specie+'.bed',\n", " sep='\\t',header=None,index=None)" ] }, { "cell_type": "code", "execution_count": 269, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed\r\n" ] } ], "source": [ "!echo 
/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed" ] }, { "cell_type": "code", "execution_count": 232, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!wc -l /data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed" ] }, { "cell_type": "code", "execution_count": 233, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!rm pipe\n", "!mkfifo pipe" ] }, { "cell_type": "code", "execution_count": 234, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os" ] }, { "cell_type": "code", "execution_count": 235, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 235, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.system('maskFastaFromBed -fi /cellar/users/btsui/Data/ensembl/clean/'+specie+'.fa -bed complement.txt -fo pipe &')" ] }, { "cell_type": "code", "execution_count": 236, "metadata": { "collapsed": true }, "outputs": [], "source": [ "outDir='/cellar/users/btsui/Data/ensembl/snp_masked/'" ] }, { "cell_type": "code", "execution_count": 237, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 237, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.system('gzip -c pipe > '+outDir+specie+'.fa.gz')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### combine with microbe" ] }, { "cell_type": "code", "execution_count": 238, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#'microbe.fa.gz'" ] }, { "cell_type": "code", "execution_count": 239, "metadata": { "collapsed": true }, "outputs": [], "source": [ "inDir=outDir+specie+'.fa.gz'\n", "withMicrobeDir=inDir.replace('.fa.gz','.microbe.fa.gz')" ] }, { "cell_type": "code", "execution_count": 240, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 240, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "os.system('cat microbe.fa.gz '+inDir+' >'+withMicrobeDir)" ] },
{ "cell_type": "code", "execution_count": 246, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#withMicrobeDir" ] },
{ "cell_type": "code", "execution_count": 249, "metadata": {}, "outputs": [], "source": [
 "# Create the per-species bowtie2 index directory.\n",
 "# A bare os.mkdir raises OSError [Errno 17] when the directory is already there,\n",
 "# so only create it when it is missing (keeps the cell re-runnable).\n",
 "indexDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/'+specie\n",
 "if not os.path.isdir(indexDir):\n",
 "    os.mkdir(indexDir)" ] },
{ "cell_type": "code", "execution_count": 251, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 251, "metadata": {}, "output_type": "execute_result" } ], "source": [ "myDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/'+specie+'/bowtie2'\n", "\n", "faDir=withMicrobeDir\n", "\n", "\n", "cmd= '/cellar/users/btsui/Program/bowtie2-2.3.4-linux-x86_64/bowtie2-build --threads 4 '+faDir+' '+myDir\n", "#os.system('rm -r '+myDir)\n", "#os.system('mkdir '+myDir)\n", "os.system(cmd)" ] },
{ "cell_type": "code", "execution_count": 252, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!bowtie2-build --threads 4 /cellar/users/btsui/Data/ensembl/snp_masked/Mus_musculus.microbe.fa.gz /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Mus_musculus/bowtie2" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# scratch" ] },
{ "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!rm /cellar/users/btsui/Data/ensembl/snp_masked/*" ] },
{ "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'bowtie2-build --threads 48 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Homo_sapiens/'" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#run bowtie \n", "myDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/'+specie+'/'\n", "faDir='/cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa'\n", "cmd= 'bowtie2-build --threads 48 '+faDir+' '+myDir\n", "cmd" ] },
{ "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!cp Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa /cellar/users/btsui/Data/ensembl/snp_masked/.\n" ] },
{ "cell_type": "code", "execution_count": 282, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"\\nsubDf=complementDf[(complementDf.Chr=='2')]\\nsubDf['dist_idh']=(subDf['Start']-208236227).abs()\\nsubDf.sort_values('dist_idh')\"" ] }, "execution_count": 282, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#IDH1\n", "\"\"\"\n", "Chromosome:2\n", "Start:208,236,227 bp from pterEnd:208,266,074 bp from pter\n", "\n", "\"\"\"\n", "\n", "\"\"\"\n", "subDf=complementDf[(complementDf.Chr=='2')]\n", "subDf['dist_idh']=(subDf['Start']-208236227).abs()\n", "subDf.sort_values('dist_idh')\"\"\"" ] },
{ "cell_type": "code", "execution_count": 262, "metadata": { "collapsed": true }, "outputs": [], "source": [
 "# Length of each complement interval (end minus start).\n",
 "# complement.txt is read with header=None, so the columns have no names;\n",
 "# address start/end by position (2nd and 3rd column) instead of .Start/.End.\n",
 "spaceMasked=(complementDf.iloc[:,2]-complementDf.iloc[:,1])" ] },
{ "cell_type": "code",
"execution_count": 265, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10.705346589807313" ] }, "execution_count": 265, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# numpy is only imported much further down in this notebook; import it here so the\n",
 "# cell also works when the notebook is run top to bottom on a fresh kernel.\n",
 "import numpy as np\n",
 "np.log10((spaceMasked.sum()))" ] },
{ "cell_type": "code", "execution_count": 267, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "50739547498" ] }, "execution_count": 267, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "import cPickle as pickle\n",
 "import bz2\n",
 "\n",
 "# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.\n",
 "# The with-block closes the bz2 handle once the object has been loaded.\n",
 "with bz2.BZ2File('db_v20/mpa_v20_m200.pkl', 'r') as dbFile:\n",
 "    db = pickle.load(dbFile)" ] },
{ "cell_type": "code", "execution_count": 306, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from Bio import SeqIO" ] },
{ "cell_type": "code", "execution_count": 270, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9934342390348803" ] }, "execution_count": 270, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "record = SeqIO.read(\"single.fasta\", \"fasta\")" ] },
{ "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!cp ./Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa.gz /cellar/users/btsui/Data/ensembl/snp_masked/" ] },
{ "cell_type": "code", "execution_count": 750, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n" ] } ], "source": [
 "from Bio import SeqIO\n",
 "import gzip\n",
 "maskedFasta=outDir+specie+'.fa.gz'\n",
 "unmaskedFasta='/cellar/users/btsui/Data/ensembl/release/fasta/Homo_sapiens.GRCh38.dna_rm.toplevel.fa'\n",
 "# Scan the FASTA records in file order and stop at the record whose id is '1';\n",
 "# human_record keeps that record after the loop.\n",
 "with open(unmaskedFasta, \"r\") as handle:\n",
 "    for human_record in SeqIO.parse(handle, \"fasta\") :\n",
 "        print(human_record.id)\n",
 "        if human_record.id=='1':\n",
 "            break\n",
 "    " ] },
{ "cell_type": "code", "execution_count": 718, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n" ] } ], "source": [ "from Bio import SeqIO\n", "import gzip\n", "#maskedFasta=outDir+'Hm'+'.fa.gz'\n",
"unmaskedFasta='/cellar/users/btsui/Data/ensembl/clean/Mus_musculus.fa'\n", "with open(unmaskedFasta, \"r\") as handle:\n", " for mouse_record in SeqIO.parse(handle, \"fasta\") :\n", " print(mouse_record.id)\n", " if mouse_record.id=='1':\n", " break\n", " " ] }, { "cell_type": "code", "execution_count": 753, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'N'" ] }, "execution_count": 753, "metadata": {}, "output_type": "execute_result" } ], "source": [ "human_record[10177]" ] }, { "cell_type": "code", "execution_count": 692, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\n#human chr1\\t14727\\t14728\\t0\\n#mouse chr17\\t66119103\\t66119104\\t0\\n'" ] }, "execution_count": 692, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "#human 2:208248388..208248388,\n", "#mouse chr1\t65170978\t65170979\t60497\n", "\n", "1\t10177\n", "\"\"\" \n", "mouse_record " ] }, { "cell_type": "code", "execution_count": 727, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'AATGACTTACTTGGTCCCCATATGCATGTCGGCCAATGATGATGGGTTTTACCCAGCCTG'" ] }, "execution_count": 727, "metadata": {}, "output_type": "execute_result" } ], "source": [ "i=65170978\n", "#\n", "str(mouse_record.seq)[i-30:i+30]" ] }, { "cell_type": "code", "execution_count": 720, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from Bio import Seq" ] }, { "cell_type": "code", "execution_count": 741, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'G'" ] }, "execution_count": 741, "metadata": {}, "output_type": "execute_result" } ], "source": [ "i=208248388\n", "(str(human_record.seq)[i-30:i+30])\n", "human_record.seq[i]" ] }, { "cell_type": "code", "execution_count": 733, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "''" ] }, "execution_count": 733, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## G➞A\n" ] }, { "cell_type": "code", "execution_count": 732, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"'ACGA'" ] }, "execution_count": 732, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(str(human_record.seq)[(i-2):(i+2)])" ] }, { "cell_type": "code", "execution_count": 739, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'G'" ] }, "execution_count": 739, "metadata": {}, "output_type": "execute_result" } ], "source": [ "i=65170978\n", "#Seq.reverse_complement(str(human_record.seq)[i-20:i+20])\n", "(mouse_record.seq[i])" ] }, { "cell_type": "code", "execution_count": 364, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'TAAGCATGACGACCTATGAT'" ] }, "execution_count": 364, "metadata": {}, "output_type": "execute_result" } ], "source": [ "str(record.seq)[i-10:i+10]" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3660\n" ] } ], "source": [ "proteome='/cellar/users/btsui/Downloads/viral.2.protein.faa'\n", "with open(proteome) as handle:\n", " for record in SeqIO.parse(handle, \"fasta\") :\n", " print (len(record))*3\n", " break" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SeqRecord(seq=Seq('MAVNTSGKTRLPQPASEDYTQYARNTLKNLNNVYEKFAVRGPVLALVRPAQFSK...GAV', SingleLetterAlphabet()), id='YP_003620396.1', name='YP_003620396.1', description='YP_003620396.1 p130 [Providence virus]', dbxrefs=[])" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "record" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-r--r-- 1 btsui users 19M Dec 30 13:25 /cellar/users/btsui/Downloads/viral.2.protein.faa\r\n" ] } ], "source": [ "!ls -lah /cellar/users/btsui/Downloads/viral.2.protein.faa" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### create ids for the bed" ] }, { "cell_type": "code", "execution_count": 559, "metadata": { "collapsed": true }, "outputs": 
[], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 579, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "tmpBedDf=pd.read_csv('/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed',header=None,sep='\\t')" ] }, { "cell_type": "code", "execution_count": 580, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',\n", " '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT'], dtype=object)" ] }, "execution_count": 580, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tmpBedDf[0].astype(np.str).unique()" ] }, { "cell_type": "code", "execution_count": 707, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "208248388" ] }, "execution_count": 707, "metadata": {}, "output_type": "execute_result" } ], "source": [ "208248388" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 582, "metadata": {}, "outputs": [], "source": [ "tmpBedDf.columns=['Chr','Pos','']\n", "tmpBedDf['Chr']=tmpBedDf['Chr'].astype(np.str)\n", "tmpBedDf['Pos']=tmpBedDf['Pos'].astype(np.str)\n", "#tmpBedDf['Chr_Pos']=tmpBedDf['Chr']+'-'+tmpBedDf['Pos']" ] }, { "cell_type": "code", "execution_count": 583, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tmpBedDf['Id']=tmpBedDf.index" ] }, { "cell_type": "code", "execution_count": 585, "metadata": {}, "outputs": [], "source": [ "#tmpBedDf[['Chr_Pos','Id']].drop_duplicates(['Chr_Pos']).to_pickle('/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens_chrom_pos__id.pickle')" ] }, { "cell_type": "code", "execution_count": 589, "metadata": {}, "outputs": [], "source": [ "#tmpBedDf" ] }, { "cell_type": "code", "execution_count": 590, "metadata": {}, "outputs": [], "source": [ "#index=tmpDf.Chr.value_counts()" ] }, { "cell_type": "code", 
"execution_count": 591, "metadata": {}, "outputs": [], "source": [ "#index.astype(np.str)" ] }, { "cell_type": "code", "execution_count": 592, "metadata": { "collapsed": true }, "outputs": [], "source": [ "specie='Homo_sapiens'" ] }, { "cell_type": "code", "execution_count": 593, "metadata": { "collapsed": true }, "outputs": [], "source": [ "myBedDir='/data/cellardata/users/btsui/dbsnp/snp_beds/'+specie+'.bed'" ] }, { "cell_type": "code", "execution_count": 657, "metadata": { "collapsed": true }, "outputs": [], "source": [ "myBedDf=pd.read_csv(myBedDir,sep='\\t',header=None)" ] }, { "cell_type": "code", "execution_count": 658, "metadata": { "collapsed": true }, "outputs": [], "source": [ "myBedDf[3]=myBedDf.index" ] }, { "cell_type": "code", "execution_count": 659, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": 660, "metadata": { "collapsed": true }, "outputs": [], "source": [ "window=1" ] }, { "cell_type": "code", "execution_count": 661, "metadata": { "collapsed": true }, "outputs": [], "source": [ "myBedDf[0]='chr'+myBedDf[0].astype(np.str)\n", "myBedDf[2]=myBedDf[2]+window" ] }, { "cell_type": "code", "execution_count": 662, "metadata": {}, "outputs": [], "source": [ "#myBedDf[1]" ] }, { "cell_type": "code", "execution_count": 663, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
0chr114727147280
1chr16308256308261
2chr16308336308342
3chr18171868171873
4chr18330688330694
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "0 chr1 14727 14728 0\n", "1 chr1 630825 630826 1\n", "2 chr1 630833 630834 2\n", "3 chr1 817186 817187 3\n", "4 chr1 833068 833069 4" ] }, "execution_count": 663, "metadata": {}, "output_type": "execute_result" } ], "source": [ "myBedDf.head()" ] }, { "cell_type": "code", "execution_count": 664, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#ucscOutS.head(n=1000).to_csv('tmp.bed',sep='\\t',header=None,index=None)" ] }, { "cell_type": "code", "execution_count": 710, "metadata": { "collapsed": true }, "outputs": [], "source": [ "myBedDf[myBedDf[1]==208248388].head(n=1000).to_csv('tmp.bed',sep='\\t',header=None,index=None)" ] }, { "cell_type": "code", "execution_count": 715, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
60497chr220824838820824838960497
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "60497 chr2 208248388 208248389 60497" ] }, "execution_count": 715, "metadata": {}, "output_type": "execute_result" } ], "source": [ "myBedDf[myBedDf[1]==208248388]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 711, "metadata": {}, "outputs": [], "source": [ "#!~/liftOver" ] }, { "cell_type": "code", "execution_count": 712, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading liftover chains\n", "Mapping coordinates\n" ] } ], "source": [ "!~/liftOver -minMatch=0.2 ./tmp.bed ~/hg38ToMm10.over.chain ./tmp.out.100.bed unMapped" ] }, { "cell_type": "code", "execution_count": 713, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "chr2\t208248388\t208248389\t60497\r\n" ] } ], "source": [ "!head ./tmp.bed" ] }, { "cell_type": "code", "execution_count": 682, "metadata": { "collapsed": true }, "outputs": [], "source": [ "oneBase=pd.read_csv('./tmp.out.100.bed',header=None,sep='\\t').set_index(3)[1]" ] }, { "cell_type": "code", "execution_count": 716, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "chr1\t65170978\t65170979\t60497\r\n" ] } ], "source": [ "!head ./tmp.out.100.bed" ] }, { "cell_type": "code", "execution_count": 675, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "chr17\t66119103\t66119104\t0\r\n", "chrM\t5086\t5087\t1\r\n", "chrM\t5095\t5096\t2\r\n", "chr4\t156244134\t156244135\t8\r\n", "chr4\t156239437\t156239438\t9\r\n", "chr4\t156225155\t156225156\t10\r\n", "chr4\t156199911\t156199912\t12\r\n", "chr4\t156199826\t156199827\t13\r\n", "chr4\t156199738\t156199739\t14\r\n", "chr4\t156199695\t156199696\t15\r\n" ] } ], "source": [ "!head tmp.out.100.bed" ] }, { "cell_type": "code", "execution_count": 678, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
0chr1766119103661191040
1chrM508650871
2chrM509550962
3chr41562441341562441358
4chr41562394371562394389
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "0 chr17 66119103 66119104 0\n", "1 chrM 5086 5087 1\n", "2 chrM 5095 5096 2\n", "3 chr4 156244134 156244135 8\n", "4 chr4 156239437 156239438 9" ] }, "execution_count": 678, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv('./tmp.out.100.bed',header=None,sep='\\t').head()\n" ] }, { "cell_type": "code", "execution_count": 553, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tmpDf=pd.DataFrame({'one':oneBase,'hundred':hundredBase})" ] }, { "cell_type": "code", "execution_count": 554, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 555, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0 734\n", "124878053.0 1\n", "99.0 1\n", "100.0 1\n", "dtype: int64" ] }, "execution_count": 555, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tmpS=(tmpDf['hundred']-tmpDf['one']).abs()\n", "tmpS.value_counts()" ] }, { "cell_type": "code", "execution_count": 556, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 556, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZ8AAAGoCAYAAACZneiBAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAGiVJREFUeJzt3XuQVIWVx/HfyQyEEcVXKCOOFrqY\ngYgM6vCIvATUwcJCohhRTDRml62UMYmWLFIaw5KtihswxWaT7JaPClSJ6Io8rKhokYSI0UUGGAEh\nJFERZ3DNMC6KCjrg2T+6meUxDD2Ee24/vp+qLuk73X3PVdLf3Nt3bpu7CwCASJ9LewAAQOkhPgCA\ncMQHABCO+AAAwhEfAEA44gMACEd8AADhiA8AIBzxAQCEK097gINwuQUAhc7SHqAQsOcDAAiXb3s+\nABDq0ZVb21x+w6CzgicpLez5AADCER8AQDjiAwAIR3wAAOGIDwAgHPEBAIQjPgCAcMQHABCO+AAA\nwhEfAEA44gMACEd8AADhiA8AIBzxAQCEIz4AgHDEBwAQjvgAAMIRHwBAOOIDAAhHfAAA4YgPACAc\n8QEAhCM+AIBwxAcAEI74AADCER8AQDjiAwAIR3wAAOGIDwAgHPEBAIQjPgCAcMQHABCO+AAAwhEf\nAEA44gMACEd8AADhiA8AIBzxAQCEIz4AgHDEBwAQjvgAAMIRHwBAOOIDAAhHfAAA4YgPACAc8QEA\nhCM+AIBwxAcAEI74AADCER8AQLjytAc4Fh5dubXN5TcMOit4EgBALszd056hlZktlfSFFEf4gqTt\nKa4/LWx3aWG7k7Xd3ccErKeg5VV80mZmde5ek/Yc0dju0sJ2Ix/wmQ8AIBzxAQCEIz4HeiDtAVLC\ndpcWthup4zMfAEA49nwAAOGIDwAgHPEBAIQjPgCAcHkVnzFjxrgkbty4cSvkW86K9D0vJ3kVn+3b\nS/GKHwBKVSm/5+VVfAAApYH4AADCER8AQLi8/z6flpYWNTQ0aPfu3WmPgiLRpUsXVVZWqlOnTmmP\nApSsvI9PQ0ODTjjhBPXs2VNmlvY4KHDurubmZjU0NOjss89OexygZOX9Ybfdu3fr1FNPJTw4JsxM\np556KnvSQMryPj6SCA+OKf4+AekriPgAAIoL8cEBli5dqqqqKvXq1Uv33Xdfm4956623NHr0aPXr\n10+XXHKJGhoaWn82depU9e3bV3379tXjjz8eNfYBPvnkE1133XXq1auXBg0apC1btqQyB4DDIz4p\n2bNnTyKvu3fv3r/pubfeequeffZZbdy4UfPnz9fGjRsPedydd96pb3zjG1q3bp3uvfdeTZs2TZL0\n9NNPa82aNaqvr9fKlSs1c+ZMffDBB0c9z9F6+OGHdfLJJ+svf/mLbr/9dk2dOjV8BgDtIz5HsGXL\nFvXu3Vs33XST+vXrpwkTJujjjz+WJK1evVojRozQRRddpNraWr3zzjuSpAcffFADBgxQdXW1rrnm\nmtbH33zzzbrjjjs0cuRITZ06Vb///e/Vv39/9e/fXxdccIF27twpd9eUKVPUt29fnX/++a17D8uX\nL9cll1yiCRMmqHfv3po0aZL2fRFgz549NWPGDA0dOlRPPPHEUW/rK6+8ol69eumcc85R586dNXHi\nRC1ZsuSQx23cuFGjR4+WJI0cObL1MRs3btSIESNUXl6url27qrq6WkuXLpUk3XvvvXrqqacOea3p\n06fr61//ukaNGqVzzz1XDz744FHPv8+SJUt00003SZImTJig3/zmN+JLE9OzeG2jhtz3W51919Ma\nct9vtXhtY9ojHVbPu55u84Zjj/jkYPPmzZo8ebLWrVunbt266Ze//KVaWlp02223acGCBVq9erVu\nueUW3X333ZKkq6++WqtWrdKrr76qPn366OGHH259rT/96U9at
myZ7r//fs2aNUu/+MUvVF9frxUr\nVqiiokILFy5UfX29Xn31VS1btkxTpkxpjdratWs1e/Zsbdy4UW+88Yb+8Ic/tL5uly5d9OKLL2ri\nxIkHzD5v3rzWwO1/mzBhwiHb2djYqDPPPLP1fmVlpRobD32jqK6u1pNPPilJWrRokXbu3Knm5mZV\nV1fr2Wef1ccff6zt27frd7/7nd5++21J0owZMzRu3Lg2//2uW7dOTz/9tF5++WXNmDFD27ZtO+Qx\nw4YNa3M7li1b1u52lJeX68QTT1Rzc3Ob60ayFq9t1LSF69W4Y5dcUuOOXZq2cH1eBqi9yBCgYy/v\nf88nH5x55pkaMmSIJOnGG2/Uz372M40ZM0YbNmzQZZddJilzyOr000+XJG3YsEH33HOPduzYoQ8/\n/FC1tbWtr3XttdeqrKxMkjRkyBDdcccdmjRpkq6++mpVVlbqxRdf1PXXX6+ysjKddtppGjFihFat\nWqVu3bpp4MCBqqyslCT1799fW7Zs0dChQyVJ1113XZuzT5o0SZMmTcppO9vaO2jrzLBZs2bpO9/5\njubMmaPhw4frjDPOUHl5uS6//HKtWrVKF198sbp3766vfOUrKi8/8l+xq666ShUVFaqoqNDIkSP1\nyiuvaPz48Qc8ZsWKFTltQ0e2A8mb+dxm7Wo58FDwrpa9mvncZo2/4IyUpkI+ID45OPiNy8zk7jrv\nvPP08ssvH/L4m2++WYsXL1Z1dbXmzJmj5cuXt/6sa9eurX++6667NHbsWD3zzDMaPHiwli1b1u7h\noc9//vOtfy4rKzvgc6P9X3d/8+bN08yZMw9Z3qtXLy1YsOCAZZWVla17KlLmF3x79OhxyHN79Oih\nhQsXSpI+/PBDPfnkkzrxxBMlSXfffXfrHuANN9ygc88997Dbs09b/34PNmzYMO3cufOQ5bNmzdKl\nl17a5nZUVlZqz549ev/993XKKacccQ4ce9t27OrQ8lLz3kef6tGVW9Me4292w6CzOvwcDrvlYOvW\nra2RmT9/voYOHaqqqio1NTW1Lm9padFrr70mSdq5c6dOP/10tbS0aN68eYd93ddff13nn3++pk6d\nqpqaGv3xj3/U8OHD9fjjj2vv3r1qamrSCy+8oIEDBx717JMmTVJ9ff0ht4PDI0kDBgzQn//8Z735\n5pv69NNP9dhjj7V5qGz79u367LPPJEk//vGPdcstt0jK7P3tO7y1bt06rVu3Tpdffrkkadq0aVq0\naFGbMy5ZskS7d+9Wc3Ozli9frgEDBhzymBUrVrS5HQeHR5LGjRunuXPnSpIWLFigUaNGseeTkh4n\nVXRoOUoH8clBnz59NHfuXPXr10/vvfeevv3tb6tz585asGCBpk6dqurqavXv318vvfSSJOlHP/qR\nBg0apMsuu0y9e/c+7OvOnj1bffv2VXV1tSoqKnTFFVfoq1/9qvr166fq6mqNGjVKP/nJT/TFL34x\nZDvLy8v185//XLW1terTp4++9rWv6bzzzpN04AkDy5cvV1VVlb70pS/p3Xffbd3TaWlp0bBhw/Tl\nL39ZkydP1iOPPNJ62G39+vWH3Y6BAwdq7NixGjx4sH7wgx+0ubfVEd/61rfU3NysXr166ac//elh\nTxlH8qbUVqmiU9kByyo6lWlKbVVKEyFfWD6dBVRTU+N1dXUHLNu0aZP69OmT0kSZs92uvPJKbdiw\nIbUZikFtba2ee+65Q5ZPnz5dxx9/vO68887QedL+e1VKFq9t1MznNmvbjl3qcVKFptRW5eXnPe2d\nVLDlvrEdeamcd7PP6dPP/2XOrzvy2nnpoMNuOW0/n/kgRFvhQWkYf8EZeRmbg225b2ybAepgeJAj\n4nMEPXv2ZK8nQdOnT097BKAVoYlTEPFxdz4wxjGTT4eaUdpO6dr5qM4UKwZ5f8JBly5d1NzczBsG\njol93+fTpUuXtEcBSlre7
/lUVlaqoaFBTU1NaY+CIrHvm0wBpCfR+JjZ7ZL+XpJLWi/pm+7eoW/x\n6tSpE984CQBFJrHDbmZ2hqTvSqpx976SyiRNbP9ZAIBSkPRnPuWSKsysXNJxkg69YiQAoOQkFh93\nb5Q0S9JWSe9Iet/dnz/4cWY22czqzKyOz3UAFDve8zKSPOx2sqSrJJ0tqYekrmZ248GPc/cH3L3G\n3Wu6d++e1DgAkBd4z8tI8rDbpZLedPcmd2+RtFDSxQmuDwBQIJKMz1ZJg83sOMv8huhoSZsSXB8A\noEAk+ZnPSkkLJK1R5jTrz0l6IKn1AQAKR6K/5+PuP5T0wyTXAQAoPHl/eR0AQPEhPgCAcMQHABAu\n7y8sCgAHK5RvR8XhER8ABWXx2kZNW7heu1r2SpIad+zStIXrJYkAFRAOuwEoKDOf29wann12tezV\nzOc2pzQRjgbxAVBQtu3Y1aHlyE/EB0BB6XFSRYeWIz8RHwAFZUptlSo6lR2wrKJTmabUVqU0EY4G\nJxwAKCj7TirgbLfCRnwAFJzxF5xBbAoch90AAOGIDwAgHPEBAIQjPgCAcMQHABCO+AAAwhEfAEA4\n4gMACEd8AADhiA8AIBzxAQCEIz4AgHDEBwAQjvgAAMIRHwBAOOIDAAhHfAAA4YgPACAc8QEAhCM+\nAIBwxAcAEI74AADCER8AQDjiAwAIR3wAAOGIDwAgHPEBAIQjPgCAcMQHABCO+AAAwhEfAEA44gMA\nCEd8AADhiA8AIBzxAQCEIz4AgHDEBwAQjvgAAMIRHwBAOOIDAAhHfAAA4YgPACAc8QEAhEssPmZW\nZWb1+90+MLPvJ7U+AEDhKE/qhd19s6T+kmRmZZIaJS1Kan0AgMIRddhttKTX3f2toPUBAPJYVHwm\nSprf1g/MbLKZ1ZlZXVNTU9A4AJAO3vMyEo+PmXWWNE7SE2393N0fcPcad6/p3r170uMAQKp4z8uI\n2PO5QtIad383YF0AgAIQEZ/rdZhDbgCA0pRofMzsOEmXSVqY5HoAAIUlsVOtJcndP5Z0apLrAAAU\nHq5wAAAIR3wAAOGIDwAgHPEBAIQjPgCAcMQHABCO+AAAwhEfAEA44gMACEd8AADhiA8AIBzxAQCE\nIz4AgHDEBwAQjvgAAMIRHwBAOOIDAAhHfAAA4YgPACAc8QEAhCM+AIBwxAcAEI74AADCER8AQDji\nAwAIR3wAAOGIDwAgHPEBAIQjPgCAcMQHABCO+AAAwhEfAEA44gMACEd8AADhiA8AIBzxAQCEIz4A\ngHDEBwAQjvgAAMIRHwBAOOIDAAhHfAAA4crb+6GZXdjez919zbEdBwBQCtqNj6T7s//sIqlG0quS\nTFI/SSslDU1uNABAsWr3sJu7j3T3kZLeknShu9e4+0WSLpD0l4gBAQDFJ9fPfHq7+/p9d9x9g6T+\nyYwEACh2Rzrsts8mM3tI0iOSXNKNkjYlNhUAoKjlGp9vSvq2pO9l778g6T8SmQgAUPRyio+77zaz\n/5T0jLtvTngmAECRy+kzHzMbJ6le0tLs/f5m9lSSgwEAileuJxz8UNJASTskyd3rJfVMaCYAQJHL\nNT573P39RCcBAJSMXE842GBmN0gqM7NzJX1X0kvJjQUAKGa57vncJuk8SZ9IelTS+5K+n9RQAIDi\ndsQ9HzMrk/TP7j5F0t0deXEzO0nSQ5L6KvP7Qbe4+8tHMygAoHgcMT7uvtfMLjrK1/83SUvdfYKZ\ndZZ03FG+DgCgiOT6mc/a7KnVT0j6aN9Cd194uCeYWTdJwyXdnH3sp5I+PepJAQBFI9f4nCKpWdKo\n/Za5pMPGR9I5kpok/crMqiWtlvQ9d/9o/weZ2WRJkyXprLPOynEcAChMvOdlmLsn88JmNZL+W9IQ\nd19pZv8m6QN3/8HhnlNTU+N1dXWJzAMAQSzXBxbpe15O23+kL5P7d2X2cNrk7t9t5+kNkhr
cfWX2\n/gJJd+UyFACguB3pVOs6ZQ6XdZF0oaQ/Z2/9Je1t74nu/j+S3jazquyi0ZI2/k3TAgCKQrt7Pu4+\nV5LM7GZJI929JXv/PyU9n8Pr3yZpXvZMtzeUuTo2AKDE5XrCQQ9JJ0h6L3v/+OyydmWvAVdzdKMB\nAIpVrvG5T5nTrX+XvT9C0vREJgIAFL1cv8/nV2b2rKRB2UV3ZT/TAQCgw3K9tpsklSnzezv/K+lL\nZjY8mZEAAMUupz0fM/tXSddJek3SZ9nFrszXaQMA0CG5fuYzXlKVu3+S5DAAgNKQ62G3NyR1SnIQ\nAEDpyHXP52NJ9Wb2G2W+00fSEa9wAABAm3KNz1PZGwAAf7NcT7Wem/QgAIDSkevZbm+qjQuMuvs5\nx3wiAEDRy/Ww2/6XyOki6VplvuMHAIAOy+lsN3dv3u/W6O6zdeAXywEAkLNcD7tduN/dzymzJ3RC\nIhMBAIperofd7tf/f+azR9IWZQ69AQDQYbnG5wpJ10jqud9zJkqakcBMAIAil2t8FkvaIWmNpN3J\njQMAKAW5xqfS3cckOgkAoGTkem23l8zs/EQnAQCUjHb3fMxsvTInGpRL+qaZvaHMtd1Mkrt7v+RH\nBAAUmyMddrsyZAoAQElpNz7u/lbUIACA0tGRr9EGAOCYID4AgHDEBwAQjvgAAMIRHwBAOOIDAAhH\nfAAA4YgPACAc8QEAhCM+AIBwxAcAEI74AADCER8AQDjiAwAIR3wAAOGIDwAgHPEBAIQjPgCAcMQH\nABCO+AAAwhEfAEA44gMACEd8AADhiA8AIBzxAQCEIz4AgHDEBwAQjvgAAMIRHwBAOOIDAAhHfAAA\n4YgPACAc8QEAhCtP8sXNbIuknZL2Strj7jVJrg8AUBgSjU/WSHffHrAeAECB4LAbACBc0vFxSc+b\n2Wozm9zWA8xsspnVmVldU1NTwuMAQLp4z8tIOj5D3P1CSVdIutXMhh/8AHd/wN1r3L2me/fuCY8D\nAOniPS8j0fi4+7bsP/8qaZGkgUmuDwBQGBKLj5l1NbMT9v1Z0uWSNiS1PgBA4UjybLfTJC0ys33r\nedTdlya4PgBAgUgsPu7+hqTqpF4fAFC4ONUaABCO+AAAwhEfAEA44gMACEd8AADhiA8AIBzxAQCE\nIz4AgHDEBwAQjvgAAMIRHwBAOOIDAAhHfAAA4YgPACAc8QEAhCM+AIBwxAcAEI74AADCER8AQDji\nAwAIR3wAAOGIDwAgHPEBAIQjPgCAcMQHABCO+AAAwhEfAEA44gMACEd8AADhiA8AIBzxAQCEIz4A\ngHDEBwAQjvgAAMIRHwBAOOIDAAhHfAAA4YgPACAc8QEAhCM+AIBwxAcAEI74AADCER8AQDjiAwAI\nR3wAAOGIDwAgHPEBAIQjPgCAcMQHABCO+AAAwhEfAEA44gMACEd8AADhiA8AIFzi8TGzMjNba2a/\nTnpdAIDCELHn8z1JmwLWAwAoEInGx8wqJY2V9FCS6wEAFJak93xmS/onSZ8d7gFmNtnM6sysrqmp\nKeFxACBdvOdlJBYfM7tS0l/dfXV7j3P3B9y9xt1runfvntQ4AJAXeM/LSHLPZ4ikcWa2RdJjkkaZ\n2SMJrg8AUCASi4+7T3P3SnfvKWmipN+6+41JrQ8AUDj4PR8AQLjyiJW4+3JJyyPWBQDIf+z5AADC\nER8AQDjiAwAIR3wAAOGIDwAgHPEBAIQjPgCAcMQHABCO+AAAwoVc4SAp9yxer/kr39Zed5WZ6fpB\nZ+pfxp+f9lgAgCMo2Pjcs3i9Hvnvra3397q33idAAJDfCvaw2/yVb3doOQAgfxRsfPa6d2g5ACB/\nFGx8ysw6tBwAkD8KNj7XDzqzQ8sBAPmjYE842HdSAWe7AUDhKdj4SJkAERsAKDwFe9gNAFC4iA8A\nIBzxAQCEIz4AgHDEBwAQjvgAAMIRHwBAOOIDAAhHfAA
A4YgPACCceR59BYGZNUl6K8URviBpe4rr\nTwvbXVrY7mRtd/cxuTzQzJbm+thik1fxSZuZ1bl7TdpzRGO7SwvbjXzAYTcAQDjiAwAIR3wO9EDa\nA6SE7S4tbDdSx2c+AIBw7PkAAMIRHwBAOOKzHzMrM7O1ZvbrtGeJYmZbzGy9mdWbWV3a80Qxs5PM\nbIGZ/dHMNpnZV9KeKUlmVpX9b7zv9oGZfT/tuSKY2e1m9pqZbTCz+WbWJe2ZwGc+BzCzOyTVSOrm\n7lemPU8EM9siqcbdS+qXDs1srqQV7v6QmXWWdJy770h7rghmViapUdIgd0/zl7oTZ2ZnSHpR0pfd\nfZeZ/ZekZ9x9TrqTgT2fLDOrlDRW0kNpz4JkmVk3ScMlPSxJ7v5pqYQna7Sk14s9PPspl1RhZuWS\njpO0LeV5IOKzv9mS/knSZ2kPEswlPW9mq81sctrDBDlHUpOkX2UPsz5kZl3THirQREnz0x4igrs3\nSpolaaukdyS97+7PpzsVJOIjSTKzKyX91d1Xpz1LCoa4+4WSrpB0q5kNT3ugAOWSLpT0H+5+gaSP\nJN2V7kgxsocYx0l6Iu1ZIpjZyZKuknS2pB6SuprZjelOBYn47DNE0rjs5x+PSRplZo+kO1IMd9+W\n/edfJS2SNDDdiUI0SGpw95XZ+wuUiVEpuELSGnd/N+1Bglwq6U13b3L3FkkLJV2c8kwQ8ZEkufs0\nd690957KHJL4rbsX/f87MrOuZnbCvj9LulzShnSnSp67/4+kt82sKrtotKSNKY4U6XqVyCG3rK2S\nBpvZcWZmyvy33pTyTFDm8ANK12mSFmX+N6lySY+6+9J0Rwpzm6R52cNQb0j6ZsrzJM7MjpN0maR/\nTHuWKO6+0swWSFojaY+kteIyO3mBU60BAOE47AYACEd8AADhiA8AIBzxAQCEIz4AgHDEBwAQjvgA\nAMIRHxQVM7sj+70tG8zs+2bWM/t9PQ9mv9PleTOryD7278xsafaiqivMrHfa8wOlgvigaJjZRcpc\nqWCQpMGS/kHSyZLOlfQLdz9P0g5J12Sf8oCk29z9Ikl3Svpl+NBAieLyOigmQyUtcvePJMnMFkoa\npsyFJeuzj1ktqaeZHa/MBSafyF5eSJI+HzwvULKID4qJHWb5J/v9ea+kCmX2+ne4e//EpwJwCA67\noZi8IGl89grGXSV9VdKKth7o7h9IetPMrpUky6iOGxUobcQHRcPd10iaI+kVSSuV+Ur0/23nKZMk\nfcvMXpX0mjJfOgYgAFe1BgCEY88HABCO+AAAwhEfAEA44gMACEd8AADhiA8AIBzxAQCE+z8nSjcW\nU6vIfgAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sns.jointplot(data=np.log10(tmpDf+1),x='one',y='hundred')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "75% for using just 1 bp SNP\n", "% for using 100 bp window" ] }, { "cell_type": "code", "execution_count": 301, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#chr1\t817186\t817187\t3\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": 
"python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 1 }