{
 "cells": [
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [],
    "source": [
     "# Every module the notebook needs is imported here so that a fresh kernel can\n",
     "# run top to bottom: later cells call os.system(...) and np.log10(...).\n",
     "import gzip\n",
     "import os\n",
     "import re\n",
     "\n",
     "import numpy as np\n",
     "import pandas as pd\n",
     "\n",
     "%matplotlib inline"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [],
    "source": [
     "# The VCF is streamed as bytes, so the pattern is a bytes pattern as well.\n",
     "# It captures the 5th and 6th characters after 'VP=0x' (the field the output\n",
     "# file name calls f1_byte2). The value is 0x-prefixed, so A-F are accepted too;\n",
     "# for all-decimal values this matches exactly what \\d matched before.\n",
     "pattern=br'VP=0x[0-9A-Fa-f]{4}([0-9A-Fa-f]{2})'\n",
     "prog = re.compile(pattern)\n",
     "inDbDir='/data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz'\n",
     "outDbDir=inDbDir.replace('.vcf.gz','.f1_byte2_not_00.vcf.gz')"
    ] },
  { "cell_type": "markdown", "metadata": {},
    "source": [ "Number of lines (header + records) in the unfiltered dbSNP VCF" ] },
  { "cell_type": "code", "execution_count": 3, "metadata": {},
    "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "325174853\r\n" ] } ],
    "source": [ "!gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz |wc -l" ] },
  { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true },
    "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "325174853\r\n" ] } ],
    "source": [ "!gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz| wc -l " ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "### identify snps with reference" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [],
    "source": [
     "# Copy every record whose captured VP byte is not '00' into outDbDir.\n",
     "# '#' header lines are not copied, so the output file has no VCF header.\n",
     "TEST=False  # True: stop after ~100k input lines for a quick trial run\n",
     "with gzip.open(inDbDir, 'rb') as f, gzip.open(outDbDir,'wb') as wf:\n",
     "    for i,l in enumerate(f):\n",
     "        # 'rb' yields bytes: test with a bytes prefix (l[0] is an int on Python 3)\n",
     "        if not l.startswith(b'#'):\n",
     "            m=prog.search(l)\n",
     "            if m is None:\n",
     "                # previously surfaced as a bare IndexError from findall(...)[0]\n",
     "                raise ValueError('no VP bitfield on line %d of %s' % (i+1, inDbDir))\n",
     "            if m.group(1)!=b'00':\n",
     "                wf.write(l)\n",
     "        if TEST and (i>100000):\n",
     "            break\n",
     "        if (i%(10**6))==0:\n",
     "            print(i)  # progress marker every million lines"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [],
    "source": [
     "\n",
     "#inDbDir='/data/cellardata/users/btsui/dbsnp/All_20170710.vcf.gz'\n",
     "#outDbDir=inDbDir.replace('.vcf.gz','.f1_byte2_not_00.vcf.gz')\n"
    ] },
  { "cell_type": "markdown", "metadata": {},
"source": [ "### identify snp window" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\n", "tmpDf=pd.read_csv(outDbDir,sep='\\t',header=None)\n", "tmpDf.columns=['Chr','Loc','rs','REF','ALT','','','Annot']" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "### take only the non-\n", "#give it 1000\n", "window_size=1000\n", "tmpDf['Start']=tmpDf['Loc']-window_size\n", "tmpDf.loc[(tmpDf['Start']<0),'Start']=0\n", "tmpDf['End']=tmpDf['Loc']+window_size\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "tmpDf2=tmpDf[['Chr','Start','End']]\n", "tmpDf2.to_csv('extracting_region.bed',sep='\\t',header=None,index=None)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#!head /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa.fai" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "myCleanFaDir='/cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa'\n", "os.system(' samtools faidx '+ myCleanFaDir)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "#bedtools complement -i -g \n", "myFai=pd.read_csv(myCleanFaDir+'.fai',sep='\\t',header=None)\n", "\n", "myFai[[0,1]].to_csv('genome',sep='\\t',header=None,index=None)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "***** WARNING: MT:0-17519 exceeds the length of chromosome (MT)\r\n" ] } ], "source": [ "!bedtools complement -i extracting_region.bed -g genome > complement.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### mask out the rest of the genome" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": 
[],
    "source": [
     "# Name the three BED columns so they can be addressed as Chr / Start / End;\n",
     "# Chr is kept as text so that a comparison such as Chr=='2' works.\n",
     "complementDf=pd.read_csv('complement.txt',sep='\\t',header=None,\n",
     "                         names=['Chr','Start','End'],dtype={'Chr':str})"
    ] },
  { "cell_type": "code", "execution_count": 5, "metadata": {},
    "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bedtools v2.26.0\r\n" ] } ],
    "source": [ "!bedtools --version" ] },
  { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": true }, "outputs": [],
    "source": [ "#!rm -r /data/cellardata/users/btsui/dbsnp/snp_bed/" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
    "source": [
     "# -p: do not fail when the directory is already there\n",
     "!mkdir -p /data/cellardata/users/btsui/dbsnp/snp_beds"
    ] },
  { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [],
    "source": [
     "# start and end are both Loc; the id-building section near the end of the\n",
     "# notebook reads the second column of this file back as the SNP position\n",
     "tmpDf[['Chr','Loc','Loc']].to_csv('/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed',\n",
     "                                  sep='\\t',header=None,index=None)"
    ] },
  { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [],
    "source": [ "#!wc -l /data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed" ] },
  { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [],
    "source": [ "#! maskFastaFromBed" ] },
  { "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": true }, "outputs": [],
    "source": [ "#!rm -r fifo pipe" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
    "source": [
     "# -f: a missing pipe (first run) is not an error\n",
     "!rm -f pipe\n",
     "!mkfifo pipe"
    ] },
  { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": true }, "outputs": [],
    "source": [ "import os" ] },
  { "cell_type": "code", "execution_count": 104, "metadata": {},
    "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ],
    "source": [
     "# started in the background ('&'); the masked FASTA is written into the FIFO\n",
     "# and is consumed by the gzip cell below\n",
     "os.system('maskFastaFromBed -fi /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa -bed complement.txt -fo pipe &')"
    ] },
  { "cell_type": "code", "execution_count": 106, "metadata": { "collapsed": true }, "outputs": [],
    "source": [ "specie='Homo_sapiens'" ] },
  { "cell_type": "code", "execution_count": 116, "metadata": {},
    "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 480M\r\n", "drwxr-xr-x 2 btsui users 5 Dec 30 12:57 .\r\n", "drwxr-xr-x 5 btsui users 5 Dec 26 22:58 ..\r\n", "-rw-r--r-- 1 btsui users 86M Dec 30 12:51 Homo_sapiens.fa.gz\r\n", "-rw-r--r-- 1 btsui users 3.0G Dec 30 12:51 Homo_sapiens.microbe.fa\r\n", "-rw-r--r-- 1 btsui users 1.7M Dec 30 12:58 Homo_sapiens.microbe.fa.fai\r\n" ] } ],
    "source": [ "!ls -alh /cellar/users/btsui/Data/ensembl/snp_masked/" ] },
  { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [],
    "source": [ "outDir='/cellar/users/btsui/Data/ensembl/snp_masked/'" ] },
  { "cell_type": "code", "execution_count": 108, "metadata": {},
    "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ],
    "source": [ "os.system('gzip -c pipe > '+outDir+specie+'.fa.gz')" ] },
  {
"cell_type": "code", "execution_count": 94, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!rm /cellar/users/btsui/Data/ensembl/snp_masked/*" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "#!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'bowtie2-build --threads 48 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Homo_sapiens/'" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#run bowtie \n", "myDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Homo_sapiens/'\n", "faDir='/cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa'\n", "cmd= 'bowtie2-build --threads 48 '+faDir+' '+myDir\n", "cmd" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "#!cp Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa /cellar/users/btsui/Data/ensembl/snp_masked/.\n" ] }, { "cell_type": "code", "execution_count": 282, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"\\nsubDf=complementDf[(complementDf.Chr=='2')]\\nsubDf['dist_idh']=(subDf['Start']-208236227).abs()\\nsubDf.sort_values('dist_idh')\"" ] }, "execution_count": 282, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#IDH1\n", "\"\"\"\n", "Chromosome:2\n", "Start:208,236,227 bp from pterEnd:208,266,074 bp from pter\n", "\n", "\"\"\"\n", "\n", "\"\"\"\n", "subDf=complementDf[(complementDf.Chr=='2')]\n", "subDf['dist_idh']=(subDf['Start']-208236227).abs()\n", "subDf.sort_values('dist_idh')\"\"\"" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'complementDf' is not defined", "output_type": "error", "traceback": [ 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mspaceMasked\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcomplementDf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEnd\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mcomplementDf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mStart\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'complementDf' is not defined" ] } ], "source": [ "spaceMasked=(complementDf.End-complementDf.Start)" ] }, { "cell_type": "code", "execution_count": 265, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10.705346589807313" ] }, "execution_count": 265, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.log10((spaceMasked.sum()))" ] }, { "cell_type": "code", "execution_count": 267, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "50739547498" ] }, "execution_count": 267, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import cPickle as pickle\n", "import bz2\n", "\n", "db = pickle.load(bz2.BZ2File('db_v20/mpa_v20_m200.pkl', 'r'))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "specie='Mus_musculus'" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from Bio import SeqIO" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "\n", "#record = SeqIO.read(\"single.fasta\", \"fasta\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "#!cp ./Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa.gz /cellar/users/btsui/Data/ensembl/snp_masked/" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "total 806M\r\n", "drwxr-xr-x 2 btsui users 7 Oct 28 16:20 .\r\n", "-rw-r--r-- 1 btsui users 62M Oct 28 16:20 Mus_musculus.fa.gz\r\n", "-rw-r--r-- 1 btsui users 119M Mar 2 2018 Mus_musculus.microbe.fa.gz\r\n", "drwxr-xr-x 6 btsui users 6 Jan 1 2018 ..\r\n", "-rw-r--r-- 1 btsui users 869K Dec 30 2017 Homo_sapiens.microbe.fa.fai\r\n", "-rw-r--r-- 1 btsui users 3.2G Dec 30 2017 Homo_sapiens.microbe.fa\r\n", "-rw-r--r-- 1 btsui users 86M Dec 30 2017 Homo_sapiens.fa.gz\r\n" ] } ], "source": [ "!ls -laht /cellar/users/btsui/Data/ensembl/snp_masked/| head" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Homo_sapiens.fa.gz\n" ] } ], "source": [] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'GGCAGGAAGGAATGCTTTAGTAAGGCTGCTTTCAACTACCGAGTCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGTTTTTTCTTTTATGCATGATGGGATCATGTTTAATACAATCTTTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGAATAAAAGGATAAAGGNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTGTTGAGATGGACGCCTATTTGTAAGTTTATTTGTATTTGCCTTTAGCTAAATGTGTGTAAATATACAGTTATACATATATGCATTTCTCAATTTCATACCTTGCTTAATGGGTGTAGATACCAAAAGATAAGAATAAAACACATACAAGTTGGAAATTTCTGGGCCATGNNNNNNNNNNCATGCAAAATCACATTATTGCCAACATGACTTACTTGATCCCCATAAGCATGACGACCTATGATGATAGGTTTTACCCATCCACTCACAAGCCGGGGGATATTTTTGCAGATAATGGCTTCTCTGAAGACCGTGCCACCCAGAATATTTCGTATGGTGCCATTTGGTGATTTCCA
CATTTGTTTCAACTTGAACTCCTCAACCCTCTTCTCATCAGGAGTGATAGTGGCACATTTGACGCCAACATTATGCTTCTTTATAGCTTCTGCAGCATCCTTGGTGACTTGGTCGTTGGTGGCATCACGATTCTCTATGCCTAAATCATAGCTTGAAAGAGAAAAATTAGAAGCAAAGTTTTTCAGACAAATGGATAGTTATAACCTACAACTGCAGTGATGGCATATAGAGCTCATTATGCAGAAAGCGAAGGCTCTGAGTACACCAAAATCTGCCAAGTTTTAGCACTGGCACACCCTAAACACAGAAGATGGGTGCCAACTACAAAGAGAACTAAGAGAGGCTAGACTGCTGGGCTGCCCCTCCTCTTGTAATGGGAAGCTGTCTATCAGGAAAATGAATGGAAACCATCAAACTCTGCCCTTTGCCTCCTGTTTTTCACCAGATAGAGGCAATAATGGCCTGGTTTCTCTCCATATAATTAGTATAATAAGTGTAAGTCTAAGGACCACCTTTTTCCCTTAGTAAATATTGCTCTTTAAAAATAAAAGGGGGTGGGAGGGGTTAAAGGATTTTTTGTATTTTGACGTTGAAGTGGTGGGAGGTAAAGATGTGACAGTTGATGCCAAAAATGCGAATGACTACTTCATTAATGCCAGTTAGTCATAAAATGAGCCTTTCCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCTTCTGTAATTCTTGAGCAAGTGTTTGCT'" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "i=208248388\n", "#0\t3134601\n", "str(record.seq)[i-1000:i+1000]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "#!head /cellar/users/btsui/Data/ensembl/snp_masked/Mus_musculus.fa.gz" ] }, { "cell_type": "code", "execution_count": 364, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'TAAGCATGACGACCTATGAT'" ] }, "execution_count": 364, "metadata": {}, "output_type": "execute_result" } ], "source": [ "str(record.seq)[i-10:i+10]" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3660\n" ] } ], "source": [ "proteome='/cellar/users/btsui/Downloads/viral.2.protein.faa'\n", "with open(proteome) as handle:\n", " for record in SeqIO.parse(handle, \"fasta\") :\n", " print (len(record))*3\n", " break" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"SeqRecord(seq=Seq('MAVNTSGKTRLPQPASEDYTQYARNTLKNLNNVYEKFAVRGPVLALVRPAQFSK...GAV', SingleLetterAlphabet()), id='YP_003620396.1', name='YP_003620396.1', description='YP_003620396.1 p130 [Providence virus]', dbxrefs=[])" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "record" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-r--r-- 1 btsui users 19M Dec 30 13:25 /cellar/users/btsui/Downloads/viral.2.protein.faa\r\n" ] } ], "source": [ "!ls -lah /cellar/users/btsui/Downloads/viral.2.protein.faa" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### create ids for the bed" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "scrolled": true }, "outputs": [], "source": [ "tmpBedDf=pd.read_csv('/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed',header=None,sep='\\t')" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',\n", " '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT'], dtype=object)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tmpBedDf[0].astype(np.str).unique()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "tmpBedDf['Chr']=tmpBedDf['Chr'].astype(np.str)\n", "tmpBedDf['Pos']=tmpBedDf['Pos'].astype(np.str)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "tmpBedDf.columns=['Chr','Pos','']\n", "tmpBedDf['Chr_Pos']=tmpBedDf['Chr']+'-'+tmpBedDf['Pos']" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ 
"tmpBedDf['Id']=tmpBedDf.index" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "tmpBedDf[['Chr_Pos','Id']].drop_duplicates(['Chr_Pos']).to_pickle('/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens_chrom_pos__id.pickle')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "index=tmpDf.Chr.value_counts()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2 35223\n", "1 30833\n", "6 28444\n", "11 23637\n", "5 21139\n", "3 21012\n", "7 20244\n", "12 17929\n", "10 17399\n", "16 17252\n", "9 14998\n", "17 14816\n", "X 14761\n", "19 14662\n", "4 14607\n", "8 13743\n", "13 13635\n", "15 12346\n", "14 10423\n", "20 7769\n", "18 6810\n", "22 6742\n", "17 6050\n", "21 4910\n", "Y 3346\n", "MT 512\n", "Name: Chr, dtype: object" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index.astype(np.str)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 1 }