{ "cells": [
{ "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os\n", "import subprocess\n", "\n", "import pandas as pd\n", "\n", "# Gzipped input FASTA: file name and full path.\n", "fname='Homo_sapiens.fa.gz'\n", "inDir='/cellar/users/btsui/Data/ensembl/snp_masked/'+fname\n", "\n", "# Path of the combined FASTA (same folder, '.microbe.fa.gz' suffix instead of '.fa.gz').\n", "withMicrobeDir=inDir.replace('.fa.gz','.microbe.fa.gz')" ] },
{ "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-r--r-- 1 btsui users 25 Dec 30 12:30 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz\r\n" ] } ], "source": [ "!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz" ] },
{ "cell_type": "code", "execution_count": 112, "metadata": { "collapsed": true }, "outputs": [], "source": [ "specieName='Homo_sapiens'" ] },
{ "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.microbe.fa.gz'" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "withMicrobeDir" ] },
{ "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### concat viral signatures from Methplan and 16s genes from mothur\n", "\n", "https://www.mothur.org/wiki/RDP_reference_files\n", "\n", "## bacteria and microbes reference mothur: \n", "1. trainset16_022016.rdp.tax : id to sequence tax mapping\n", "2. trainset16_022016.rdp.fasta.gz\n", "\n", "## viral \n", "1. ftp://ftp.ncbi.nih.gov/refseq/release/viral/viral.1.1.genomic.fna.gz\n", "2. ftp://ftp.ncbi.nih.gov/refseq/release/viral/viral.2.1.genomic.fna.gz" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!gunzip -c ../Microbiome/viral.1.1_2.1.genomic.fna.gz > ../Microbiome/viral.1.1_2.1.genomic.fa" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!head ../Microbiome/viral.1.1_2.1.genomic.fa" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/../Microbiome/viral.1.1_2.1.genomic.fa.fai\r\n" ] } ], "source": [ "!echo $PWD/../Microbiome/viral.1.1_2.1.genomic.fa.fai" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# The .fai index read below is written by samtools faidx. Build it when it is missing so\n", "# this cell does not depend on an index left over from an earlier session.\n", "viralFaDir='../Microbiome/viral.1.1_2.1.genomic.fa'\n", "if not os.path.exists(viralFaDir+'.fai'):\n", "    # check_call raises CalledProcessError if samtools exits with a non-zero status.\n", "    subprocess.check_call(['samtools','faidx',viralFaDir])\n", "pd.read_csv(viralFaDir+'.fai',sep='\\t',header=None)\n" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!samtools faidx ../Microbiome/viral.1.1_2.1.genomic.fa" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!gunzip -c ../Microbiome/viral.1.1.genomic.fna.gz | head " ] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!echo $PWD/../Microbiome/" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!gzip ../Microbiome/trainset16_022016.rdp.fasta" ] },
{ "cell_type": "code", "execution_count": 120, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!gunzip -c ../Microbiome/viral.1.1_2.1.genomic.fna.gz | head -n 10" ] },
{ "cell_type": "code", "execution_count": 121, "metadata": { "collapsed": true },
"outputs": [], "source": [ "!cat ../Microbiome/trainset16_022016.rdp.fasta.gz ../Microbiome/viral.1.1_2.1.genomic.fna.gz > microbe.fa.gz" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-r--r-- 1 btsui users 86M Dec 30 12:51 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz\r\n" ] } ], "source": [ "!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.system('cat microbe.fa.gz '+inDir+' >'+withMicrobeDir)" ] }, { "cell_type": "code", "execution_count": 139, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!ls -alh /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.microbe.fa" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-r--r-- 1 btsui users 86M Dec 30 12:51 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz\r\n" ] } ], "source": [ "!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz" ] }, { "cell_type": "code", "execution_count": 125, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#withMicrobeDir" ] }, { "cell_type": "code", "execution_count": 126, "metadata": { "collapsed": true }, "outputs": [], "source": [ "gunzipedFaDir=withMicrobeDir.replace('.fa.gz','.fa')" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.system('rm '+gunzipedFaDir)" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, 
"execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.system('gunzip '+withMicrobeDir)" ] }, { "cell_type": "code", "execution_count": 130, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.system('samtools faidx '+gunzipedFaDir)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 151, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#import pandas as pd\n", "tmpDf=pd.read_csv(faDir+'.fai',sep='\\t',header=None)\n", "\n" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"m1=tmpDf[0].str.contains('\\\\|')\\nm2=tmpDf[0].str.contains('_')\\ntmpDf.loc[m1,'state']='rdp'\\ntmpDf.loc[m2,'state']='ncbi_virus'\\ntmpDf.loc[~(m1|m2),'state']='human'\\ntmpDf\"" ] }, "execution_count": 159, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"m1=tmpDf[0].str.contains('\\|')\n", "m2=tmpDf[0].str.contains('_')\n", "tmpDf.loc[m1,'state']='rdp'\n", "tmpDf.loc[m2,'state']='ncbi_virus'\n", "tmpDf.loc[~(m1|m2),'state']='human'\n", "tmpDf\"\"\"" ] }, { "cell_type": "code", "execution_count": 160, "metadata": { "collapsed": true }, "outputs": [], "source": [ "myDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/'+specieName+'/'\n", "faDir=gunzipedFaDir\n", "#tmpDir='/tmp/btsui/'+fname\n", "\n", "\n", "\n", "\n", "#os.system('cp '+faDir+' '+tmpDir)\n", "#os.system('gunzip '+tmpDir)\n", "\n", "\n", "\n", "\n", "\n", "\n", "cmd= 'bowtie2-build --threads 64 '+faDir+' '+myDir\n", "os.system('rm -r '+myDir)\n", "os.system('mkdir '+myDir)\n", "cmd" ] }, { "cell_type": "code", "execution_count": 118, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#os.system('mkdir '+myDir)" ] }, { "cell_type": "code", "execution_count": 166, 
"metadata": { "collapsed": true }, "outputs": [], "source": [ "tmpDf3=pd.read_csv('/cellar/users/btsui/per_fa_record_stat.txt',sep='\\t',header=None).sort_values(2)" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
15804NC_001357.1785700
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "15804 NC_001357.1 7857 0 0" ] }, "execution_count": 171, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tmpDf3[tmpDf3[0].str.contains('NC_001357')]" ] }, { "cell_type": "code", "execution_count": 177, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
14162NC_001493.2134226330
14812NC_024382.1137090330
18228NC_009823.19711350
17966NC_004102.19646390
15950NC_006641.115959420
17914NC_030200.1137448430
20658NC_021858.11908524590
22235NC_009127.1295146590
13939NC_020231.1233501610
22138NC_008912.13141610
14731NC_020474.2180421610
13843NC_019491.1291144660
20983NC_005261.2137821680
14853NC_024709.133452690
15780NC_001499.15894770
14004NC_021312.1459984780
15691NC_002794.11958591020
17235NC_028834.1482161150
20659NC_022098.124738701260
14829NC_024697.13709203580
14211NC_022518.194724830
15776NC_001506.138115060
21974NC_008168.110471015350
14240NC_018464.192731230
18783NC_032111.116300586880
22813Y57227415163920
228071880373285725410
228102146709983797440
22802131143643281013290
2281122508184681520590
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "14162 NC_001493.2 134226 33 0\n", "14812 NC_024382.1 137090 33 0\n", "18228 NC_009823.1 9711 35 0\n", "17966 NC_004102.1 9646 39 0\n", "15950 NC_006641.1 15959 42 0\n", "17914 NC_030200.1 137448 43 0\n", "20658 NC_021858.1 1908524 59 0\n", "22235 NC_009127.1 295146 59 0\n", "13939 NC_020231.1 233501 61 0\n", "22138 NC_008912.1 3141 61 0\n", "14731 NC_020474.2 180421 61 0\n", "13843 NC_019491.1 291144 66 0\n", "20983 NC_005261.2 137821 68 0\n", "14853 NC_024709.1 33452 69 0\n", "15780 NC_001499.1 5894 77 0\n", "14004 NC_021312.1 459984 78 0\n", "15691 NC_002794.1 195859 102 0\n", "17235 NC_028834.1 48216 115 0\n", "20659 NC_022098.1 2473870 126 0\n", "14829 NC_024697.1 370920 358 0\n", "14211 NC_022518.1 9472 483 0\n", "15776 NC_001506.1 3811 506 0\n", "21974 NC_008168.1 104710 1535 0\n", "14240 NC_018464.1 927 3123 0\n", "18783 NC_032111.1 163005 8688 0\n", "22813 Y 57227415 16392 0\n", "22807 18 80373285 72541 0\n", "22810 21 46709983 79744 0\n", "22802 13 114364328 101329 0\n", "22811 22 50818468 152059 0" ] }, "execution_count": 177, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tmpDf3[tmpDf3[2]>5].iloc[-50:-20]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 1 }