{
"cells": [
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Reference FASTA file name and its full path under the snp_masked ensembl directory.\n",
"# NOTE: despite the 'Dir' suffix, inDir and withMicrobeDir are file paths, not directories.\n",
"fname = 'Homo_sapiens.fa.gz'\n",
"inDir = '/cellar/users/btsui/Data/ensembl/snp_masked/' + fname\n",
"\n",
"# Output path for the combined reference: same location, '.microbe.fa.gz' suffix.\n",
"withMicrobeDir = inDir.replace('.fa.gz', '.microbe.fa.gz')"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 btsui users 25 Dec 30 12:30 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz\r\n"
]
}
],
"source": [
"!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"specieName='Homo_sapiens'"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.microbe.fa.gz'"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"withMicrobeDir"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Concatenate viral signatures from MetaPhlAn and 16S genes from mothur\n",
"\n",
"https://www.mothur.org/wiki/RDP_reference_files\n",
"\n",
"## bacteria and microbes reference mothur: \n",
"1. trainset16_022016.rdp.tax : id to sequence tax mapping\n",
"2. trainset16_022016.rdp.fasta.gz\n",
"\n",
"## viral \n",
"1. ftp://ftp.ncbi.nih.gov/refseq/release/viral/viral.1.1.genomic.fna.gz\n",
"2. ftp://ftp.ncbi.nih.gov/refseq/release/viral/viral.2.1.genomic.fna.gz"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"!gunzip -c ../Microbiome/viral.1.1_2.1.genomic.fna.gz > ../Microbiome/viral.1.1_2.1.genomic.fa"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#!head ../Microbiome/viral.1.1_2.1.genomic.fa"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/../Microbiome/viral.1.1_2.1.genomic.fa.fai\r\n"
]
}
],
"source": [
"!echo $PWD/../Microbiome/viral.1.1_2.1.genomic.fa.fai"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"pd.read_csv('../Microbiome/viral.1.1_2.1.genomic.fa.fai',sep='\\t',header=None)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#!samtools faidx ../Microbiome/viral.1.1_2.1.genomic.fa"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#!gunzip -c ../Microbiome/viral.1.1.genomic.fna.gz | head "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#!echo $PWD/../Microbiome/"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#!gzip ../Microbiome/trainset16_022016.rdp.fasta"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#!gunzip -c ../Microbiome/viral.1.1_2.1.genomic.fna.gz | head -n 10"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"!cat ../Microbiome/trainset16_022016.rdp.fasta.gz ../Microbiome/viral.1.1_2.1.genomic.fna.gz > microbe.fa.gz"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 btsui users 86M Dec 30 12:51 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz\r\n"
]
}
],
"source": [
"!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.system('cat microbe.fa.gz '+inDir+' >'+withMicrobeDir)"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#!ls -alh /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.microbe.fa"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 btsui users 86M Dec 30 12:51 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz\r\n"
]
}
],
"source": [
"!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#withMicrobeDir"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"gunzipedFaDir=withMicrobeDir.replace('.fa.gz','.fa')"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Remove any stale decompressed FASTA left by a previous run; gunzip will not\n",
"# overwrite an existing output file without -f. A missing file (fresh run) is fine.\n",
"# os.remove is used instead of shelling out to `rm` so the path is never\n",
"# interpreted by a shell.\n",
"try:\n",
"    os.remove(gunzipedFaDir)\n",
"except FileNotFoundError:\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.system('gunzip '+withMicrobeDir)"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.system('samtools faidx '+gunzipedFaDir)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Load the samtools faidx index (.fai, tab-separated, no header row) of the\n",
"# combined microbe + human FASTA.\n",
"# gunzipedFaDir is used here because faDir is only assigned in a later cell;\n",
"# referencing faDir at this point raises NameError on Restart & Run All.\n",
"tmpDf=pd.read_csv(gunzipedFaDir+'.fai',sep='\\t',header=None)"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"m1=tmpDf[0].str.contains('\\\\|')\\nm2=tmpDf[0].str.contains('_')\\ntmpDf.loc[m1,'state']='rdp'\\ntmpDf.loc[m2,'state']='ncbi_virus'\\ntmpDf.loc[~(m1|m2),'state']='human'\\ntmpDf\""
]
},
"execution_count": 159,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\"m1=tmpDf[0].str.contains('\\|')\n",
"m2=tmpDf[0].str.contains('_')\n",
"tmpDf.loc[m1,'state']='rdp'\n",
"tmpDf.loc[m2,'state']='ncbi_virus'\n",
"tmpDf.loc[~(m1|m2),'state']='human'\n",
"tmpDf\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"# Output location for this species' bowtie2 index, and the FASTA to be indexed.\n",
"myDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/'+specieName+'/'\n",
"faDir=gunzipedFaDir\n",
"\n",
"# Start from an empty index directory: drop any previous build, then recreate it.\n",
"# shutil/os are used instead of shelling out to rm/mkdir so the path is never\n",
"# interpreted by a shell and missing parent directories are created as well.\n",
"shutil.rmtree(myDir,ignore_errors=True)\n",
"os.makedirs(myDir,exist_ok=True)\n",
"\n",
"# The build command is only assembled and displayed here; this cell does not run it.\n",
"# NOTE(review): myDir ends with '/', so bowtie2-build would receive an index\n",
"# basename that is empty inside that directory - confirm this is intended.\n",
"cmd='bowtie2-build --threads 64 '+faDir+' '+myDir\n",
"cmd"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#os.system('mkdir '+myDir)"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Read the stats table (tab-separated, no header row), then order rows by column 2.\n",
"# Presumably one row per FASTA record, judging by the file name - TODO confirm.\n",
"tmpDf3=pd.read_csv('/cellar/users/btsui/per_fa_record_stat.txt',sep='\\t',header=None)\n",
"tmpDf3=tmpDf3.sort_values(2)"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 15804 | \n",
" NC_001357.1 | \n",
" 7857 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"15804 NC_001357.1 7857 0 0"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tmpDf3[tmpDf3[0].str.contains('NC_001357')]"
]
},
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 14162 | \n",
" NC_001493.2 | \n",
" 134226 | \n",
" 33 | \n",
" 0 | \n",
"
\n",
" \n",
" 14812 | \n",
" NC_024382.1 | \n",
" 137090 | \n",
" 33 | \n",
" 0 | \n",
"
\n",
" \n",
" 18228 | \n",
" NC_009823.1 | \n",
" 9711 | \n",
" 35 | \n",
" 0 | \n",
"
\n",
" \n",
" 17966 | \n",
" NC_004102.1 | \n",
" 9646 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" 15950 | \n",
" NC_006641.1 | \n",
" 15959 | \n",
" 42 | \n",
" 0 | \n",
"
\n",
" \n",
" 17914 | \n",
" NC_030200.1 | \n",
" 137448 | \n",
" 43 | \n",
" 0 | \n",
"
\n",
" \n",
" 20658 | \n",
" NC_021858.1 | \n",
" 1908524 | \n",
" 59 | \n",
" 0 | \n",
"
\n",
" \n",
" 22235 | \n",
" NC_009127.1 | \n",
" 295146 | \n",
" 59 | \n",
" 0 | \n",
"
\n",
" \n",
" 13939 | \n",
" NC_020231.1 | \n",
" 233501 | \n",
" 61 | \n",
" 0 | \n",
"
\n",
" \n",
" 22138 | \n",
" NC_008912.1 | \n",
" 3141 | \n",
" 61 | \n",
" 0 | \n",
"
\n",
" \n",
" 14731 | \n",
" NC_020474.2 | \n",
" 180421 | \n",
" 61 | \n",
" 0 | \n",
"
\n",
" \n",
" 13843 | \n",
" NC_019491.1 | \n",
" 291144 | \n",
" 66 | \n",
" 0 | \n",
"
\n",
" \n",
" 20983 | \n",
" NC_005261.2 | \n",
" 137821 | \n",
" 68 | \n",
" 0 | \n",
"
\n",
" \n",
" 14853 | \n",
" NC_024709.1 | \n",
" 33452 | \n",
" 69 | \n",
" 0 | \n",
"
\n",
" \n",
" 15780 | \n",
" NC_001499.1 | \n",
" 5894 | \n",
" 77 | \n",
" 0 | \n",
"
\n",
" \n",
" 14004 | \n",
" NC_021312.1 | \n",
" 459984 | \n",
" 78 | \n",
" 0 | \n",
"
\n",
" \n",
" 15691 | \n",
" NC_002794.1 | \n",
" 195859 | \n",
" 102 | \n",
" 0 | \n",
"
\n",
" \n",
" 17235 | \n",
" NC_028834.1 | \n",
" 48216 | \n",
" 115 | \n",
" 0 | \n",
"
\n",
" \n",
" 20659 | \n",
" NC_022098.1 | \n",
" 2473870 | \n",
" 126 | \n",
" 0 | \n",
"
\n",
" \n",
" 14829 | \n",
" NC_024697.1 | \n",
" 370920 | \n",
" 358 | \n",
" 0 | \n",
"
\n",
" \n",
" 14211 | \n",
" NC_022518.1 | \n",
" 9472 | \n",
" 483 | \n",
" 0 | \n",
"
\n",
" \n",
" 15776 | \n",
" NC_001506.1 | \n",
" 3811 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" 21974 | \n",
" NC_008168.1 | \n",
" 104710 | \n",
" 1535 | \n",
" 0 | \n",
"
\n",
" \n",
" 14240 | \n",
" NC_018464.1 | \n",
" 927 | \n",
" 3123 | \n",
" 0 | \n",
"
\n",
" \n",
" 18783 | \n",
" NC_032111.1 | \n",
" 163005 | \n",
" 8688 | \n",
" 0 | \n",
"
\n",
" \n",
" 22813 | \n",
" Y | \n",
" 57227415 | \n",
" 16392 | \n",
" 0 | \n",
"
\n",
" \n",
" 22807 | \n",
" 18 | \n",
" 80373285 | \n",
" 72541 | \n",
" 0 | \n",
"
\n",
" \n",
" 22810 | \n",
" 21 | \n",
" 46709983 | \n",
" 79744 | \n",
" 0 | \n",
"
\n",
" \n",
" 22802 | \n",
" 13 | \n",
" 114364328 | \n",
" 101329 | \n",
" 0 | \n",
"
\n",
" \n",
" 22811 | \n",
" 22 | \n",
" 50818468 | \n",
" 152059 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"14162 NC_001493.2 134226 33 0\n",
"14812 NC_024382.1 137090 33 0\n",
"18228 NC_009823.1 9711 35 0\n",
"17966 NC_004102.1 9646 39 0\n",
"15950 NC_006641.1 15959 42 0\n",
"17914 NC_030200.1 137448 43 0\n",
"20658 NC_021858.1 1908524 59 0\n",
"22235 NC_009127.1 295146 59 0\n",
"13939 NC_020231.1 233501 61 0\n",
"22138 NC_008912.1 3141 61 0\n",
"14731 NC_020474.2 180421 61 0\n",
"13843 NC_019491.1 291144 66 0\n",
"20983 NC_005261.2 137821 68 0\n",
"14853 NC_024709.1 33452 69 0\n",
"15780 NC_001499.1 5894 77 0\n",
"14004 NC_021312.1 459984 78 0\n",
"15691 NC_002794.1 195859 102 0\n",
"17235 NC_028834.1 48216 115 0\n",
"20659 NC_022098.1 2473870 126 0\n",
"14829 NC_024697.1 370920 358 0\n",
"14211 NC_022518.1 9472 483 0\n",
"15776 NC_001506.1 3811 506 0\n",
"21974 NC_008168.1 104710 1535 0\n",
"14240 NC_018464.1 927 3123 0\n",
"18783 NC_032111.1 163005 8688 0\n",
"22813 Y 57227415 16392 0\n",
"22807 18 80373285 72541 0\n",
"22810 21 46709983 79744 0\n",
"22802 13 114364328 101329 0\n",
"22811 22 50818468 152059 0"
]
},
"execution_count": 177,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tmpDf3[tmpDf3[2]>5].iloc[-50:-20]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 1
}