{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os\n", "\n", "# Configuration: bowtie2 index location, species, adapter sequence, working directory and tool paths.\n", "baseGenomesDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/'\n", "###baseGenomesDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/'\n", "####out_dir='\n", "specie='Homo_sapiens'\n", "adaptor='AGATCGGAAGAGC'\n", "myTmpDir='/nrnb/users/btsui/Data/tcga_raw/2b0048e0-a062-40d2-a1e1-4bb763ea0ead/'\n", "bam_read_count_dir='/cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount'\n", "\n", "genomeDir=baseGenomesDir+specie+'/'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "os.chdir(myTmpDir)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tcgaBamName='C494.TCGA-S9-A6U1-01A-21D-A33T-08.1_gdc_realn.bam'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "os.system('bamToFastq -i '+tcgaBamName + ' -fq /dev/stdout | head -n 4000 > test.fq')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "trim_galore_Dir='/cellar/users/btsui/Program/TRIMAGLORE//trim_galore'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "os.system(trim_galore_Dir+' '+'test.fq')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Recreate the named pipe; -f keeps rm quiet when the pipe does not exist yet.\n", "os.system('rm -f fastq_pipe')\n", "os.system('mkfifo fastq_pipe ')\n", "# Background producer: BAM -> FASTQ -> adapter trimming -> pipe, using the adaptor set in the config cell.\n", "os.system('bamToFastq -i '+tcgaBamName+' -fq /dev/stdout | cutadapt -a '+adaptor+' - > fastq_pipe &')\n", "\n", "# The 2> redirect must sit on the bowtie2 stage: placed at the end of the pipeline it only\n", "# captures stderr of samtools sort, and the bowtie2 log never reaches bowtie_log.txt.\n", "cmd_1='bowtie2 --local -x {ref} -U fastq_pipe --no-unal --threads 64 2>bowtie_log.txt | samtools view -bS - | samtools sort - > sorted.bam'.format(ref=genomeDir)\n", "os.system(cmd_1)" ] }, { "cell_type": "code", "execution_count": 18, 
"metadata": { "collapsed": true }, "outputs": [], "source": [ "# Per-species inputs for bam-readcount: SNP site list and reference FASTA.\n", "snpBed='/cellar/users/btsui/Data/dbsnp/snp_beds/'+specie+'.bed'\n", "fa_dir='/cellar/users/btsui/Data/ensembl/snp_masked/'+specie+'.microbe.fa'" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount C494.TCGA-S9-A6U1-01A-21D-A33T-08.1_gdc_realn.bam |gzip > tcga.txt.gz'" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "###extract bam read count from TCGA files\n", "# Build the command from tcgaBamName (set in an earlier cell) instead of repeating the file name.\n", "cmd_5=bam_read_count_dir+' '+tcgaBamName+' |gzip > tcga.bam_read_count.txt.gz'\n", "cmd_5" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount -l /cellar/users/btsui/Data/dbsnp/snp_beds/Homo_sapiens.bed -f /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.microbe.fa file_sorted |gzip > snp.txt.gz'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "specie='Homo_sapiens'\n", "snpBed='/cellar/users/btsui/Data/dbsnp/snp_beds/'+specie+'.bed'\n", "fa_dir='/cellar/users/btsui/Data/ensembl/snp_masked/'+specie+'.microbe.fa'\n", "\n", "# NOTE(review): this reassigns cmd_5 from the previous cell; file_sorted looks like a placeholder input name - confirm.\n", "cmd_5=bam_read_count_dir+' -l '+snpBed+' -f '+fa_dir+' file_sorted |gzip > snp.txt.gz'\n", "cmd_5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### scratch" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#!ls /cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "'bamToFastq -i TCGA-FG-6690-01A-11D-1891_130919_SN590_0238_AC29TRACXX_s_7_rg.sorted.bam -fq /dev/stdout |head -n 2000000 > test.out '" ] }, 
"execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#'bamToFastq -i G35154.TCGA-DU-6407-01A-13D-1705-08.1.bam -fq test.fq '" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!ls /cellar/users/btsui/Data/dbsnp/snp_beds/Homo_sapiens.bed" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'bowtie2 -x /cellar/users/btsui/Data/BOWTIE2_GENOME_INDEX/Homo_sapiens/ -U test.out --threads 2 > out.sam 2>bowtie_log.txt'" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "### try processing one\n", "myTmpDir='/nrnb/users/btsui/Data/tcga_raw/52ae2dd2-f573-41c6-ad1a-18b19c9eea35/'\n", "cmd_1='bowtie2 -x {ref} -U test.out --threads 2 > out.sam 2>bowtie_log.txt'.format(ref=genomeDir)\n", "cmd_1" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\nmilestone:\\n 1. past fastq\\nstill need:\\n 2. complete the bam\\n 3. Output entire bam \\n 4. check for IDH1 mutation for the ATGC\\n'" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "milestone:\n", " 1. past fastq\n", "still need:\n", " 2. complete the bam\n", " 3. Output entire bam \n", " 4. 
check for IDH1 mutation for the ATGC\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'bamToFastq -i TCGA-FG-6690-01A-11D-1891_130919_SN590_0238_AC29TRACXX_s_7_rg.sorted.bam -fq fastq_pipe'" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'mkfifo fastq_pipe '\n", "'bamToFastq -i 2.bam -fq fastq_pipe &'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "'bamToFastq -i G35154.TCGA-DU-6407-01A-13D-1705-08.1.bam -fq /dev/stdout | head -n 400000 > test.fq'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "'bamToFastq -i '" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'bowtie2 --local -x /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX//Homo_sapiens/ -U small.name.sorted.fq --no-unal --threads 64 | samtools view -bS - | samtools sort - > sorted.bam 2>bowtie_log.txt'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cmd_1='bowtie2 --local -x {ref} -U small.name.sorted.fq --no-unal --threads 64 | samtools view -bS - | samtools sort - > sorted.bam 2>bowtie_log.txt'.format(ref=genomeDir)\n", "cmd_1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#'samtools sort output.bam > output.sorted'\n", "#test case vcf: 6f5b793c-9040-4fd7-8b32-2fe33bc8c7d2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "fa_dir='/cellar/users/btsui/Data/ensembl/release/hg19/Homo_sapiens.GRCh37.75.dna_rm.toplevel.fa'" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": true }, "outputs": [], "source": [ 
"#fa_dir='/cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa'" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Homo_sapiens\r\n" ] } ], "source": [ "!ls /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'samtools index sorted.bam'" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'samtools index sorted.bam'" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": true }, "outputs": [], "source": [ "bam_read_count_dir='/cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount'" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'samtools index sorted.bam'" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'samtools index sorted.bam'" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount -l loci.txt -f /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa sorted.bam'" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#bam-readcount -f ref.fa some.bam\n", "bam_read_count_dir+' -l loci.txt -f '+fa_dir+' sorted.bam'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#R132, R133" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'gdc-client download -t 
/cellar/users/btsui/../hcarter/gdc-user-token.2017-12-11T21_35_55.818Z.txt -d /nrnb/users/btsui/Data/tcga_raw/ 6f5b793c-9040-4fd7-8b32-2fe33bc8c7d2'" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "it's the bam to fastq is almost at 0 utilization. \n", "\"\"\"\n", "token_dir='/cellar/users/btsui/../hcarter/gdc-user-token.2017-12-11T21_35_55.818Z.txt'\n", "out_dir='/nrnb/users/btsui/Data/tcga_raw/'\n", "file_uuid='6f5b793c-9040-4fd7-8b32-2fe33bc8c7d2'\n", "gdc_cmd_fmt='gdc-client download -t {token_dir} -d {out_dir} {file_uuid}'\n", "gdc_cmd_fmt.format(token_dir=token_dir,out_dir=out_dir,file_uuid=file_uuid)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#rs121913500, 86 \n", "#209113110\n", "#2 209113112 rs121913500 C T 61 PASS DB;DP=165;Gene=IDH1;MQ0=0;SOMATIC;SS=Somatic;VC=Missense_Mutation;VT=SNP;TID=uc002vcs.3;VLSC=255 GT:AD:DP:FA:MQ0:BQ:SS:SSC 0/0:79,0:79:0.000:0:.:2:. 
0/1:63,23:86:0.267:0:31:2:.\n", "#86 for primary tumor \n", "\n", "#up the alignment rate for the unmasked genome first, \n", "\n", "\n", "## test with chromosome 2 bam \n", "'samtools view -b -f 0x02 small.bam | samtools sort -n - > small.name.sorted.bam'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "'bamToFastq -i small.name.sorted.bam -fq small.name.sorted.fq'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "'samtools view -h 2.name.sorted.bam | head -n 1000 > 2.name.sorted.little.sam'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "'samtools view -h 2.name.sorted.bam | head -n 100000 | samtools view -bS - > 2.name.sorted.little.bam'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "'samtools view -b -f 0x02 G35154.TCGA-DU-6407-01A-13D-1705-08.1.bam 2 > 2.bam'" ] }, { "cell_type": "code", "execution_count": 88, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [], "source": [ "sra_dump=pd.read_pickle('/cellar/users/btsui/Data/METAMAP/deploy_ready/syn11415787/meta_data/technical_meta_df.pickle')" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Samplepercent_reads_alignedtotal_num_reads_alignedScientificNameRunExperimentStudyLibraryStrategyproj_accession_BioProjectproj_accession_BioSampleproj_accession_Spotsproj_accession_Basesproj_accession_Loadedproj_accession_Visibilityproj_accession_Centerproj_accession_Type
Run
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Sample, percent_reads_aligned, total_num_reads_aligned, ScientificName, Run, Experiment, Study, LibraryStrategy, proj_accession_BioProject, proj_accession_BioSample, proj_accession_Spots, proj_accession_Bases, proj_accession_Loaded, proj_accession_Visibility, proj_accession_Center, proj_accession_Type]\n", "Index: []" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# ScientificName values are space separated (e.g. 'Mus musculus'), so match 'Homo sapiens'\n", "# rather than the underscore form used for directory names.\n", "sra_dump[sra_dump.ScientificName=='Homo sapiens']" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Mus musculus 113073\n", "Name: ScientificName, dtype: int64" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Species represented in the metadata table.\n", "sra_dump.ScientificName.value_counts()" ] }, { "cell_type": "code", "execution_count": 115, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!cp ../../Scratch/MaskingGenomeWithSnp.ipynb ." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-r--r-- 1 btsui users 4873 Jan 24 13:27 ./tmp.tcga.txt.gz\r\n" ] } ], "source": [ "!ls -alt ./tmp.tcga.txt.gz" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#!mv ./2b0048e0-a062-40d2-a1e1-4bb763ea0ead.tcga.txt.gz ../Analysis/." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 1 }