{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import synapseclient\n",
      "import os\n",
      "import sys\n",
      "import itertools\n",
      "#adding the lib path\n",
      "sys.path.append('/Users/abhishek/dev/appys/lib/')\n",
      "\n",
      "#internal modules\n",
      "import utils"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "! ls -l $synapseclient.__file__"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#login to Synapse\n",
      "syn = synapseclient.Synapse()\n",
      "syn.login()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#settings\n",
      "#forwarded as forceVersion to the syn.store calls for the scripts and per-sample files\n",
      "syn_forceVersion = False\n",
      "#forwarded as synapseStore when the fastq and bam File entities are created\n",
      "syn_STORE = False"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#syn ids\n",
      "project_folder = 'syn2275628'\n",
      "scripts_folder = 'syn2276110'\n",
      "tophat_used = 'syn2243144'\n",
      "cufflinks_used = 'syn2243146'\n",
      "fastqs_folder = 'syn2276483'\n",
      "bams_folder = 'syn2276484'\n",
      "expressions_file_folder = 'syn2276109'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#gather data\n",
      "basedata_dir = \"/Users/abhishek/apratap_bt/projects/PCBC_integrative_analysis/data/\"\n",
      "read1_fastqs = utils.get_FilesList(basedata_dir, pattern=\"*LIB*R1.fastq.bz2\")\n",
      "read2_fastqs = utils.get_FilesList(basedata_dir, pattern=\"*LIB*R2.fastq.bz2\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#push scripts\n",
      "#upload the tophat and cufflinks run scripts to the Synapse scripts folder\n",
      "tophatScript = syn.store(synapseclient.File(\"/Users/abhishek/apratap_bt/projects/PCBC_integrative_analysis/scripts/runTophat.sh\",\n",
      "                                            name='tophat script', parent=scripts_folder),forceVersion=syn_forceVersion)\n",
      "tophat = syn.get(tophat_used,downloadFile=False,version=1)\n",
      "\n",
      "cufflinksScript = syn.store(synapseclient.File(\"/Users/abhishek/apratap_bt/projects/PCBC_integrative_analysis/scripts/runCufflinks.sh\",\n",
      "                                               name='cufflinks script', parent=scripts_folder),forceVersion=syn_forceVersion)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def create_fastq_page_markdown(fastq,attachments):\n",
      "    \"\"\"Build the wiki markdown for the QC report of one fastq file.\n",
      "\n",
      "    fastq       : path of the fastq file (kept for the callers; not used in the text)\n",
      "    attachments : paths of the QC images attached to the wiki\n",
      "\n",
      "    Returns a '## QC Report' heading followed, for every image, by a\n",
      "    'Metric <name>' label (file name without '.png', '_' replaced by ' ')\n",
      "    and an ${image?...} tag that references the attachment by its file name.\n",
      "    \"\"\"\n",
      "    markdown = '## QC Report \\n'\n",
      "\n",
      "    for image in attachments:\n",
      "        image_file = os.path.basename(image)\n",
      "        markdown += 'Metric %s' % image_file.replace('.png','').replace('_',' ')\n",
      "        markdown += '${image?fileName=%s&align=none&scale=50}' % image_file\n",
      "    return(markdown)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def readAlignSummary(fileName):\n",
      "    \"\"\"Read the mapping statistics from an align summary text file.\n",
      "\n",
      "    The values are taken from fixed line positions:\n",
      "      line 2 / 3 : number of reads / mapped reads for read 1\n",
      "      line 6 / 7 : number of reads / mapped reads for read 2\n",
      "      line 9     : overall percent mapped (text up to the first '%')\n",
      "    Lines 1, 4, 5 and 8 are skipped.\n",
      "\n",
      "    Returns (numberReads_1, mappedReads_1, numberReads_2, mappedReads_2, percentMapped)\n",
      "    where the four counts are ints and percentMapped is a string ending in '%'.\n",
      "    \"\"\"\n",
      "    with open(fileName) as f:\n",
      "        f.readline() #removes first line\n",
      "        numberReads_1 = int(f.readline().split(':')[1])\n",
      "        mappedReads_1 = int(f.readline().split(':')[1].split('(')[0])\n",
      "        f.readline()\n",
      "        f.readline()\n",
      "        numberReads_2 = int(f.readline().split(':')[1])\n",
      "        mappedReads_2 = int(f.readline().split(':')[1].split('(')[0])\n",
      "        f.readline()\n",
      "        percentMapped = f.readline().split('%')[0]+'%'\n",
      "    return numberReads_1, mappedReads_1,numberReads_2, mappedReads_2, percentMapped"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "for read1_fastq,read2_fastq in itertools.izip(read1_fastqs,read2_fastqs):\n",
      "    read1_prefix = os.path.basename(read1_fastq).replace('_R1.fastq.bz2','')\n",
      "    read2_prefix = os.path.basename(read2_fastq).replace('_R2.fastq.bz2','')\n",
      "\n",
      "    #sanity check for read 1 and read 2 of same fastq/ sample\n",
      "    #stops at the first mismatched pair; nothing after it is uploaded\n",
      "    if read1_prefix != read2_prefix:\n",
      "        print('not read 1 and read 2.. prefix different')\n",
      "        print ('%s != %s' % (read1_prefix,read2_prefix))\n",
      "        break\n",
      "\n",
      "    lib,sample,index,lane = read1_prefix.split('_')\n",
      "    fastq_annotations = {'lib':lib, 'sample':sample, 'index':index,\n",
      "                         'lane':lane, 'dataType':'mRNA'}\n",
      "\n",
      "    tophat_dir = os.path.dirname(read1_fastq) + '/tophat_out/'\n",
      "    align_summary_file = tophat_dir + 'align_summary.txt'\n",
      "    mapped_bam_file = tophat_dir + 'accepted_hits.bam'\n",
      "\n",
      "    cufflinks_dir = os.path.dirname(read1_fastq) + '/cufflinks/'\n",
      "    genes_fpkm_file = cufflinks_dir + 'genes.fpkm_tracking'\n",
      "\n",
      "    #mapping stats\n",
      "    numberReads_1, mappedReads_1, numberReads_2, mappedReads_2, percentMapped = readAlignSummary(align_summary_file)\n",
      "\n",
      "    mapping_annotations = {'nReads_1' : numberReads_1, 'nReads_2' : numberReads_2,\n",
      "                           'mappedReads_1' : mappedReads_1, 'mappedReads_2' : mappedReads_2,\n",
      "                           'percentMapped' : percentMapped }\n",
      "\n",
      "    #bam / expression files carry the fastq annotations plus the mapping stats\n",
      "    mapping_annotations = dict(fastq_annotations.items() + mapping_annotations.items())\n",
      "\n",
      "\n",
      "    ################\n",
      "    #Upload files to synapse\n",
      "    ################\n",
      "\n",
      "    ######\n",
      "    #fastqs\n",
      "    #######\n",
      "    #add read 1\n",
      "    syn_read1_fastq = synapseclient.File(read1_fastq, parent= fastqs_folder,\n",
      "                                         synapseStore = syn_STORE,\n",
      "                                         name = sample+'_read1.fastq', annotations=fastq_annotations)\n",
      "    syn_read1_fastq['read'] = 1\n",
      "    syn_read1_fastq = syn.store(syn_read1_fastq ,forceVersion = syn_forceVersion)\n",
      "    #add read 2\n",
      "    syn_read2_fastq = synapseclient.File(read2_fastq, parent= fastqs_folder,\n",
      "                                         synapseStore = syn_STORE,\n",
      "                                         name = sample+'_read2.fastq', annotations=fastq_annotations)\n",
      "    syn_read2_fastq['read'] = 2\n",
      "    syn_read2_fastq = syn.store(syn_read2_fastq ,forceVersion = syn_forceVersion)\n",
      "\n",
      "\n",
      "    #########\n",
      "    #push QC images to a wiki\n",
      "    #########\n",
      "\n",
      "    #get list of files to attach\n",
      "    fastqc_folder_read1 = os.path.dirname(read1_fastq) + '/FastQC/' + read1_prefix + '_R1_fastqc/'\n",
      "    fastqc_read1_images = utils.get_FilesList(fastqc_folder_read1+'/Images',pattern=\"*png\")\n",
      "    #create the wiki\n",
      "    wiki_read1_fastq = synapseclient.Wiki(title ='QC Report %s' % sample,\n",
      "                                          owner = syn_read1_fastq.id,\n",
      "                                          attachments = fastqc_read1_images)\n",
      "    #create and push the markdown\n",
      "    wiki_read1_fastq.markdown = create_fastq_page_markdown(read1_fastq,fastqc_read1_images)\n",
      "    wiki_read1_fastq = syn.store(wiki_read1_fastq)\n",
      "\n",
      "\n",
      "    #get list of files to attach\n",
      "    fastqc_folder_read2 = os.path.dirname(read2_fastq) + '/FastQC/' + read2_prefix + '_R2_fastqc/'\n",
      "    fastqc_read2_images = utils.get_FilesList(fastqc_folder_read2+'/Images',pattern=\"*png\")\n",
      "    #create the wiki\n",
      "    wiki_read2_fastq = synapseclient.Wiki(title ='QC Report %s' % sample,\n",
      "                                          owner = syn_read2_fastq.id,\n",
      "                                          attachments = fastqc_read2_images)\n",
      "    #create and push the markdown\n",
      "    wiki_read2_fastq.markdown = create_fastq_page_markdown(read2_fastq,fastqc_read2_images)\n",
      "    wiki_read2_fastq = syn.store(wiki_read2_fastq)\n",
      "\n",
      "\n",
      "    #######\n",
      "    #mapped bams\n",
      "    #######\n",
      "    mapped_bam = synapseclient.File(mapped_bam_file, parent=bams_folder,\n",
      "                                    annotations=mapping_annotations,\n",
      "                                    name=sample+'.bam',\n",
      "                                    synapseStore = syn_STORE)\n",
      "    mapped_bam['fileType'] = 'bam'\n",
      "    mapped_bam['bamType'] = 'mapped'\n",
      "    mapped_bam = syn.store(mapped_bam , used=[syn_read1_fastq,syn_read2_fastq],\n",
      "                           executed=[tophat, tophatScript],\n",
      "                           forceVersion = syn_forceVersion)\n",
      "\n",
      "    ######\n",
      "    #expression counts : genes\n",
      "    ######\n",
      "    #NOTE(review): synapseStore is hard-coded to True here while the fastqs and the bam\n",
      "    #use syn_STORE -- confirm this is intentional\n",
      "    genes_fpkm = synapseclient.File(genes_fpkm_file, parent=expressions_file_folder,\n",
      "                                    annotations=mapping_annotations,\n",
      "                                    name = sample+'_genes.fpkm_tracking',\n",
      "                                    synapseStore = True)\n",
      "    genes_fpkm['fileType'] = 'genes_fpkm'\n",
      "    #cufflinks_used is passed as a plain Synapse id, tophat above as the fetched entity\n",
      "    genes_fpkm = syn.store(genes_fpkm, used = [mapped_bam],\n",
      "                           executed = [cufflinks_used,cufflinksScript],\n",
      "                           forceVersion = syn_forceVersion)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#merging the expression calls\n",
      "mergeScript = syn.store(synapseclient.File(\"/Users/abhishek/apratap_bt/projects/PCBC_integrative_analysis/scripts/mergeExpression.py\",\n",
      "                                           name=\"merge Expression script\"\n",
      "                                           , parent=scripts_folder),forceVersion=syn_forceVersion)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#ids of every entity stored directly under the expression files folder\n",
      "fpkm_files_used = [l['entity.id'] for l in list(syn.chunkedQuery(\"select id from entity where parentId=='%s'\" % expressions_file_folder))]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#push the merged data\n",
      "mergedExp = synapseclient.File(\"/Users/abhishek/apratap_bt/projects/PCBC_integrative_analysis/data/summarized_expression_calls.tsv\",\n",
      "                               name = \"Summarized Expression Calls\",\n",
      "                               parent = project_folder)\n",
      "\n",
      "mergedExp = syn.store(mergedExp, used = fpkm_files_used,\n",
      "                      executed = mergeScript\n",
      "                      )\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "."
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "."
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        ","
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "!"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "!"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "!\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Upload completed in 6 seconds.\n"
       ]
      }
     ],
     "prompt_number": 30
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#push the heatmap / correlation script\n",
      "#FIXME: unfinished -- the script path below was never completed (unterminated string),\n",
      "#so the statement is commented out to keep the notebook runnable until it is filled in\n",
      "#geneExp_analysis_script = syn.store( synapseclient.File('/Users/abhishek/apratap_bt/dev/apRs/\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "?synapseclient.File"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}