{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Authentication\n", "The first step is to authorize access to BigQuery and the Google Cloud. For more information see ['Quick Start Guide to ISB-CGC'](https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/HowToGetStartedonISB-CGC.html) and alternative authentication methods can be found [here](https://googleapis.github.io/google-cloud-python/latest/core/auth.html)." ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n" ] } ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline\n", "from google.cloud import bigquery\n", "import seaborn as sns\n", "from scipy import stats\n", "import re_module.bq_functions as regulome" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "## PARAMETERS\n", "\n", "# information from Bigquery tables\n", "Features = { 'Gene Expression' : { 'table' : 'isb-cgc.TCGA_hg38_data_v0.RNAseq_Gene_Expression',\n", " 'symbol' : 'gene_name',\n", " 'study' : 'project_short_name',\n", " 'data' : 'AVG( LOG10( HTSeq__Counts + 1 ) ) ',\n", " 'rnkdata': 'data',\n", " 'avgdat' : 'avgdata', \n", " 'patientcode': 'case_barcode',\n", " 'samplecode': 'sample_barcode',\n", " 'where' : 'AND HTSeq__Counts IS NOT NULL',\n", " 'dattype': 'numeric' },\n", " 'MicroRNA Expression': { 'table' : 'isb-cgc.TCGA_hg38_data_v0.miRNAseq_Expression',\n", " 'symbol' : 'mirna_id',\n", " 'study' : 'project_short_name',\n", " 'data' : 'AVG( reads_per_million_miRNA_mapped )',\n", " 'rnkdata': 'data',\n", " 'avgdat' : 'avgdata',\n", " 'patientcode': 'case_barcode',\n", " 'samplecode': 'sample_barcode',\n", " 'where' : 'AND reads_per_million_miRNA_mapped IS NOT NULL',\n", " 'dattype': 'numeric'}\n", " \n", " }\n", "\n", "# parameteres for Gene expression data\n", "feat1 = Features['Gene Expression']\n", "cohort1 = feat1['patientcode'] + \" IN ( SELECT case_barcode FROM cohort ) \" #\" IN UNNEST(@PATIENTLIST) \"\n", "labels1 = ' IS NOT NULL ' #' = \\'TP53\\' '\n", "\n", "# parameteres for Gene expression data\n", "feat2 = Features['MicroRNA Expression'] \n", "cohort2 = feat2['patientcode'] + \" IN ( SELECT case_barcode FROM cohort ) \" # \" IN UNNEST(@PATIENTLIST) \"\n", "labels2 = ' IS NOT NULL '\n", "\n", "# minimum correlation\n", "min_corr = 0.5\n", "\n", "# minimun number of samples\n", "nsamples = 25" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read cohort" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "cohort = \"\"\"\n", "cohort AS(\n", "SELECT case_barcode FROM `isb-cgc.TCGA_bioclin_v0.Clinical`\n", "WHERE project_short_name = \"TCGA-BRCA\" AND age_at_diagnosis <=50 AND\n", " pathologic_stage IN ('Stage II', 'Stage IIA', 'Stage IIB') AND\n", " race IN ('WHITE', 'BLACK OR AFRICAN AMERICAN') AND ethnicity = \"NOT HISPANIC OR LATINO\"\n", ")\n", "\"\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Queries to read tables" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "query_t1 = regulome.generic_numeric_bqtable ( 'table1' , feat1, cohort1, labels1 )\n", "query_t2 = regulome.generic_numeric_bqtable ( 'table2' , feat2, cohort2, labels2 ) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Queries to join tables and perform statistics" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WITH\n", "cohort AS(\n", "SELECT case_barcode FROM `isb-cgc.TCGA_bioclin_v0.Clinical`\n", "WHERE project_short_name = \"TCGA-BRCA\" AND age_at_diagnosis <=50 AND\n", " pathologic_stage IN ('Stage II', 'Stage IIA', 'Stage IIB') AND\n", " race IN ('WHITE', 'BLACK OR AFRICAN AMERICAN') AND ethnicity = \"NOT HISPANIC OR LATINO\"\n", ")\n", ",\n", "table1 AS (\n", "SELECT\n", " symbol,\n", " data AS rnkdata,\n", " ParticipantBarcode\n", "FROM (\n", " SELECT\n", " gene_name AS symbol, \n", " AVG( LOG10( HTSeq__Counts + 1 ) ) AS data,\n", " case_barcode AS ParticipantBarcode\n", " FROM `isb-cgc.TCGA_hg38_data_v0.RNAseq_Gene_Expression`\n", " WHERE case_barcode IN ( SELECT case_barcode FROM cohort ) # cohort \n", " AND gene_name IS NOT NULL # labels \n", " AND HTSeq__Counts IS NOT NULL \n", " GROUP BY\n", " ParticipantBarcode, symbol\n", " )\n", ")\n", ",\n", "table2 AS (\n", "SELECT\n", " symbol,\n", " data AS rnkdata,\n", " ParticipantBarcode\n", "FROM (\n", " SELECT\n", " mirna_id AS symbol, \n", " AVG( reads_per_million_miRNA_mapped ) AS data,\n", " case_barcode AS ParticipantBarcode\n", " FROM `isb-cgc.TCGA_hg38_data_v0.miRNAseq_Expression`\n", " WHERE case_barcode IN ( SELECT case_barcode FROM cohort ) # cohort \n", " AND mirna_id IS NOT NULL # labels \n", " AND reads_per_million_miRNA_mapped IS NOT NULL \n", " GROUP BY\n", " ParticipantBarcode, symbol\n", " )\n", ")\n", ",\n", "summ_table AS (\n", "SELECT \n", " n1.symbol as symbol1,\n", " n2.symbol as symbol2,\n", " COUNT( n1.ParticipantBarcode ) as n,\n", " ABS(CORR(n1.rnkdata , n2.rnkdata)) as correlation\n", " \n", "FROM\n", " table1 AS n1\n", "INNER JOIN\n", " table2 AS n2\n", "ON\n", " n1.ParticipantBarcode = n2.ParticipantBarcode\n", " \n", "GROUP BY\n", " symbol1, symbol2\n", ")\n", "SELECT symbol1, symbol2, n, correlation AS abs_correlation\n", "FROM summ_table\n", "WHERE \n", " n > 25 AND correlation > 0.5\n", "LIMIT 100\n", "\n" ] } ], "source": [ "summ_query = regulome.get_summarized_table('Gene Expression',feat1,'MicroRNA Expression',feat2)\n", "\n", "sql = ( 'WITH' + cohort + ',' + query_t1 + ',' + query_t2 + ',' + summ_query + \"\"\"\n", "SELECT symbol1, symbol2, n, correlation AS abs_correlation\n", "FROM summ_table\n", "WHERE \n", " n > {0} AND correlation > {1}\n", "LIMIT 100\n", "\"\"\".format( str(nsamples) , str(min_corr) ) )\n", "\n", "print( sql ) # THIS Query perform the computations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run the Query" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " in runQuery ... \n", " FATAL ERROR: query execution failed \n" ] } ], "source": [ "bqclient = bigquery.Client()\n", "df_results = regulome.runQuery ( bqclient, sql, [], [], PatientList, dryRun=False )\n", "df_results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }