{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Authentication\n",
    "The first step is to authorize access to BigQuery and the Google Cloud. For more information see ['Quick Start Guide to ISB-CGC'](https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/HowToGetStartedonISB-CGC.html) and alternative authentication methods can be found [here](https://googleapis.github.io/google-cloud-python/latest/core/auth.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%matplotlib inline\n",
    "from google.cloud import bigquery\n",
    "import seaborn as sns\n",
    "from scipy import stats\n",
    "import re_module.bq_functions as regulome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "## PARAMETERS\n",
    "\n",
    "# information from Bigquery tables\n",
    "Features = {     'Gene Expression' : { 'table'  : 'isb-cgc.TCGA_hg38_data_v0.RNAseq_Gene_Expression',\n",
    "                                       'symbol' : 'gene_name',\n",
    "                                       'study'  : 'project_short_name',\n",
    "                                       'data'   : 'AVG( LOG10( HTSeq__Counts + 1 ) ) ',\n",
    "                                       'rnkdata': 'data',\n",
    "                                       'avgdat' : 'avgdata',  \n",
    "                                   'patientcode': 'case_barcode',\n",
    "                                    'samplecode': 'sample_barcode',\n",
    "                                       'where'  : 'AND HTSeq__Counts IS NOT NULL',\n",
    "                                       'dattype': 'numeric' },\n",
    "             'MicroRNA Expression': {  'table'  : 'isb-cgc.TCGA_hg38_data_v0.miRNAseq_Expression',\n",
    "                                       'symbol' : 'mirna_id',\n",
    "                                        'study' : 'project_short_name',\n",
    "                                       'data'   : 'AVG( reads_per_million_miRNA_mapped )',\n",
    "                                       'rnkdata': 'data',\n",
    "                                       'avgdat' : 'avgdata',\n",
    "                                   'patientcode': 'case_barcode',\n",
    "                                    'samplecode': 'sample_barcode',\n",
    "                                       'where'  : 'AND reads_per_million_miRNA_mapped IS NOT NULL',\n",
    "                                       'dattype': 'numeric'}\n",
    "                \n",
    "               }\n",
    "\n",
    "# parameteres for Gene expression data\n",
    "feat1 = Features['Gene Expression']\n",
    "cohort1 = feat1['patientcode'] + \" IN ( SELECT case_barcode FROM cohort ) \"  #\" IN UNNEST(@PATIENTLIST) \"\n",
    "labels1 = ' IS NOT NULL '     #' = \\'TP53\\' '\n",
    "\n",
    "# parameteres for Gene expression data\n",
    "feat2 = Features['MicroRNA Expression'] \n",
    "cohort2 = feat2['patientcode'] + \" IN ( SELECT case_barcode FROM cohort ) \" # \" IN UNNEST(@PATIENTLIST) \"\n",
    "labels2 = ' IS NOT NULL '\n",
    "\n",
    "# minimum correlation\n",
    "min_corr = 0.5\n",
    "\n",
    "# minimun number of samples\n",
    "nsamples = 25"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read cohort"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "cohort = \"\"\"\n",
    "cohort AS(\n",
    "SELECT case_barcode FROM `isb-cgc.TCGA_bioclin_v0.Clinical`\n",
    "WHERE project_short_name = \"TCGA-BRCA\" AND age_at_diagnosis <=50 AND\n",
    "    pathologic_stage IN ('Stage II', 'Stage IIA', 'Stage IIB') AND\n",
    "    race IN ('WHITE', 'BLACK OR AFRICAN AMERICAN') AND ethnicity = \"NOT HISPANIC OR LATINO\"\n",
    ")\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Queries to read tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "query_t1 = regulome.generic_numeric_bqtable ( 'table1' , feat1, cohort1, labels1 )\n",
    "query_t2 = regulome.generic_numeric_bqtable ( 'table2' , feat2, cohort2, labels2 ) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Queries to join tables and perform statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WITH\n",
      "cohort AS(\n",
      "SELECT case_barcode FROM `isb-cgc.TCGA_bioclin_v0.Clinical`\n",
      "WHERE project_short_name = \"TCGA-BRCA\" AND age_at_diagnosis <=50 AND\n",
      "    pathologic_stage IN ('Stage II', 'Stage IIA', 'Stage IIB') AND\n",
      "    race IN ('WHITE', 'BLACK OR AFRICAN AMERICAN') AND ethnicity = \"NOT HISPANIC OR LATINO\"\n",
      ")\n",
      ",\n",
      "table1 AS (\n",
      "SELECT\n",
      "   symbol,\n",
      "   data AS rnkdata,\n",
      "   ParticipantBarcode\n",
      "FROM (\n",
      "   SELECT\n",
      "      gene_name AS symbol, \n",
      "      AVG( LOG10( HTSeq__Counts + 1 ) )  AS data,\n",
      "      case_barcode AS ParticipantBarcode\n",
      "   FROM `isb-cgc.TCGA_hg38_data_v0.RNAseq_Gene_Expression`\n",
      "   WHERE case_barcode IN ( SELECT case_barcode FROM cohort )     # cohort \n",
      "         AND gene_name  IS NOT NULL   # labels \n",
      "         AND HTSeq__Counts IS NOT NULL  \n",
      "   GROUP BY\n",
      "      ParticipantBarcode, symbol\n",
      "   )\n",
      ")\n",
      ",\n",
      "table2 AS (\n",
      "SELECT\n",
      "   symbol,\n",
      "   data AS rnkdata,\n",
      "   ParticipantBarcode\n",
      "FROM (\n",
      "   SELECT\n",
      "      mirna_id AS symbol, \n",
      "      AVG( reads_per_million_miRNA_mapped ) AS data,\n",
      "      case_barcode AS ParticipantBarcode\n",
      "   FROM `isb-cgc.TCGA_hg38_data_v0.miRNAseq_Expression`\n",
      "   WHERE case_barcode IN ( SELECT case_barcode FROM cohort )     # cohort \n",
      "         AND mirna_id  IS NOT NULL   # labels \n",
      "         AND reads_per_million_miRNA_mapped IS NOT NULL  \n",
      "   GROUP BY\n",
      "      ParticipantBarcode, symbol\n",
      "   )\n",
      ")\n",
      ",\n",
      "summ_table AS (\n",
      "SELECT \n",
      "   n1.symbol as symbol1,\n",
      "   n2.symbol as symbol2,\n",
      "   COUNT( n1.ParticipantBarcode ) as n,\n",
      "   ABS(CORR(n1.rnkdata , n2.rnkdata)) as correlation\n",
      "    \n",
      "FROM\n",
      "   table1 AS n1\n",
      "INNER JOIN\n",
      "   table2 AS n2\n",
      "ON\n",
      "   n1.ParticipantBarcode = n2.ParticipantBarcode\n",
      "   \n",
      "GROUP BY\n",
      "   symbol1, symbol2\n",
      ")\n",
      "SELECT symbol1, symbol2, n, correlation AS abs_correlation\n",
      "FROM summ_table\n",
      "WHERE \n",
      "    n > 25 AND correlation > 0.5\n",
      "LIMIT 100\n",
      "\n"
     ]
    }
   ],
   "source": [
    "summ_query = regulome.get_summarized_table('Gene Expression',feat1,'MicroRNA Expression',feat2)\n",
    "\n",
    "sql = ( 'WITH' + cohort + ',' + query_t1 + ',' + query_t2 + ',' + summ_query + \"\"\"\n",
    "SELECT symbol1, symbol2, n, correlation AS abs_correlation\n",
    "FROM summ_table\n",
    "WHERE \n",
    "    n > {0} AND correlation > {1}\n",
    "LIMIT 100\n",
    "\"\"\".format( str(nsamples) , str(min_corr) ) )\n",
    "\n",
    "print( sql )  # THIS Query perform the computations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run the Query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " in runQuery ... \n",
      "  FATAL ERROR: query execution failed \n"
     ]
    }
   ],
   "source": [
    "bqclient = bigquery.Client()\n",
    "df_results = regulome.runQuery ( bqclient, sql, [], [], PatientList, dryRun=False )\n",
    "df_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}