{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Processing all datasets to be used in downstream analyses\n",
    "\n",
    "RNAseq, mutation, and copy number data were accessed from the UCSC Xena Data Browser. Clinical data was downloaded from the TCGA Snaptron curation effort. See `data_download.sh` for more details."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn import preprocessing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define Input and Output Filenames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Input Files\n",
    "rna_file = os.path.join('data', 'raw', 'HiSeqV2')\n",
    "mut_file = os.path.join('data', 'raw', 'PANCAN_mutation')\n",
    "copy_file = os.path.join('data', 'raw', 'Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes')\n",
    "clinical_file = os.path.join('data', 'raw', 'samples.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Output Files\n",
    "# Processing RNAseq data by z-score and zeroone norm\n",
    "rna_out_file = os.path.join('data', 'pancan_scaled_rnaseq.tsv.gz')\n",
    "rna_out_zeroone_file = os.path.join('data', 'pancan_scaled_zeroone_rnaseq.tsv.gz')\n",
    "\n",
    "# Mutation Data\n",
    "mut_out_file = os.path.join('data', 'pancan_mutation.tsv.gz')\n",
    "mut_burden_file = os.path.join('data', 'pancan_mutation_burden.tsv')\n",
    "\n",
    "# Two copy number matrices, for thresholded (2) gains and losses\n",
    "copy_gain_out_file = os.path.join('data', 'copy_number_gain.tsv.gz')\n",
    "copy_loss_out_file = os.path.join('data', 'copy_number_loss.tsv.gz')\n",
    "\n",
    "# Clinical data\n",
    "clinical_processed_out_file = os.path.join('data', 'clinical_data.tsv')\n",
    "\n",
    "# OncoKB output file\n",
    "oncokb_out_file = os.path.join('data', 'oncokb_genetypes.tsv')\n",
    "\n",
    "# Status matirix integrating mutation and copy number events\n",
    "known_status_file = os.path.join('data', 'status_matrix.tsv.gz')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/gway/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (32,129,130,131,132,133,134,137,138,139,140,141,142,144,147,217,256,283,284,285,286,287,288,289,290,294,295,296,299,301,304,305,306,307,308,309,310,311,315,317,320,321,322,323,325,326,328,329,332,335,336,341,344,346,349,351,357,359,361,362,365,367,368,369,370,374,376,377,378,379,380,381,382,384,385,386,387,388,392,393,395,396,397,408,409,418,419,420,421,422,423,424,425,426,427,428,431,433,434,435,436,444,447,448,449,451,453,454,455,456,457,458,459,460,461,464,466,468,469,470,471,472,473,481,483,484,485,488,491,493,496,497,499,502,503,504,505,509,510,511,512,513,514,515,516,517,518,520,521,523,524,525,526,527,528,529,530,531,532,533,535,536,537,538,539,540,542,543,545,546,549,550,551,552,553,554,555,558,559,560,561,562,564,565,566,567,568,569,570,571,572,573,574,575,577,579,580,581,582,583,584,585,586,588,589,592,593,594,595,596,598,599,601,602,603,604,606,607,626,628,629,630,631,632,633,634,635,636,637,638,639,640,641,642,643,644,645,646,658,660,662,669,670,671,672,673,675,676,678,679,687,688,689,690,692,693,694,695,696,698,699,700,701,702,703,706,709,710,711,726,727,728,729,730,731,732,733,734,735,736,744,746,747,748,749,750,753,754,755,756,757,759,761,762,766,767,769,770,771,772,809,810,811,812,813,815,816,817,818,819,820,822,824,825,826,827,828,850,851,852,853,855,856,857) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    }
   ],
   "source": [
    "rnaseq_df = pd.read_table(rna_file, index_col=0)\n",
    "mutation_df = pd.read_table(mut_file)\n",
    "copy_df = pd.read_table(copy_file, index_col=0)\n",
    "clinical_df = pd.read_table(clinical_file, index_col=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Begin processing different data types\n",
    "\n",
    "### RNAseq\n",
    "\n",
    "The RNAseq data was accessed through the UCSC Xena database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Process RNAseq file\n",
    "rnaseq_df.index = rnaseq_df.index.map(lambda x: x.split('|')[0])\n",
    "rnaseq_df.columns = rnaseq_df.columns.str.slice(start=0, stop=15)\n",
    "rnaseq_df = rnaseq_df.drop('?').fillna(0).sort_index(axis=1)\n",
    "\n",
    "# Gene is listed twice in RNAseq data, drop both occurrences\n",
    "rnaseq_df.drop('SLC35E2', axis=0, inplace=True)\n",
    "rnaseq_df = rnaseq_df.T\n",
    "\n",
    "# Determine most variably expressed genes and subset\n",
    "num_mad_genes = 5000\n",
    "mad_genes = rnaseq_df.mad(axis=0).sort_values(ascending=False)\n",
    "top_mad_genes = mad_genes.iloc[0:num_mad_genes, ].index\n",
    "rnaseq_subset_df = rnaseq_df.loc[:, top_mad_genes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Scale RNAseq data using z-scores\n",
    "rnaseq_scaled_df = preprocessing.StandardScaler().fit_transform(rnaseq_subset_df)\n",
    "rnaseq_scaled_df = pd.DataFrame(rnaseq_scaled_df,\n",
    "                                columns=rnaseq_subset_df.columns,\n",
    "                                index=rnaseq_subset_df.index)\n",
    "rnaseq_scaled_df.to_csv(rna_out_file, sep='\\t', compression='gzip')\n",
    "\n",
    "# Scale RNAseq data using zero-one normalization\n",
    "rnaseq_scaled_zeroone_df = preprocessing.MinMaxScaler().fit_transform(rnaseq_subset_df)\n",
    "rnaseq_scaled_zeroone_df = pd.DataFrame(rnaseq_scaled_zeroone_df,\n",
    "                                        columns=rnaseq_subset_df.columns,\n",
    "                                        index=rnaseq_subset_df.index)\n",
    "rnaseq_scaled_zeroone_df.to_csv(rna_out_zeroone_file, sep='\\t', compression='gzip')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Mutation\n",
    "\n",
    "Mutation data are stored in a long format. First, subset the data to only deleterious mutations listed below. Next, pivot the dataframe to have samples as the index, genes as the columns, and either a 1 or 0 to indicate a deleterious mutation or wild-type sample by gene."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Filter mutation types and generate binary matrix\n",
    "mutations = {\n",
    "    'Frame_Shift_Del',\n",
    "    'Frame_Shift_Ins',\n",
    "    'In_Frame_Del',\n",
    "    'In_Frame_Ins',\n",
    "    'Missense_Mutation',\n",
    "    'Nonsense_Mutation',\n",
    "    'Nonstop_Mutation',\n",
    "    'RNA',\n",
    "    'Splice_Site',\n",
    "    'Translation_Start_Site',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Process mutation in long format to dataframe format\n",
    "mut_pivot = (mutation_df.query(\"effect in @mutations\")\n",
    "                        .groupby(['#sample', 'chr',\n",
    "                                  'gene'])\n",
    "                        .apply(len).reset_index()\n",
    "                        .rename(columns={0: 'mutation'}))\n",
    "\n",
    "mut_pivot = (mut_pivot.pivot_table(index='#sample',\n",
    "                                   columns='gene', values='mutation',\n",
    "                                   fill_value=0)\n",
    "                      .astype(bool).astype(int))\n",
    "mut_pivot.index.name = 'SAMPLE_BARCODE'\n",
    "\n",
    "mut_pivot.to_csv(mut_out_file, sep='\\t', compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Process per sample mutation burden\n",
    "\n",
    "# Mutation burden quantifies how many mutations are observed in a single tumor.\n",
    "# It is a measure of how unstable a tumors' genome is, potentially reflecting\n",
    "# deficiences in DNA repair and/or specific etiology. It is a major source of\n",
    "# variation in expression data and is used by alternative downstream models or\n",
    "# to explore relationships in tybalt features.\n",
    "burden_df = mutation_df[mutation_df['effect'].isin(mutations)]\n",
    "burden_df = burden_df.groupby('#sample').apply(len)\n",
    "burden_df = np.log10(burden_df).reset_index()\n",
    "burden_df.columns = ['SAMPLE_BARCODE', 'log10_mut']\n",
    "burden_df.to_csv(mut_burden_file, index=False, sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Copy Number\n",
    "\n",
    "Copy number data contains thresholded GISTIC2.0 calls where 0 equals wild-type copy number, 1 and -1 mean slight gain and slight loss, respectively, and 2 and -2 mean high gain and high loss, respectively."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "copy_df = copy_df.astype(int)\n",
    "copy_df = copy_df.T\n",
    "copy_df.columns.name = 'gene'\n",
    "copy_df.index.name = 'Sample'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# For our purposes, a copy loss status event (1 vs. 0) is conservatively defined only as a deep loss.\n",
    "copy_loss_df = copy_df.replace(to_replace=[1, 2, -1], value=0)\n",
    "copy_loss_df.replace(to_replace=-2, value=1, inplace=True)\n",
    "copy_loss_df.to_csv(copy_loss_out_file, sep='\\t', compression='gzip')\n",
    "\n",
    "# A copy gain status event (1 vs. 0) is defined only as a high gain.\n",
    "copy_gain_df = copy_df.replace(to_replace=[-1, -2, 1], value=0)\n",
    "copy_gain_df.replace(to_replace=2, value=1, inplace=True)\n",
    "copy_gain_df.to_csv(copy_gain_out_file, sep='\\t', compression='gzip')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clinical Data\n",
    "\n",
    "The clinical data consists of hundreds of parameters collected for the TCGA samples. Some columns are redundant, while others contain high amounts of missing data. Select and rename only a couple columns of interest."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "clinical_columns_dict = {\n",
    "    'gdc_platform': 'platform',\n",
    "    'gdc_center.short_name': 'analysis_center',\n",
    "    'gdc_cases.submitter_id': 'sample_id',\n",
    "    'gdc_cases.demographic.gender': 'gender',\n",
    "    'gdc_cases.demographic.race': 'race',\n",
    "    'gdc_cases.demographic.ethnicity': 'ethnicity',\n",
    "    'gdc_cases.project.primary_site': 'organ',\n",
    "    'gdc_cases.project.project_id': 'acronym',\n",
    "    'gdc_cases.tissue_source_site.project': 'disease',\n",
    "    'gdc_cases.diagnoses.vital_status': 'vital_status',\n",
    "    'gdc_cases.samples.sample_type': 'sample_type',\n",
    "    'cgc_case_age_at_diagnosis': 'age_at_diagnosis',\n",
    "    'cgc_portion_id': 'portion_id',\n",
    "    'cgc_slide_percent_tumor_nuclei': 'percent_tumor_nuclei',\n",
    "    'cgc_drug_therapy_drug_name': 'drug',\n",
    "    'xml_year_of_initial_pathologic_diagnosis': 'year_of_diagnosis',\n",
    "    'xml_stage_event_pathologic_stage': 'stage' \n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "clinical_sub_df = clinical_df.filter(items=clinical_columns_dict.keys())\n",
    "clinical_sub_df = clinical_sub_df.rename(columns=clinical_columns_dict)\n",
    "clinical_sub_df.index = clinical_sub_df['sample_id']\n",
    "clinical_sub_df.drop('sample_id', axis=1, inplace=True)\n",
    "clinical_sub_df['acronym'] = clinical_sub_df['acronym'].str[5:]\n",
    "clinical_sub_df.to_csv(clinical_processed_out_file, sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### OncoKB Gene-Types\n",
    "\n",
    "Here, we use the [OncoKB API](http://oncokb.org/#/dataAccess) of [Chakravarty et al. 2017](http://ascopubs.org/doi/abs/10.1200/JCO.2016.34.15_suppl.11583) to download all oncogenes and tumor suppressor genes. These help to subset the copy number gain and copy loss data frames to identify a full status matrix."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "response = requests.get('http://oncokb.org/api/v1/genes')\n",
    "oncokb_df = pd.read_json(response.content)\n",
    "oncokb_df.to_csv(oncokb_out_file, sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Full Status Matrix\n",
    "\n",
    "**We create the full status matrix with:** \n",
    "\n",
    "- Deleterious mutations = 1\n",
    "- High copy gains of oncogenes = 1\n",
    "- Deep copy losses of tumor suppressor genes = 1\n",
    "- All other gene by sample relationships = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Integrate copy number, oncokb gene-type, and mutation status to define status matrix\n",
    "oncogenes_df = oncokb_df[oncokb_df['oncogene']]\n",
    "tsg_df = oncokb_df[oncokb_df['tsg']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Subset copy gains by oncogenes and copy losses by tumor suppressors (tsg)\n",
    "status_gain = copy_gain_df.loc[:, oncogenes_df['hugoSymbol']]\n",
    "status_loss = copy_loss_df.loc[:, tsg_df['hugoSymbol']]\n",
    "copy_status = pd.concat([status_gain, status_loss], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# NOTCH1 and NOTCH2 are both tumor suppressors and oncogenes depending on tissue context\n",
    "# drop them from copy number consideration for now\n",
    "copy_status = copy_status.drop(['NOTCH1', 'NOTCH2'], axis=1)\n",
    "\n",
    "# Subset each dataframe\n",
    "status_samples = set(mut_pivot.index) & set(copy_status.index)\n",
    "\n",
    "mutation_status = mut_pivot.loc[status_samples, :]\n",
    "copy_status = copy_status.loc[status_samples, :]\n",
    "copy_status = copy_status.loc[:, mutation_status.columns].fillna(0).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Combine and write to file\n",
    "full_status = mutation_status.add(copy_status, fill_value=0)\n",
    "full_status.replace(to_replace=2, value=1, inplace=True)\n",
    "full_status.to_csv(known_status_file, sep='\\t', compression='gzip')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}