{ "metadata": { "name": "", "signature": "sha256:f9f06dbac2722e04de7d997a310f327d7c1fea117d14f9b7fa953c23ddb62341" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "

HNSCC Cohort Process Data for Calling HPV Status

" ] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Setup" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Initialization" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%pylab inline" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "cd ../src" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/cellar/users/agross/TCGA_Code/TCGA/src\n" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import pickle as pickle" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Read in Pre-processed Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we read in the pre-processed data that we downloaded and initialized in the [download_data notebook](download_data.ipynb)." ] }, { "cell_type": "code", "collapsed": false, "input": [ "def get_run(firehose_dir, version='Latest'):\n", " '''\n", " Helper to get a run from the file-system. 
\n", " '''\n", " path = '{}/ucsd_analyses'.format(firehose_dir)\n", " if version is 'Latest':\n", " version = sorted(os.listdir(path))[-1]\n", " run = pickle.load(open('{}/{}/RunObject.p'.format(path, version), 'rb'))\n", " return run" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "params = pd.read_table('../global_params.txt', header=None, squeeze=True, \n", " index_col=0)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "run_path = '{}/Firehose__{}/'.format(params.ix['OUT_PATH'], params.ix['RUN_DATE'])\n", "run = get_run(run_path, 'Run_' + params.ix['VERSION'])\n", "cancer = run.load_cancer(params.ix['CANCER'])\n", "clinical = cancer.load_clinical()\n", "\n", "\n", "mut = cancer.load_data('Mutation')\n", "mut.uncompress()\n", "cn = cancer.load_data('CN_broad')\n", "cn.uncompress()\n", "\n", "rna = cancer.load_data('mRNASeq')\n", "mirna = cancer.load_data('miRNASeq')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "surv = clinical.survival.survival_5y" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Clinical / Sequencing HPV Status" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####This file contains the HPV calls from analysis of the sequencing reads \n", "* This was obtained from Neil Hayes in March \n", "* This should probably be made public with the TCGA marker paper." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "hpv_all = pd.read_csv('../Extra_Data/hpv_summary_3_20_13_distribute.csv', index_col=0)\n", "hpv = hpv_all.Molecular_HPV.map({0:'HPV-', 1:'HPV+'})\n", "hpv.name = 'HPV'\n", "hpv_seq = hpv" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "hpv_seq.value_counts()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ "HPV- 244\n", "HPV+ 35\n", "dtype: int64" ] } ], "prompt_number": 9 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Here I process the clinical HPV marker tests \n", "* I only use this data if a patient does not have sequencing HPV status available\n", "* If both the p16 and the ish testing are positive I call the patient HPV+" ] }, { "cell_type": "code", "collapsed": false, "input": [ "status = clinical.clinical[['hpvstatusbyishtesting','hpvstatusbyp16testing']]\n", "hpv_clin = (status.dropna() == 'positive').sum(1)\n", "hpv_clin = hpv_clin.map({2: 'HPV+', 0:'HPV-', 1:nan}).dropna()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "hpv_clin.value_counts()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ "HPV- 50\n", "HPV+ 8\n", "dtype: int64" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "hpv_clin.ix[hpv_clin.index.diff(hpv_seq.index)].value_counts()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 12, "text": [ "HPV- 9\n", "HPV+ 4\n", "dtype: int64" ] } ], "prompt_number": 12 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "New HPV Calls from TCGA Data Portal" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* These are calls 
obtained from [this](https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/tumor/hnsc/bcr/nationwidechildrens.org/bio/clin/nationwidechildrens.org_HNSC.bio.Level_2.0.6.0/) directory on the TCGA Data Portal \n", "* This is from a diagnostic run on the tissue after it had reached TCGA" ] }, { "cell_type": "code", "collapsed": false, "input": [ "hpv_new = pd.read_table('../Extra_Data/nationwidechildrens.org_auxiliary_hnsc.txt',\n", " skiprows=[1], index_col=0, na_values=['[Not Available]'])\n", "hpv_new = hpv_new['hpv_status']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "hpv_combo = (hpv_seq.dropna() == 'HPV+').combine_first(hpv_new == 'Positive')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Re-calculation of meta-features in HPV- background" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* These contain meta-features that are sensitive to the background patient set (i.e. 
side effects)\n", "* The mutation and copy number data-sets don't need to be regenerated \n", "* These are saved in a separate directory" ] }, { "cell_type": "code", "collapsed": false, "input": [ "clinical.hpv = hpv_combo\n", "clinical.save() #I keep the same object as there are no side effects" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [
 "# hpv_combo is True for HPV+ calls, so ==0 selects the HPV- patients;\n",
 "# each intersection then drops patients missing from that data layer.\n",
 "keepers_o = hpv_combo[hpv_combo==0].index\n",
 "keepers_o = keepers_o.intersection(mut.features.columns)\n",
 "keepers_o = keepers_o.intersection(cn.features.columns)\n",
 "keepers_o = keepers_o.intersection(surv.unstack().index)\n",
 "keepers_o = keepers_o.intersection(rna.features.columns)\n",
 "keepers_o = keepers_o.intersection(mirna.features.columns)"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "len(keepers_o)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 19, "text": [ "258" ] } ], "prompt_number": 19 }, { "cell_type": "code", "collapsed": true, "input": [
 "from Initialization.InitializeReal import RealDataset\n",
 "from Processing.Helpers import make_path_dump\n",
 "# Rebuild the expression data-sets with keepers_o as the patient set\n",
 "# (presumably RealDataset restricts to these patients -- TODO confirm).\n",
 "rna = RealDataset(run, cancer, 'mRNASeq', keepers_o)\n",
 "mirna = RealDataset(run, cancer, 'miRNASeq', keepers_o, create_meta_features=False)\n",
 "\n",
 "# Dump under a distinct file name inside each data-set's store folder.\n",
 "make_path_dump(rna, rna.path + '/store/no_hpv2.p')\n",
 "make_path_dump(mirna, mirna.path + '/store/no_hpv2.p')"
 ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }