{
 "metadata": {
  "name": "",
  "signature": "sha256:b034f860af5ae9bbc08674502ecc162fca97133f23957278163f7af1c165aced"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# HNSCC HPV- Cohort Clinical Variable Exploration"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Overview of clinical variables in the TCGA HNSCC cohort and their implications towards patient prognosis. Here we are mainly just processing data for compilation of Supplemental Table 1. "
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### Import Data and Packages\n",
      "For the full list of data and packages imported see the [Imports](../Analysis_Notebooks/Imports.ipynb) notebook."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import NotebookImport\n",
      "from Imports import *"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "importing IPython notebook from Imports.ipynb\n",
        "Populating the interactive namespace from numpy and matplotlib"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "changing to source dirctory"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "line_width has been deprecated, use display.width instead (currently both are\n",
        "identical)\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "populating namespace with data\n"
       ]
      }
     ],
     "prompt_number": 1
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Supplemental Table 1: Clinical characteristics of HPV- cohort"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Bin `age` (provided by the Imports notebook) into three labelled groups with cut points at 75 and 85.\n",
      "old = pd.Series(1.*(age>=75) + 1.*(age>=85), name='age')\n",
      "old = old.map({0:'younger than 75', 1:'between 75 and 85', 2:'older than 85'})"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Processed clinical covariates; every column whose name contains 'hpv' is dropped.\n",
      "ff = clinical.processed.select(lambda s: ('hpv' not in s), axis=1)\n",
      "ff = ff.join(clinical.clinical.race).join(clinical.clinical.gender)\n",
      "ff['year of dx.'] = ff['year']\n",
      "ff['age'] = old\n",
      "# Collapse smoking history to yes / no / reformed-or-missing.\n",
      "ff['smoker'] = ff['smoker'].map({'current smoker':'yes', 'lifelong non-smoker':'no',\n",
      "                                 'current reformed smoker for < or = 15 years':'reformed / missing',\n",
      "                                 'current reformed smoker for > 15 years':'reformed / missing',\n",
      "                                 np.nan:'reformed / missing'})\n",
      "# Keep the upper-cased second word of the stage (a value such as 'stage iva' would give 'IVA');\n",
      "# non-strings and single-word values become 'NX'. The length check avoids an IndexError on\n",
      "# values that contain a space but only one word.\n",
      "ff['stage'] = ff['stage'].map(lambda s: s.split()[1].upper() if (isinstance(s, str) and len(s.split()) > 1) else 'NX')\n",
      "# startswith() rather than s[0] so that an empty string maps to 'NX' instead of raising.\n",
      "ff['lymph_stage'] = ff['lymph_stage'].map(lambda s: s.upper() if (isinstance(s, str) and s.startswith('n')) else 'NX')\n",
      "ff = ff.replace('[Unknown]', np.nan)\n",
      "# Restrict to the keepers_o index and label every remaining gap as 'missing'.\n",
      "ff = ff.ix[keepers_o].fillna('missing')\n",
      "ff['tumor_subdivision'] = ff['tumor_subdivision'].replace('missing','missing / other')\n",
      "ff['drinker'] = ff['drinker'].map({True: 'yes', False: 'no', 'missing': 'missing / moderate'})\n",
      "\n",
      "# Run get_surv_fit_lr once per covariate that has between 2 and 9 distinct values.\n",
      "clin_uni = pd.concat({g: get_surv_fit_lr(surv, f)\n",
      "                      for g,f in ff.iteritems() \n",
      "                      if len(f.dropna().unique()) in range(2,10)})"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Pull the three columns reported in the table out of the per-covariate results.\n",
      "pat = clin_uni[('Stats','# Patients')]\n",
      "med_surv = clin_uni[('Median Survival','Median')]\n",
      "surv_p = clin_uni[('Log-Rank','p')]\n",
      "tab = pd.concat([pat, med_surv, surv_p], keys=['# Patients', 'Median Surv.', 'Log-rank P'], axis=1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Display order of the covariates in the table.\n",
      "o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision','stage','lymph_stage','invasion',\n",
      "     'spread', 'drinker', 'smoker']\n",
      "tab = tab.ix[o]\n",
      "# From here on `o` maps each covariate name to its rank; the next cell sorts on it.\n",
      "o = {v:i for i,v in enumerate(o)}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Prefix the index with the covariate rank, sort by rank and then by descending patient count,\n",
      "# and drop the helper rank column again.\n",
      "tab.index = pd.MultiIndex.from_tuples([(o[i[0]], i[0], i[1]) for i in tab.index if i[0] in o])\n",
      "tab = tab.reset_index(level=0).sort(['level_0','# Patients'], ascending=[True,False])\n",
      "del tab['level_0']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "UPMC Cohort"
     ]
    },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Read in UPMC data and scrub clinical data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "meta = pd.read_csv('../Extra_Data/UPMC_cohort/meta.csv', index_col=0)\n", "meta.index = pd.MultiIndex.from_tuples([('-'.join(i.split('-')[:-1]), i.split('-')[-1]) \n", " for i in meta.index])\n", "clin = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data.csv', index_col=0)\n", "clin.Tumor_type = clin.Tumor_type.map(str.strip)\n", "surv = pd.concat([clin.os_5yr, clin.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack()\n", "clin2 = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data_update.csv', index_col=0)\n", "surv2 = pd.concat([clin2.os_5yr, clin2.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack()\n", "surv = surv2.combine_first(surv)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "clin.HPV.value_counts()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ "Negative 63\n", "Positive 11\n", "dtype: int64" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "keepers = meta['SNP array ID'].notnull() & (meta['Final_74_exome_analysis'] == 1)\n", "keepers = keepers.groupby(level=0).sum() " ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "keepers = meta['SNP array ID'].notnull() & (meta['Final_74_exome_analysis'] == 1)\n", "keepers = keepers.groupby(level=0).last() \n", "keepers = (keepers > 0) & (clin.HPV == 'Negative')\n", "keepers = keepers.ix[surv.index.get_level_values(0)].dropna()\n", "keepers = keepers.groupby(level=0).last()\n", "keepers = true_index(keepers)\n", "len(keepers)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ "48" ] } ], 
"prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision','stage','lymph_stage','invasion',\n", " 'spread', 'drinker', 'smoker']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "clin = clin.ix[keepers]\n", "clin['gender'] = clin['Gender']\n", "clin['race'] = clin['Race'].map(str.lower).replace('black','black or african american')\n", "clin['year of dx.'] = clin.Date_Dx.map(lambda s: 'post_2000' if s[-2] in ['0','1'] else 'pre_2000')\n", "clin['age'] = clin.Age_Dx.map(lambda s: 'between 75 and 85' if s >= 75 else 'younger than 75')\n", "def subd(s):\n", " if s == 'Oral cavity':\n", " return 'oral cavity'\n", " elif s == 'Larynx':\n", " return 'larynx'\n", " elif s == 'Oropharynx':\n", " return 'oropharynx'\n", " else:\n", " return 'missing / other'\n", "clin['tumor_subdivision'] = clin.Primary_site.map(subd)\n", "clin['stage'] = clin.clinical_stage.map(lambda s: s[:2] if s != 'Unstaged' else 'NX')\n", "clin['lymph_stage'] = clin.Nodal_stage.map(lambda s: s[:2] if s != '.' 
else 'NX')\n", "clin['invasion'] = clin.PNI.replace('unknown','missing')\n", "clin['spread'] = clin.EPS.replace('unknown','missing')\n", "def alch(v):\n", " s = v['Alcohol_amt']\n", " c = v['Alcohol']\n", " if c == 'no':\n", " return 'no'\n", " if pd.isnull(s):\n", " return 'missing / moderate'\n", " elif int(s) <= 6:\n", " return 'no'\n", " elif int(s) > 13:\n", " return 'yes'\n", " else:\n", " return 'missing / moderate'\n", "clin['drinker'] = clin.apply(alch,1)\n", "clin['smoker'] = clin.Smoking_history" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "clin_uni2 = pd.concat({v: get_surv_fit_lr(surv, clin[v])\n", " for v in o if v in clin and len(clin[v].unique()) > 1})\n", "pat = pd.concat({v: clin[v].value_counts() for v in o if v in clin})\n", "#pat = clin_uni2[('Stats','# Patients')]\n", "med_surv = clin_uni2[('Median Survival','Median')]\n", "surv_p = clin_uni2[('Log-Rank','p')]\n", "tab2 = pd.concat([pat, med_surv, surv_p], keys=['# Patients', 'Median Surv.', 'Log-rank P'], axis=1)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "tab_combo = pd.concat([tab, tab2.ix[tab.index]], keys=['TCGA','UPMC'], axis=1)\n", "tab_combo[('UPMC', '# Patients')] = tab_combo[('UPMC', '# Patients')].fillna('')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "o = [('gender', ''),\n", " ('gender', 'male'),\n", " ('gender', 'female'),\n", " ('race', ''),\n", " ('race', 'white'),\n", " ('race', 'black or african american'),\n", " ('race', 'missing'),\n", " ('race', 'asian'),\n", " ('race', 'american indian or alaska native'),\n", " ('year of dx.', ''),\n", " ('year of dx.', 'post_2000'),\n", " ('year of dx.', 'pre_2000'),\n", " ('age', ''),\n", " ('age', 'younger than 75'),\n", " ('age', 'between 75 and 85'),\n", " 
('tumor_subdivision', ''),\n", " ('tumor_subdivision', 'oral cavity'),\n", " ('tumor_subdivision', 'larynx'),\n", " ('tumor_subdivision', 'oropharynx'),\n", " ('tumor_subdivision', 'missing / other'),\n", " ('stage', ''),\n", " ('stage', 'IV'),\n", " ('stage', 'III'),\n", " ('stage', 'II'),\n", " ('stage', 'I'),\n", " ('stage', 'NX'),\n", " ('lymph_stage', ''),\n", " ('lymph_stage', 'N3'),\n", " ('lymph_stage', 'N2'),\n", " ('lymph_stage', 'N1'),\n", " ('lymph_stage', 'N0'),\n", " ('lymph_stage', 'NX'), \n", " ('invasion', ''),\n", " ('invasion', 'yes'),\n", " ('invasion', 'no'),\n", " ('invasion', 'missing'),\n", " ('spread', ''),\n", " ('spread', 'yes'),\n", " ('spread', 'no'),\n", " ('spread', 'missing'),\n", " ('drinker', ''),\n", " ('drinker', 'yes'),\n", " ('drinker', 'no'),\n", " ('drinker', 'missing / moderate'),\n", " ('smoker', ''),\n", " ('smoker', 'yes'),\n", " ('smoker', 'no'),\n", " ('smoker', 'reformed / missing'),]\n", "tab_combo.ix[o]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TCGAUPMC
# PatientsMedian Surv.Log-rank P# PatientsMedian Surv.Log-rank P
gender 0.696 0.472
male 176 4 37 1.85
female 74 2.96 11 NaN
race 0.418 0.182
white 213 4 47 2.81
black or african american 25 1.48 1 1
missing 6 1.6 NaN NaN
asian 5 NaN NaN NaN
american indian or alaska native 1 NaN NaN NaN
year of dx. 5.46e-06 NaN NaN
post_2000 203 NaN 48 NaN NaN
pre_2000 47 1.53 NaN NaN
age 0.0106 0.0929
younger than 75 220 4 45 1.85
between 75 and 85 30 1.81 3 NaN
tumor_subdivision 0.116 0.42
oral cavity 161 4.36 25 2.11
larynx 75 2.25 11 NaN
oropharynx 13 1.43 4 0.844
missing / other 1 0.986 8 3.7
stage 0.221 0.167
IV 135 2.3 35 1.5
III 36 4.49 NaN NaN
II 37 NaN 12 NaN
I 14 NaN NaN NaN
NX 28 2.25 1 NaN
lymph_stage 0.00083 0.000435
N3 5 0.962 3 0.894
N2 85 1.9 23 1.3
N1 29 NaN 7 3.7
N0 84 NaN 14 NaN
NX 47 1.83 1 NaN
invasion 0.00342 0.0188
yes 95 2.5 22 1.36
no 89 NaN 20 NaN
missing 66 1.9 6 3.25
spread 0.00146 0.0448
yes 51 1.42 16 1.37
no 118 NaN 14 2.11
missing 81 3.53 18 NaN
drinker 0.0875 0.0252
yes 45 4 8 1.04
no 35 NaN 27 NaN
missing / moderate 170 2.5 13 1.37
smoker 0.0231 0.285
yes 84 1.6 41 1.85
no 44 4.71 7 5.01
reformed / missing 122 4 NaN NaN
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 16, "text": [ " TCGA UPMC \n", " # Patients Median Surv. Log-rank P # Patients Median Surv. Log-rank P\n", "gender 0.696 0.472\n", " male 176 4 37 1.85 \n", " female 74 2.96 11 NaN \n", "race 0.418 0.182\n", " white 213 4 47 2.81 \n", " black or african american 25 1.48 1 1 \n", " missing 6 1.6 NaN NaN\n", " asian 5 NaN NaN NaN\n", " american indian or alaska native 1 NaN NaN NaN\n", "year of dx. 5.46e-06 NaN NaN\n", " post_2000 203 NaN 48 NaN NaN\n", " pre_2000 47 1.53 NaN NaN\n", "age 0.0106 0.0929\n", " younger than 75 220 4 45 1.85 \n", " between 75 and 85 30 1.81 3 NaN \n", "tumor_subdivision 0.116 0.42\n", " oral cavity 161 4.36 25 2.11 \n", " larynx 75 2.25 11 NaN \n", " oropharynx 13 1.43 4 0.844 \n", " missing / other 1 0.986 8 3.7 \n", "stage 0.221 0.167\n", " IV 135 2.3 35 1.5 \n", " III 36 4.49 NaN NaN\n", " II 37 NaN 12 NaN \n", " I 14 NaN NaN NaN\n", " NX 28 2.25 1 NaN \n", "lymph_stage 0.00083 0.000435\n", " N3 5 0.962 3 0.894 \n", " N2 85 1.9 23 1.3 \n", " N1 29 NaN 7 3.7 \n", " N0 84 NaN 14 NaN \n", " NX 47 1.83 1 NaN \n", "invasion 0.00342 0.0188\n", " yes 95 2.5 22 1.36 \n", " no 89 NaN 20 NaN \n", " missing 66 1.9 6 3.25 \n", "spread 0.00146 0.0448\n", " yes 51 1.42 16 1.37 \n", " no 118 NaN 14 2.11 \n", " missing 81 3.53 18 NaN \n", "drinker 0.0875 0.0252\n", " yes 45 4 8 1.04 \n", " no 35 NaN 27 NaN \n", " missing / moderate 170 2.5 13 1.37 \n", "smoker 0.0231 0.285\n", " yes 84 1.6 41 1.85 \n", " no 44 4.71 7 5.01 \n", " reformed / missing 122 4 NaN NaN" ] } ], "prompt_number": 16 } ], "metadata": {} } ] }