{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# IDH1 C>T: RNA-seq vs. WXS allelic read depth in TCGA\n",
  "\n",
  "Pairs the RNA-seq and WXS runs of each TCGA sample barcode and compares per-base read depth in a small window around chr2:208248388."
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "%matplotlib inline\n",
  "import pandas as pd\n",
  "import numpy as np\n",
  "import re\n",
  "import os\n",
  "import math\n",
  "from multiprocessing import Pool\n",
  "from tqdm import tqdm\n",
  "from scipy import stats\n",
  "import matplotlib.pyplot as plt\n",
  "import seaborn as sns\n",
  "from sklearn import metrics"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "## init\n",
  "mySpecie='Homo_sapiens'\n",
  "#prealigned_dir='/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.prealigned.pickle'\n",
  "targetted_align_dir='/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.pickle'\n",
  "manifest_dir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/./tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle'\n",
  "uuid_barcode_map_dir='/cellar/users/andreabc/GDC_barcodes/uuid_barcode_map.txt'\n",
  "\n",
  "# locus of interest and the half-width of the position window kept around it\n",
  "query_chromosome,query_coordinate='2',208248388\n",
  "window_size=10"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Load per-base read counts"
 ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
  "targetted_df=pd.read_pickle(targetted_align_dir).loc[\"TCGA\"]"
 ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
  "all_UUIDs=targetted_df.index.get_level_values('Run_digits').unique()"
 ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "n UUID: 1570\n" ] } ], "source": [
  "#883, 1427\n",
  "print ('n UUID:',len(all_UUIDs))"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Map file UUIDs to TCGA sample barcodes\n",
  "\n",
  "Uses Andrea's `uuid_barcode_map.txt` (indexed by `file_id`) to attach a `sample_barcode` to every manifest row."
 ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
  "manifest_df=pd.read_pickle(manifest_dir)"
 ] },
 { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [], "source": [
  "manifest_df['processed']=manifest_df.file_id.isin(all_UUIDs)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "uuid_barcode_mapDf=pd.read_csv(uuid_barcode_map_dir,sep='\\t').set_index('file_id')"
 ] },
 { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
  "manifest_df['sample_barcode']=uuid_barcode_mapDf.loc[manifest_df.file_id]['sample_barcode'].values\n"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Keep samples that have both an RNA-Seq and a WXS run"
 ] },
 { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
  "m_data_category=manifest_df.data_category=='Raw Sequencing Data'\n",
  "m_experimental_strategy=manifest_df['experimental_strategy'].isin(['RNA-Seq','WXS'])"
 ] },
 { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [
  "manifest_df_sub=manifest_df[manifest_df['processed']&m_data_category&m_experimental_strategy]"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# a barcode qualifies only if it has exactly two processed runs AND they are one\n",
  "# RNA-Seq plus one WXS (a plain count of 2 would also accept two runs of one assay)\n",
  "strategy_g=manifest_df_sub.groupby('sample_barcode')['experimental_strategy']\n",
  "is_pair=(strategy_g.size()==2)&(strategy_g.nunique()==2)\n",
  "with_both=is_pair.index[is_pair]"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "len(with_both)"
 ] },
 { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [
  "manifest_df_w_RNA_WXS=manifest_df_sub[manifest_df_sub.sample_barcode.isin(with_both)]\n"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Select the window around the query locus"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# compare Chr as text so the mask works whether the level holds '2' or 2\n",
  "m_chr=targetted_df.index.get_level_values('Chr').astype(str)==query_chromosome\n",
  "Pos_array=targetted_df.index.get_level_values('Pos')\n",
  "# open interval: positions strictly within window_size of the query coordinate\n",
  "m_pos=(Pos_array>(query_coordinate-window_size))&(Pos_array<(query_coordinate+window_size))\n",
  "\n",
  "# Pos alone is ambiguous across chromosomes, so both masks are required\n",
  "targetted_df_sub=targetted_df[m_chr&m_pos]"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# file UUID -> sample barcode for the paired runs\n",
  "barcode_by_uuid=manifest_df_w_RNA_WXS.set_index('file_id')['sample_barcode']\n",
  "barcode_by_uuid=barcode_by_uuid[~barcode_by_uuid.index.duplicated(keep='first')]\n",
  "\n",
  "index_metaDf=targetted_df_sub.index.to_frame()\n",
  "# reindex (not .loc) so runs outside the paired set get NaN instead of raising KeyError\n",
  "index_metaDf['sample_barcode']=barcode_by_uuid.reindex(index_metaDf['Run_digits'].values).values\n",
  "multI=index_metaDf.set_index(['sample_barcode','Run_digits','Chr','Pos','base']).index\n",
  "\n",
  "# re-index a copy so this cell can be re-run without seeing an already re-indexed frame\n",
  "targetted_df_barcoded=targetted_df_sub.copy()\n",
  "targetted_df_barcoded.index=multI"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Split by assay and merge side by side"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "groupings_L=['sample_barcode','Chr','Pos','base']\n",
  "\n",
  "def first_per_sample_base(df,uuids):\n",
  "    \"\"\"Keep the rows of `df` whose Run_digits is in `uuids`, then collapse them to the\n",
  "    first row per (sample_barcode, Chr, Pos, base).\n",
  "\n",
  "    bam-readcount sometimes emits duplicate rows, hence the groupby(...).first().\n",
  "    \"\"\"\n",
  "    m_uuid=df.index.get_level_values('Run_digits').isin(uuids)\n",
  "    return df[m_uuid].groupby(groupings_L).first()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "strategy=manifest_df_w_RNA_WXS['experimental_strategy']\n",
  "wxs_uuids=manifest_df_w_RNA_WXS.loc[strategy=='WXS','file_id'].unique()\n",
  "rnaseq_uuids=manifest_df_w_RNA_WXS.loc[strategy=='RNA-Seq','file_id'].unique()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "%time wxs_df=first_per_sample_base(targetted_df_barcoded,wxs_uuids)\n",
  "%time rnaseq_df=first_per_sample_base(targetted_df_barcoded,rnaseq_uuids)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# columns become (assay, feature); rows that are empty in both assays are dropped\n",
  "mergedDf=pd.concat([ rnaseq_df,wxs_df],axis=1,keys=['rna-seq','wxs']).dropna(how='all')"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## identify IDH1 mutations"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "#ref C\n",
  "IDH1_coordinate=query_coordinate\n",
  "mergedDf_subDf=mergedDf[mergedDf.index.get_level_values('Pos')==IDH1_coordinate]"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# keep rows whose WXS average base quality exceeds this cutoff\n",
  "min_base_quality=20\n",
  "m_quality=mergedDf_subDf[('wxs','AverageBaseQuality')]>min_base_quality\n",
  "ref_alt='T'  # alternate base of interest (reference is C)\n",
  "\n",
  "qualFilteredDf=mergedDf_subDf[m_quality]"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# move the base level into the columns: (assay, feature, base)\n",
  "unstackDf=qualFilteredDf.unstack()"
 ] },
 { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rna-seqwxs
featuresReadDepthAverageBaseQualityReadDepthAverageBaseQuality
baseACGTACGTACGTACGT
sample_barcodeChrPos
TCGA-CS-4938-01B2208248388NaN68.0NaN24.0NaN37.0NaN38.0NaN87.0NaN29.0NaN29.0NaN29.0
TCGA-CS-4941-01A2208248388NaN235.0NaNNaNNaN37.0NaNNaNNaN182.0NaNNaNNaN27.0NaNNaN
TCGA-CS-4942-01A2208248388NaN140.0NaN19.0NaN38.0NaN38.0NaN192.0NaN46.0NaN25.0NaN28.0
TCGA-CS-4943-01A2208248388NaN179.0NaN73.0NaN38.0NaN37.0NaN116.0NaN59.0NaN26.0NaN28.0
TCGA-CS-4944-01A2208248388NaN43.0NaN17.0NaN37.0NaN37.0NaN149.0NaN43.0NaN27.0NaN28.0
\n", "
" ], "text/plain": [ " rna-seq \\\n", "features ReadDepth AverageBaseQuality \n", "base A C G T A \n", "sample_barcode Chr Pos \n", "TCGA-CS-4938-01B 2 208248388 NaN 68.0 NaN 24.0 NaN \n", "TCGA-CS-4941-01A 2 208248388 NaN 235.0 NaN NaN NaN \n", "TCGA-CS-4942-01A 2 208248388 NaN 140.0 NaN 19.0 NaN \n", "TCGA-CS-4943-01A 2 208248388 NaN 179.0 NaN 73.0 NaN \n", "TCGA-CS-4944-01A 2 208248388 NaN 43.0 NaN 17.0 NaN \n", "\n", " wxs \\\n", "features ReadDepth \n", "base C G T A C G T \n", "sample_barcode Chr Pos \n", "TCGA-CS-4938-01B 2 208248388 37.0 NaN 38.0 NaN 87.0 NaN 29.0 \n", "TCGA-CS-4941-01A 2 208248388 37.0 NaN NaN NaN 182.0 NaN NaN \n", "TCGA-CS-4942-01A 2 208248388 38.0 NaN 38.0 NaN 192.0 NaN 46.0 \n", "TCGA-CS-4943-01A 2 208248388 38.0 NaN 37.0 NaN 116.0 NaN 59.0 \n", "TCGA-CS-4944-01A 2 208248388 37.0 NaN 37.0 NaN 149.0 NaN 43.0 \n", "\n", " \n", "features AverageBaseQuality \n", "base A C G T \n", "sample_barcode Chr Pos \n", "TCGA-CS-4938-01B 2 208248388 NaN 29.0 NaN 29.0 \n", "TCGA-CS-4941-01A 2 208248388 NaN 27.0 NaN NaN \n", "TCGA-CS-4942-01A 2 208248388 NaN 25.0 NaN 28.0 \n", "TCGA-CS-4943-01A 2 208248388 NaN 26.0 NaN 28.0 \n", "TCGA-CS-4944-01A 2 208248388 NaN 27.0 NaN 28.0 " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unstackDf.head()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "from sklearn import metrics" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAVUAAADTCAYAAAAxkoBfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAF3JJREFUeJzt3X2UHFWZx/Hvj/AmUQRMYGMgDnEje5CFIIMSUDaAKKIniLIsWZZXNaggxPcgurpyPIviG6CiQQOoEGQJSEQUMUtAVt4SJCQhBDBGCAQSwFV8QxOe/ePe2TRjz0ylp6p6evL7nNOnu25X9X260vOkblXdexURmJlZOTZrdwBmZsOJk6qZWYmcVM3MSuSkamZWIidVM7MSOamamZXISdXMrEROqmZmJXJSNTMr0ebtDmAwRo0aFV1dXfVWunx5et5tt3rrNbPaLFy48MmIGN3Kth2dVLu6uliwYEG9lU6enJ7nz6+3XjOrjaRft7qtm/9mZiVyUjUzK5GTqplZiZxUzcxK5KRqZlaijr7634quGT/s9/2V57y5pkjMbDjykaqZWYmcVM3MSuSkamZWIidVM7MSOamamZWosqQqaZakNZKWNJR9StKjku7Jj8Mb3jtT0kOSlkt6Y1VxmZlVqcoj1UuAw5qUfykiJubH9QCSdgeOAV6Zt/mapBEVxmZmVonKkmpE3AI8XXD1I4ArIuLZiPgV8BDw6qpiMzOrSjvOqZ4m6d58emD7XDYWeKRhnVW57G9ImiZpgaQFa9eurTpWM7ONUndSvRB4OTARWA18IZerybrR7AMiYmZEdEdE9+jRLY0ha2ZWmVqTakQ8ERHrI+I54CI2NPFXAbs0rLoz8FidsZmZlaHWpCppTMPikUDPnQFzgWMkbSVpV2ACcGedsZmZlaGyAVUkzQYmA6MkrQI+CUyWNJHUtF8JnAIQEUslXQncB6wDTo2I9VXFZmZWlcqSakRMbVL8rX7W/wzwmariMTOrg3tUmZmVyEnVzKxETqpmZiVyUjUzK5GTqplZiZxUzcxK5KRqZlYiJ1UzsxI5qZqZlchJ1cysRE6qZmYlclI1MyvRgElV0khJm+XXr5A0RdIW1YdmZtZ5ihyp3gJsLWksMA84iTSpn5mZ9VIkqSoi/gi8DbggIo4Edq82LDOzzlQoqUqaBBwL/DCXVTYOq5lZJyuSVKcDZwLX5BH6xwM3DbRRni11jaQlDWXnSro/z6Z6jaTtcnmXpD9Juic/vt7qFzIza6cBk2pE3BwRU4Cv5OUVEXF6gc++BDisV9mNwB4RsSfwAClZ9/hlREzMj3cXit7MbIgpcvV/kqT7gGV5eS9JXxtou4i4BXi6V9lPImJdXrydNGuqmdmwUaT5/2XgjcBTABGxCDiwhLpPBn7UsLyrpF9IulnS6/raSNI0SQskLVi7dm0JYZiZlafQzf8R8UivokHNdCrpLNKsqZflotXAuIjYG/gAcLmkbfuIZWZEdEdE9+jRowcThplZ6Yok1Uck7Q+EpC0lfYh8KqAVkk4A3gIcGxEBEBHPRkTPkfBC4JfAK1qtw8ysXYok1XcDpwJjgVXAxLy80SQdBnwUmJLvfe0pHy1pRH49HpgArGilDjOzdhrwftOIeJJ0j+pGkTQbmAyMkrQK+CTpav9WwI2SAG7PV/oPBD4taR3p1MK7I+Lpph9sZjaE9ZlUJV0ARF/vD3RbVURMbVL8rT7WnQPM6e/zzMw6QX9Hqgtqi8LMbJjoM6lGxKWNy/lqfETEM5VHZWbWoYrc/N8taTFwL7BE0iJJ+1QfmplZ5ykyMMos4L0R8TMASa8FLgb2rDIwM7NOVOSWqmd6EipARNwK+BSAmVkTRY5U75T0DWA26W6AfwHmS3oVQETcXWF8ZmYdpUhSnZifP9mrfH9Skj241IjMzDpYkZv/D6ojEDOz4WDApJoHkj4e6Gpcv+CYqmZmm5Qizf/rSWOfLgaeqzYcM7P
OViSpbh0RH6g8EjOzYaDILVXfkfQuSWMk7dDzqDwyM7MOVORI9S/AucBZbBhgJYDxVQVlZtapiiTVDwB/n4cANDOzfhRp/i8F/jjgWmZmVuhIdT1wj6SbgGd7CovcUiVpFmnqlDURsUcu2wH4HukWrZXA0RHxG6VRq88DDicl8RPdW8vMOk2RI9XvA58Bfg4sbHgUcQlwWK+yGcC8iJgAzMvLAG8iTaMyAZgGXFiwDjOzIaNIj6pLB1qnn21vkdTVq/gI0jQrAJcC80nzVh0BfDtPBni7pO0kjYmI1a3Wb2ZWtyLjqU6QdJWk+ySt6HkMos6dehJlft4xl48FGqfCXpXLesczTdICSQvWrl07iDDMzMpXpPl/Makpvg44CPg28J0KYlGTsr+ZIysiZkZEd0R0jx49uoIwzMxaVySpviAi5gGKiF9HxKcY3MhUT0gaA5Cf1+TyVcAuDevtDDw2iHrMzGpXJKn+WdJmwIOSTpN0JBua7K2YC5yQX58AXNtQfryS/YDf+nyqmXWaIkl1OrANcDqwD3AcG5JivyTNBm4DdpO0StI7gHOAQyU9CByalyEN3LICeAi4CHjvRnwPM7MhocjV/7vyy9/npPjCiPhdkQ+PiKl9vHVIk3UDOLXI55qZDVVFrv5fLmlbSSOB+4Dlkj5cfWhmZp2nSPN/93xk+lZSE30c6RSAmZn1UiSpbiFpC1JSvTYi/kqTW53MzKxYUv0GqY/+SOAWSS8DCp1TNTPb1AyYVCPi/IgYGxGH54tJD5M6AZiZWS9FRql6npxY11UQi5lZxyvS/Dczs4L6TaqSNpO0f13BmJl1un6TakQ8B3yhpljMzDpekeb/TyS9PY/Mb2Zm/Sg68d9IYL2kP5GG6IuI2LbSyMzMOlCRvv8vqiMQM7PhoNAtVZKmAAfmxfkRcV11IZmZda4iA6qcA5xBGkzlPuCMXGZmZr0UOVI9HJiY7wRA0qXAL9gwC6qZmWVFb/7fruH1i6sIxMxsOChypPqfwC8k3US68n8gcGarFUraDfheQ9F44N9JiftdQM8UqR+LiOtbrcfMrB2KXP2fLWk+sC8pqX40Ih5vtcKIWA5MBJA0AngUuAY4CfhSRHy+1c82M2u3os3/nrmgRwD7S3pbSfUfAvwyIn5d0ueZmbXVgEeqkmYBewJLgedycQBXl1D/McDshuXTJB0PLAA+GBG/aRLPNGAawLhx40oIwcysPEXOqe4XEbuXXbGkLYEpbDg/eyFwNilhn00ac+Dk3ttFxExgJkB3d7dnIDCzIaVI8/82SaUnVeBNwN0R8QRARDwREevzrVsXAa+uoE4zs0oVOVK9lJRYHweeZUPf/z0HWfdUGpr+ksZExOq8eCSwZJCfb2ZWuyJJdRZp9tTFbDinOiiStgEOBU5pKP6cpImk5v/KXu+ZmXWEIkn14YiYW2alEfFH4CW9yjzttZl1vCJJ9X5JlwM/IDX/AYiIMq7+m5kNK0WS6gtIyfQNDWVl3VJlZjasFOlRdVIdgZiZDQdFbv6/mHRk+jwR8Tf3kJqZbeqKNP8bB6TemnS702PVhGNm1tmKNP/nNC5Lmg38tLKIzMw6WNEBVRpNANzp3sysiSLnVJ/h+edUHwc+WllEZmYdzLOpmpmVqMjEf/OKlJmZWT9HqpK2BrYBRknanjSQCsC2wEtriM3MrOP01/w/BZhOSqAL2ZBUfwd8teK4zMw6Up9JNSLOA86T9L6IuKDGmAzomvHDAddZec6ba4jEzDZGkQtVTqhD1ECJ10nXrH6t3KdqZmZ96DOpSjogP29VRcWSVkpaLOkeSQty2Q6SbpT0YH7evoq6zcyq0t+R6vn5+bYK6z8oIiZGRHdengHMi4gJwLy8bGbWMfo7p/rXPELVWEnn934zIk6vIJ4jgMn59aXAfNx7y8w6SH9J9S3A64GDSbdUlS2An0gK4Bt56umdeib/i4jVknbsvZGkacA0gHHjPASBmQ0t/d1S9SRwhaRlEbG
ogroPiIjHcuK8UdL9RTbKyXcmQHd399+M82pm1k5Frv4/JekaSWskPSFpjqSdB1txRDyWn9cA1wCvBp6QNAbSlNXAmsHWY2ZWpyJJ9WJgLqln1VjSBIAXD6ZSSSMlvajnNWn+qyW5nhPyaicA1w6mHjOzuhUZ+X/HiGhMopdImj7IencCrpHUE8PlEfFjSXcBV0p6B/Aw8M+DrMfMrFZFkupaSf8GzM7LU4GnBlNpRKwA9mpS/hRwyGA+28ysnYo0/08GjiYNTr0aOCqXmZlZL0X6/j8MTKkhFiuZB2Uxq5/7/puZlchJ1cysREUuVFkFijTNzazzFJmj6uMNrysZscrMbLjob+i/j0iaRLra36PKEavMzDpef83/5aSb78dL+hmwDHiJpN0iYnkt0ZmZdZj+mv+/AT4GPEQajq9n+L8Zkn5ecVxmZh2pvyPVw4BPAi8HvggsAv4QESfVEZiZWSfq80g1Ij4WEYcAK4HvkhLwaEm3SvpBTfGZmXWUIrdU3RARdwF3SXpPRLxW0qiqAzMz60QD3lIVER9pWDwxlz1ZVUBmZp1so3pUVTQDgJnZsOFuqmZmJao9qUraRdJNkpZJWirpjFz+KUmPSronPw6vOzYzs8FqR9//dcAHI+LuPKXKQkk35ve+FBGfb0NMpXK/frNNV+1JNU9B3TMN9TOSlpHmvjIz63htPacqqQvYG7gjF50m6V5JsyRt38c20yQtkLRg7dq1NUVqZlZM24b+k/RCYA4wPSJ+J+lC4Gwg8vMXaDJtS0TMBGYCdHd3R30RD08DnarwzABmG6ctSVXSFqSEellEXA0QEU80vH8RcF07YrPn85QsZhunHVf/BXwLWBYRX2woH9Ow2pHAkrpjMzMbrHYcqR4AHAcslnRPLvsYMFXSRFLzfyVwShtiMzMblHZc/b8VUJO3rq87FiuHz8uabeAeVWZmJXJSNTMrkZOqmVmJPEV1C25f8RTHuCuqmTXhI1UzsxI5qZqZlchJ1cysRE6qZmYlclI1MyuRr/5b5dzjyjYlPlI1MyuRk6qZWYnc/O9loKbqFSueqimSTYfHbLXhxEnVOoLPy1qncFI1G2J85N7ZnFRtWCgjEQ2nZOYj+/YZcklV0mHAecAI4JsRcU6bQ7JhokjS7BTD6bsMN0MqqUoaAXwVOBRYBdwlaW5E3NfeyMwSHwHaQIZUUgVeDTwUESsAJF0BHAE4qVpH6JQjyLrirOM/maF22kYRUVtlA5F0FHBYRLwzLx8HvCYiTmtYZxowLS/uBizfyGpGAU+WEO5gOIahEwMMjTgcw9CKYWREjG5l46F2pNpsQsDnZf2ImAnMbLkCaUFEdLe6fRkcw9CJYajE4RiGXAxdrW4/1HpUrQJ2aVjeGXisTbGYmW20oZZU7wImSNpV0pbAMcDcNsdkZlbYkGr+R8Q6SacBN5BuqZoVEUtLrqblUwclcgzJUIgBhkYcjiHp+BiG1IUqM7NON9Sa/2ZmHc1J1cysRJtUUpV0mKTlkh6SNKOmOneRdJOkZZKWSjojl+8g6UZJD+bn7WuIZYSkX0i6Li/vKumOHMP38sXBKuvfTtJVku7P+2NS3ftB0vvzv8MSSbMlbV31fpA0S9IaSUsaypp+byXn59/ovZJeVXEc5+Z/j3slXSNpu4b3zsxxLJf0xqpiaHjvQ5JC0qi8XMm+6CsGSe/L33WppM81lG/cfoiITeJBuvD1S2A8sCWwCNi9hnrHAK/Kr18EPADsDnwOmJHLZwCfrSGWDwCXA9fl5SuBY/LrrwPvqbj+S4F35tdbAtvVuR+AscCvgBc0fP8Tq94PwIHAq4AlDWVNvzdwOPAj0j3b+wF3VBzHG4DN8+vPNsSxe/4b2QrYNf/tjKgihly+C+kC9a+BUVXuiz72w0HAT4Gt8vKOre6Hyv6AhtoDmATc0LB8JnBmG+K4ljS2wXJgTC4bAyyvuN6dgXnAwcB1+Yf6ZMMf1PP2TwX1b5sTmnqV17YfclJ
9BNiBdOfLdcAb69gPQFevP+Km3xv4BjC12XpVxNHrvSOBy/Lr5/195IQ3qaoYgKuAvYCVDUm1sn3R5N/jSuD1Tdbb6P2wKTX/e/6geqzKZbWR1AXsDdwB7BQRqwHy844VV/9l4CPAc3n5JcD/RsS6vFz1/hgPrAUuzqcgvilpJDXuh4h4FPg88DCwGvgtsJB690OPvr53O3+nJ5OODGuNQ9IU4NGIWNTrrTr3xSuA1+XTQDdL2rfVGDalpDpgF9hKK5deCMwBpkfE7+qqN9f9FmBNRCxsLG6yapX7Y3NSk+vCiNgb+AOp2VubfN7yCFIz7qXASOBNTVZt532GbfmdSjoLWAdcVmcckrYBzgL+vdnbdcSQbQ5sTzrN8GHgSklqJYZNKam2rQuspC1ICfWyiLg6Fz8haUx+fwywpsIQDgCmSFoJXEE6BfBlYDtJPR1Aqt4fq4BVEXFHXr6KlGTr3A+vB34VEWsj4q/A1cD+1LsfevT1vWv/nUo6AXgLcGzkNm6Ncbyc9J/covz73Bm4W9Lf1RgDua6rI7mT1KIb1UoMm1JSbUsX2Py/3beAZRHxxYa35gIn5NcnkM61ViIizoyInSMNEnEM8N8RcSxwE3BUTTE8DjwiabdcdAhpSMfa9gOp2b+fpG3yv0tPDLXthwZ9fe+5wPH5yvd+wG97ThNUQWlQ+I8CUyLij73iO0bSVpJ2BSYAd5Zdf0QsjogdI6Ir/z5XkS7sPk69++L7pIMNJL2CdCH1SVrZD2WdAO+EB+lq4gOkK3hn1VTna0nNhXuBe/LjcNI5zXnAg/l5h5rimcyGq//j8w/kIeC/yFc+K6x7IrAg74vvk5pbte4H4D+A+4ElwHdIV3Ur3Q/AbNI53L+SksY7+vrepObmV/NvdDHQXXEcD5HOGfb8Nr/esP5ZOY7lwJuqiqHX+yvZcKGqkn3Rx37YEvhu/l3cDRzc6n5wN1UzsxJtSs1/M7PKOamamZXISdXMrEROqmZmJXJSNTMrkZOqtUzSkXlUoX9oKOvqGf1H0mTlEbH6+Yz/X0fSFNU0elgfsfy+TfVOzz2LbBhwUrXBmArcSupQMGgRMTcizmllW0kjyoihTaYDTqrDhJOqtSSPZXAA6cbpAZOqpJF5HMu78oAqRzRZ50RJX8mvd8rjey7Kj/2brP97SZ+WdAcwSdI+eTCMhZJuaOgG+q5c7yJJc3qOCnPvutvye2f3E/vxeTzPRZK+k8teJmleLp8naVwuv0TSUQ3b/j4/T5Y0XxvGk70s9xQ6nTQOwU1K4+6OyJ+xRNJiSe8faN/a0OKkaq16K/DjiHgAeFoDDyB8Fql77L6ksSvPzaNU9eV84OaI2Is0RkCzCSBHkoZvew1p5K8LgKMiYh9gFvCZvN7VEbFv/qxlpP8IAM4jDfCyL/B4syAkvTLHfnDe/oz81leAb0fEnqRBSM4f4PtDGqFsOmmMzvHAARFxPqkv+UERcRCp19nYiNgjIv4RuLjA59oQ4qRqrZpKGpyF/Dx1gPXfAMyQdA8wH9gaGNfP+gcDFwJExPqI+G2TddaTBqoB2A3YA7gx1/Fx0uAXAHtI+pmkxcCxwCtz+QGkLouQuqz2FcdVEfFkjuXpXD6JNOB3z7av7ee79LgzIlZFxHOkLqFdTdZZAYyXdEHul1/riGY2eENqimrrDJJeQko2e0gK0qwKIekj/W0GvD0ilvf6rJ0GEcqfI2J9w+cvjYhJTda7BHhrRCySdCJp/IMeA/XTVoF1Gj9nHflgJQ/a0jg1y7MNr9fT5O8vIn4jaS/S4NmnAkeTxjm1DuEjVWvFUaSm78sijS60C2lU//6O1m4A3pcTDZL2HqCOecB78rojJG07wPrLgdGSJuVttshNd0jT2KxWGoLx2IZt/ocN54Mby3vHcXT+jwRJO+Tyn/fa9tb8eiWwT359BLDFAHEDPJNjRGl+ps0iYg7wCdKpD+sgTqrWiqnANb3K5gD/2s82Z5MSzL35lqs+Lwx
lZwAH5Sb7QjY02ZuKiL+Qkv1nJS0iNa97Lm59gnTO9UbSCFWNdZwq6S7gxX187lLSudmb8+f2DN94OnCSpHuB49hwrvUi4J8k3Qm8hjQY90BmAj+SdBNpVPn5+RTGJaTpPKyDeJQqM7MS+UjVzKxETqpmZiVyUjUzK5GTqplZiZxUzcxK5KRqZlYiJ1UzsxL9H4A2E+BEKjZvAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# threshold drawn on the histogram and reused below to derive the WXS-based label\n",
  "MIN_ALT_READS=10\n",
  "ALT_BASE='T'\n",
  "\n",
  "fig,ax=plt.subplots(figsize=(5,3))\n",
  "\n",
  "unstackDf[('wxs','ReadDepth',ALT_BASE)].fillna(0).hist(bins=30,label='Minor allele',ax=ax)\n",
  "ax.axvline(x=MIN_ALT_READS,c='red')\n",
  "ax.set_ylabel('# of tumor samples')\n",
  "ax.set_xlabel('Allelic read counts')\n",
  "ax.grid(False)\n"
 ] },
 { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "351" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "(unstackDf[('wxs','ReadDepth','T')]>0).sum()"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## RNA-seq alt-allele depth as a predictor of the WXS call"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# WXS-derived label: alt-allele read depth above the threshold (missing depth counts as 0)\n",
  "y_true=unstackDf[('wxs','ReadDepth',ALT_BASE)].fillna(0)>MIN_ALT_READS"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "y_true.value_counts()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# score each sample by its RNA-seq alt-allele read depth\n",
  "y_score=unstackDf[('rna-seq','ReadDepth',ALT_BASE)].fillna(0)\n",
  "\n",
  "print ('ROC AUC',metrics.roc_auc_score(y_true,y_score))\n",
  "precision, recall, thresholds=metrics.precision_recall_curve(y_true,y_score)\n",
  "print ('PR AUC',metrics.auc(recall,precision))\n",
  "\n",
  "ax=pd.DataFrame({'precision':precision,'recall':recall}).plot(x='recall',y='precision')\n",
  "ax.set_ylabel('Precision')\n",
  "ax.set_xlabel('Recall')\n",
  "ax.set_title('IDH1 C>T mutation using RNAseq');"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "import seaborn as sns"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Per-base coverage and RNA-seq vs. WXS depth"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# non-null WXS read depths per base; counting index labels instead would ignore\n",
  "# the selected column and give the same numbers for both assays\n",
  "qualFilteredDf[('wxs','ReadDepth')].groupby(level='base').count()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# same count for the RNA-seq read depths\n",
  "qualFilteredDf[('rna-seq','ReadDepth')].groupby(level='base').count()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "mergedDf_subDf.head()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# NOTE(review): the earlier expression indexed a top-level 'ReadDepth' column that does\n",
  "# not exist (top level is 'rna-seq'/'wxs'); presumably WXS depth > 3 was meant - confirm\n",
  "(qualFilteredDf[('wxs','ReadDepth')]>3).value_counts()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# separate name so the 3-level unstackDf used by the cells above is not overwritten\n",
  "wxs_depth_by_base=qualFilteredDf['wxs'].unstack()['ReadDepth']"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# flatten the two read-depth columns to plain names before handing them to seaborn\n",
  "alt_rows=qualFilteredDf[qualFilteredDf.index.get_level_values('base')==ALT_BASE]\n",
  "alt_depthDf=pd.DataFrame({'RNA-seq read depth':alt_rows[('rna-seq','ReadDepth')],\n",
  "                          'WXS read depth':alt_rows[('wxs','ReadDepth')]})\n",
  "sns.jointplot(data=alt_depthDf,x='RNA-seq read depth',y='WXS read depth');"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "**TODO**: generate the correlation between the data."
 ] }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }