{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tutorial 3: Joining dataframes with `cptac`\n",
    "\n",
    "In this tutorial, we provide several examples of how to use the built-in `cptac` functions for joining different dataframes.\n",
    "\n",
    "We will do this on data for Endometrial carcinoma. First we need to import the package and create an endometrial data object, which we call 'en'."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Data type</th>\n",
       "      <th>Available sources</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CNV</td>\n",
       "      <td>awg, washu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CNV_gistic</td>\n",
       "      <td>awgconf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CNV_log2ratio</td>\n",
       "      <td>awgconf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>acetylproteomics</td>\n",
       "      <td>awg, awgconf, pdc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>acetylproteomics_gene</td>\n",
       "      <td>awgconf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>circular_RNA</td>\n",
       "      <td>awg, awgconf, bcm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>clinical</td>\n",
       "      <td>awg, awgconf, mssm, pdc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>deconvolution_cibersort</td>\n",
       "      <td>washu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>deconvolution_xcell</td>\n",
       "      <td>washu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>derived_molecular</td>\n",
       "      <td>awg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>experimental_design</td>\n",
       "      <td>awg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>followup</td>\n",
       "      <td>awg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>gene_fusion</td>\n",
       "      <td>awgconf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>methylation</td>\n",
       "      <td>awgconf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>miRNA</td>\n",
       "      <td>awg, awgconf, washu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>phosphoproteomics</td>\n",
       "      <td>awg, awgconf, pdc, umich</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>phosphoproteomics_gene</td>\n",
       "      <td>awgconf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>proteomics</td>\n",
       "      <td>awg, awgconf, pdc, umich</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>somatic_mutation</td>\n",
       "      <td>awg, awgconf, harmonized, washu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>somatic_mutation_binary</td>\n",
       "      <td>awg, awgconf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>targeted_phosphoproteomics</td>\n",
       "      <td>awgconf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>targeted_proteomics</td>\n",
       "      <td>awgconf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>transcriptomics</td>\n",
       "      <td>awg, awgconf, bcm, broad, washu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>tumor_purity</td>\n",
       "      <td>washu</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     Data type                Available sources\n",
       "0                          CNV                       awg, washu\n",
       "1                   CNV_gistic                          awgconf\n",
       "2                CNV_log2ratio                          awgconf\n",
       "3             acetylproteomics                awg, awgconf, pdc\n",
       "4        acetylproteomics_gene                          awgconf\n",
       "5                 circular_RNA                awg, awgconf, bcm\n",
       "6                     clinical          awg, awgconf, mssm, pdc\n",
       "7      deconvolution_cibersort                            washu\n",
       "8          deconvolution_xcell                            washu\n",
       "9            derived_molecular                              awg\n",
       "10         experimental_design                              awg\n",
       "11                    followup                              awg\n",
       "12                 gene_fusion                          awgconf\n",
       "13                 methylation                          awgconf\n",
       "14                       miRNA              awg, awgconf, washu\n",
       "15           phosphoproteomics         awg, awgconf, pdc, umich\n",
       "16      phosphoproteomics_gene                          awgconf\n",
       "17                  proteomics         awg, awgconf, pdc, umich\n",
       "18            somatic_mutation  awg, awgconf, harmonized, washu\n",
       "19     somatic_mutation_binary                     awg, awgconf\n",
       "20  targeted_phosphoproteomics                          awgconf\n",
       "21         targeted_proteomics                          awgconf\n",
       "22             transcriptomics  awg, awgconf, bcm, broad, washu\n",
       "23                tumor_purity                            washu"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import cptac\n",
    "en = cptac.Ucec()\n",
    "en.list_data_sources()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## General format\n",
    "\n",
    "cptac has a helpful function called `multi_join`. It allows data from several different cptac dataframes to be joined at the same time.\n",
    "\n",
    "To use `multi_join`, you specify the dataframes you want to join by passing a dictionary of their names to the function call. The function will automatically check that the dataframes whose names you provided are valid for the join function, and print an error message if they aren't.\n",
    "\n",
    "Whenever a column from an -omics dataframe is included in a joined table, the name of the -omics dataframe it came from is joined to the column header, to avoid confusion.\n",
    "\n",
    "If you wish to only include particular columns in the join, include them as values in the dictionary. All values will accept either a single column name as a string, or a list of column name strings. In this use case, we will usually only select specific columns for readability, but you could select the whole dataframe in all these cases, except for the mutations dataframe.\n",
    "\n",
    "The join functions use logic analogous to an SQL INNER JOIN."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Join dictionary\n",
    "\n",
    "The main parameter for the `multi_join` function is a dictionary with source and datatype as a key, and specific columns as a value. Because there are multiple sources for each datatype, the desired source needs to be included. This can be done in two different ways. The first is by using a string that contains the source, a space, and then the datatype. The second is by using a tuple formatted (source, datatype). For example, using:\n",
    "\n",
    "`{('awg', 'proteomics'): ''}`\n",
    "\n",
    "or\n",
    "\n",
    "`{\"awg proteomics\": ''}`\n",
    "\n",
    "as the join dictionary would each result in `multi_join` returning a dataframe containing only awg proteomics data.\n",
    "\n",
    "You'll notice the value in the key:value pair is an empty string. Because a dictionary needs to have a value for each key, the empty string or an empty list mean we want everything from the specified dataframe. If a string or list of strings is specified, the joined dataframe will only contain the specified columns. See below for more examples."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Join omics to omics\n",
    "\n",
    "`multi_join` can join two -omics dataframes to each other. Types of -omics data valid for use with this function are acetylproteomics, CNV, phosphoproteomics, phosphoproteomics_gene, proteomics, and transcriptomics."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                         \r"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>Name</th>\n",
       "      <th>A1BG_awg_proteomics</th>\n",
       "      <th>A2M_awg_proteomics</th>\n",
       "      <th>A2ML1_awg_proteomics</th>\n",
       "      <th>A4GALT_awg_proteomics</th>\n",
       "      <th>AAAS_awg_proteomics</th>\n",
       "      <th>AACS_awg_proteomics</th>\n",
       "      <th>AADAT_awg_proteomics</th>\n",
       "      <th>AAED1_awg_proteomics</th>\n",
       "      <th>AAGAB_awg_proteomics</th>\n",
       "      <th>AAK1_awg_proteomics</th>\n",
       "      <th>...</th>\n",
       "      <th colspan=\"10\" halign=\"left\">ZZZ3_awg_phosphoproteomics</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Site</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>...</th>\n",
       "      <th>S397</th>\n",
       "      <th>S411</th>\n",
       "      <th>S420</th>\n",
       "      <th>S424</th>\n",
       "      <th>S426</th>\n",
       "      <th>S468</th>\n",
       "      <th>S89</th>\n",
       "      <th>T415</th>\n",
       "      <th>T418</th>\n",
       "      <th>Y399</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>-1.180</td>\n",
       "      <td>-0.8630</td>\n",
       "      <td>-0.802</td>\n",
       "      <td>0.222</td>\n",
       "      <td>0.2560</td>\n",
       "      <td>0.6650</td>\n",
       "      <td>1.2800</td>\n",
       "      <td>-0.3390</td>\n",
       "      <td>0.412</td>\n",
       "      <td>-0.664</td>\n",
       "      <td>...</td>\n",
       "      <td>0.18400</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.20500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00008</th>\n",
       "      <td>-0.685</td>\n",
       "      <td>-1.0700</td>\n",
       "      <td>-0.684</td>\n",
       "      <td>0.984</td>\n",
       "      <td>0.1350</td>\n",
       "      <td>0.3340</td>\n",
       "      <td>1.3000</td>\n",
       "      <td>0.1390</td>\n",
       "      <td>1.330</td>\n",
       "      <td>-0.367</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.17100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.393</td>\n",
       "      <td>-0.17100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.1605</td>\n",
       "      <td>-0.0635</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00032</th>\n",
       "      <td>-0.528</td>\n",
       "      <td>-1.3200</td>\n",
       "      <td>0.435</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.2400</td>\n",
       "      <td>1.0400</td>\n",
       "      <td>-0.0213</td>\n",
       "      <td>-0.0479</td>\n",
       "      <td>0.419</td>\n",
       "      <td>-0.500</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00090</th>\n",
       "      <td>-1.670</td>\n",
       "      <td>-1.1900</td>\n",
       "      <td>-0.443</td>\n",
       "      <td>0.243</td>\n",
       "      <td>-0.0993</td>\n",
       "      <td>0.7570</td>\n",
       "      <td>0.7400</td>\n",
       "      <td>-0.9290</td>\n",
       "      <td>0.229</td>\n",
       "      <td>-0.223</td>\n",
       "      <td>...</td>\n",
       "      <td>0.13970</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.55900</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.2980</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00098</th>\n",
       "      <td>-0.374</td>\n",
       "      <td>-0.0206</td>\n",
       "      <td>-0.537</td>\n",
       "      <td>0.311</td>\n",
       "      <td>0.3750</td>\n",
       "      <td>0.0131</td>\n",
       "      <td>-1.1000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.565</td>\n",
       "      <td>-0.101</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.15875</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.196</td>\n",
       "      <td>0.06175</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.2900</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 84211 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "Name       A1BG_awg_proteomics A2M_awg_proteomics A2ML1_awg_proteomics  \\\n",
       "Site                                                                     \n",
       "Patient_ID                                                               \n",
       "C3L-00006               -1.180            -0.8630               -0.802   \n",
       "C3L-00008               -0.685            -1.0700               -0.684   \n",
       "C3L-00032               -0.528            -1.3200                0.435   \n",
       "C3L-00090               -1.670            -1.1900               -0.443   \n",
       "C3L-00098               -0.374            -0.0206               -0.537   \n",
       "\n",
       "Name       A4GALT_awg_proteomics AAAS_awg_proteomics AACS_awg_proteomics  \\\n",
       "Site                                                                       \n",
       "Patient_ID                                                                 \n",
       "C3L-00006                  0.222              0.2560              0.6650   \n",
       "C3L-00008                  0.984              0.1350              0.3340   \n",
       "C3L-00032                    NaN             -0.2400              1.0400   \n",
       "C3L-00090                  0.243             -0.0993              0.7570   \n",
       "C3L-00098                  0.311              0.3750              0.0131   \n",
       "\n",
       "Name       AADAT_awg_proteomics AAED1_awg_proteomics AAGAB_awg_proteomics  \\\n",
       "Site                                                                        \n",
       "Patient_ID                                                                  \n",
       "C3L-00006                1.2800              -0.3390                0.412   \n",
       "C3L-00008                1.3000               0.1390                1.330   \n",
       "C3L-00032               -0.0213              -0.0479                0.419   \n",
       "C3L-00090                0.7400              -0.9290                0.229   \n",
       "C3L-00098               -1.1000                  NaN                0.565   \n",
       "\n",
       "Name       AAK1_awg_proteomics  ... ZZZ3_awg_phosphoproteomics            \\\n",
       "Site                            ...                       S397 S411 S420   \n",
       "Patient_ID                      ...                                        \n",
       "C3L-00006               -0.664  ...                    0.18400  NaN  NaN   \n",
       "C3L-00008               -0.367  ...                   -0.17100  NaN  NaN   \n",
       "C3L-00032               -0.500  ...                        NaN  NaN  NaN   \n",
       "C3L-00090               -0.223  ...                    0.13970  NaN  NaN   \n",
       "C3L-00098               -0.101  ...                   -0.15875  NaN  NaN   \n",
       "\n",
       "Name                                                        \n",
       "Site         S424     S426 S468   S89 T415    T418    Y399  \n",
       "Patient_ID                                                  \n",
       "C3L-00006     NaN -0.20500  NaN   NaN  NaN     NaN     NaN  \n",
       "C3L-00008  -0.393 -0.17100  NaN  0.29  NaN  0.1605 -0.0635  \n",
       "C3L-00032     NaN      NaN  NaN   NaN  NaN     NaN     NaN  \n",
       "C3L-00090     NaN -0.55900  NaN   NaN  NaN     NaN  0.2980  \n",
       "C3L-00098   0.196  0.06175  NaN   NaN  NaN     NaN -0.2900  \n",
       "\n",
       "[5 rows x 84211 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prot_and_phos = en.multi_join({\"awg proteomics\":'', \"awg phosphoproteomics\":''})\n",
    "prot_and_phos.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Joining only specific columns.\n",
    "(Note that when a gene is selected from the phosphoproteomics dataframe, data for all sites of the gene are selected. The same is done for acetylproteomics data.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>Name</th>\n",
       "      <th>A1BG_awg_proteomics</th>\n",
       "      <th colspan=\"2\" halign=\"left\">PIK3CA_awg_phosphoproteomics</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Site</th>\n",
       "      <th></th>\n",
       "      <th>S312</th>\n",
       "      <th>T313</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>-1.180</td>\n",
       "      <td>-0.00615</td>\n",
       "      <td>0.0731</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00008</th>\n",
       "      <td>-0.685</td>\n",
       "      <td>-0.02220</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00032</th>\n",
       "      <td>-0.528</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0830</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00090</th>\n",
       "      <td>-1.670</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.8460</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00098</th>\n",
       "      <td>-0.374</td>\n",
       "      <td>0.43600</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Name       A1BG_awg_proteomics PIK3CA_awg_phosphoproteomics        \n",
       "Site                                                   S312    T313\n",
       "Patient_ID                                                         \n",
       "C3L-00006               -1.180                     -0.00615  0.0731\n",
       "C3L-00008               -0.685                     -0.02220     NaN\n",
       "C3L-00032               -0.528                          NaN  0.0830\n",
       "C3L-00090               -1.670                          NaN -0.8460\n",
       "C3L-00098               -0.374                      0.43600     NaN"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prot_and_phos_selected = en.multi_join({\"awg proteomics\":'A1BG', \"awg phosphoproteomics\":'PIK3CA'})\n",
    "prot_and_phos_selected.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Join metadata to omics\n",
    "\n",
    "The `multi_join` function can also join a metadata dataframe (e.g. clinical or derived_molecular) with an -omics dataframe:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                       \r"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>Sample_ID</th>\n",
       "      <th>Sample_Tumor_Normal</th>\n",
       "      <th>Proteomics_Tumor_Normal</th>\n",
       "      <th>Country</th>\n",
       "      <th>Histologic_Grade_FIGO</th>\n",
       "      <th>Myometrial_invasion_Specify</th>\n",
       "      <th>Histologic_type</th>\n",
       "      <th>Treatment_naive</th>\n",
       "      <th>Tumor_purity</th>\n",
       "      <th>Path_Stage_Primary_Tumor-pT</th>\n",
       "      <th>...</th>\n",
       "      <th>ZWILCH_awg_transcriptomics</th>\n",
       "      <th>ZWINT_awg_transcriptomics</th>\n",
       "      <th>ZXDA_awg_transcriptomics</th>\n",
       "      <th>ZXDB_awg_transcriptomics</th>\n",
       "      <th>ZXDC_awg_transcriptomics</th>\n",
       "      <th>ZYG11A_awg_transcriptomics</th>\n",
       "      <th>ZYG11B_awg_transcriptomics</th>\n",
       "      <th>ZYX_awg_transcriptomics</th>\n",
       "      <th>ZZEF1_awg_transcriptomics</th>\n",
       "      <th>ZZZ3_awg_transcriptomics</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>S001</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>United States</td>\n",
       "      <td>FIGO grade 1</td>\n",
       "      <td>under 50 %</td>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>YES</td>\n",
       "      <td>Normal</td>\n",
       "      <td>pT1a (FIGO IA)</td>\n",
       "      <td>...</td>\n",
       "      <td>11.06</td>\n",
       "      <td>10.73</td>\n",
       "      <td>8.40</td>\n",
       "      <td>9.78</td>\n",
       "      <td>10.88</td>\n",
       "      <td>5.93</td>\n",
       "      <td>11.52</td>\n",
       "      <td>10.23</td>\n",
       "      <td>11.50</td>\n",
       "      <td>11.47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00008</th>\n",
       "      <td>S002</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>United States</td>\n",
       "      <td>FIGO grade 1</td>\n",
       "      <td>under 50 %</td>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>YES</td>\n",
       "      <td>Normal</td>\n",
       "      <td>pT1a (FIGO IA)</td>\n",
       "      <td>...</td>\n",
       "      <td>10.87</td>\n",
       "      <td>11.43</td>\n",
       "      <td>8.39</td>\n",
       "      <td>9.14</td>\n",
       "      <td>10.38</td>\n",
       "      <td>7.25</td>\n",
       "      <td>11.64</td>\n",
       "      <td>10.64</td>\n",
       "      <td>11.26</td>\n",
       "      <td>11.57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00032</th>\n",
       "      <td>S003</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>United States</td>\n",
       "      <td>FIGO grade 2</td>\n",
       "      <td>under 50 %</td>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>YES</td>\n",
       "      <td>Normal</td>\n",
       "      <td>pT1a (FIGO IA)</td>\n",
       "      <td>...</td>\n",
       "      <td>10.06</td>\n",
       "      <td>10.13</td>\n",
       "      <td>8.35</td>\n",
       "      <td>9.27</td>\n",
       "      <td>10.46</td>\n",
       "      <td>6.85</td>\n",
       "      <td>11.60</td>\n",
       "      <td>10.21</td>\n",
       "      <td>11.51</td>\n",
       "      <td>11.09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00084</th>\n",
       "      <td>S004</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Carcinosarcoma</td>\n",
       "      <td>YES</td>\n",
       "      <td>Normal</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00090</th>\n",
       "      <td>S005</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>United States</td>\n",
       "      <td>FIGO grade 2</td>\n",
       "      <td>under 50 %</td>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>YES</td>\n",
       "      <td>Normal</td>\n",
       "      <td>pT1a (FIGO IA)</td>\n",
       "      <td>...</td>\n",
       "      <td>10.29</td>\n",
       "      <td>10.41</td>\n",
       "      <td>9.10</td>\n",
       "      <td>9.59</td>\n",
       "      <td>10.15</td>\n",
       "      <td>7.89</td>\n",
       "      <td>11.90</td>\n",
       "      <td>10.21</td>\n",
       "      <td>11.34</td>\n",
       "      <td>11.51</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 28084 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "Name       Sample_ID Sample_Tumor_Normal Proteomics_Tumor_Normal  \\\n",
       "Patient_ID                                                         \n",
       "C3L-00006       S001               Tumor                   Tumor   \n",
       "C3L-00008       S002               Tumor                   Tumor   \n",
       "C3L-00032       S003               Tumor                   Tumor   \n",
       "C3L-00084       S004               Tumor                   Tumor   \n",
       "C3L-00090       S005               Tumor                   Tumor   \n",
       "\n",
       "Name              Country Histologic_Grade_FIGO Myometrial_invasion_Specify  \\\n",
       "Patient_ID                                                                    \n",
       "C3L-00006   United States          FIGO grade 1                  under 50 %   \n",
       "C3L-00008   United States          FIGO grade 1                  under 50 %   \n",
       "C3L-00032   United States          FIGO grade 2                  under 50 %   \n",
       "C3L-00084             NaN                   NaN                         NaN   \n",
       "C3L-00090   United States          FIGO grade 2                  under 50 %   \n",
       "\n",
       "Name       Histologic_type Treatment_naive Tumor_purity  \\\n",
       "Patient_ID                                                \n",
       "C3L-00006     Endometrioid             YES       Normal   \n",
       "C3L-00008     Endometrioid             YES       Normal   \n",
       "C3L-00032     Endometrioid             YES       Normal   \n",
       "C3L-00084   Carcinosarcoma             YES       Normal   \n",
       "C3L-00090     Endometrioid             YES       Normal   \n",
       "\n",
       "Name       Path_Stage_Primary_Tumor-pT  ... ZWILCH_awg_transcriptomics  \\\n",
       "Patient_ID                              ...                              \n",
       "C3L-00006               pT1a (FIGO IA)  ...                      11.06   \n",
       "C3L-00008               pT1a (FIGO IA)  ...                      10.87   \n",
       "C3L-00032               pT1a (FIGO IA)  ...                      10.06   \n",
       "C3L-00084                          NaN  ...                        NaN   \n",
       "C3L-00090               pT1a (FIGO IA)  ...                      10.29   \n",
       "\n",
       "Name       ZWINT_awg_transcriptomics ZXDA_awg_transcriptomics  \\\n",
       "Patient_ID                                                      \n",
       "C3L-00006                      10.73                     8.40   \n",
       "C3L-00008                      11.43                     8.39   \n",
       "C3L-00032                      10.13                     8.35   \n",
       "C3L-00084                        NaN                      NaN   \n",
       "C3L-00090                      10.41                     9.10   \n",
       "\n",
       "Name       ZXDB_awg_transcriptomics ZXDC_awg_transcriptomics  \\\n",
       "Patient_ID                                                     \n",
       "C3L-00006                      9.78                    10.88   \n",
       "C3L-00008                      9.14                    10.38   \n",
       "C3L-00032                      9.27                    10.46   \n",
       "C3L-00084                       NaN                      NaN   \n",
       "C3L-00090                      9.59                    10.15   \n",
       "\n",
       "Name        ZYG11A_awg_transcriptomics  ZYG11B_awg_transcriptomics  \\\n",
       "Patient_ID                                                           \n",
       "C3L-00006                         5.93                       11.52   \n",
       "C3L-00008                         7.25                       11.64   \n",
       "C3L-00032                         6.85                       11.60   \n",
       "C3L-00084                          NaN                         NaN   \n",
       "C3L-00090                         7.89                       11.90   \n",
       "\n",
       "Name        ZYX_awg_transcriptomics ZZEF1_awg_transcriptomics  \\\n",
       "Patient_ID                                                      \n",
       "C3L-00006                     10.23                     11.50   \n",
       "C3L-00008                     10.64                     11.26   \n",
       "C3L-00032                     10.21                     11.51   \n",
       "C3L-00084                       NaN                       NaN   \n",
       "C3L-00090                     10.21                     11.34   \n",
       "\n",
       "Name       ZZZ3_awg_transcriptomics  \n",
       "Patient_ID                           \n",
       "C3L-00006                     11.47  \n",
       "C3L-00008                     11.57  \n",
       "C3L-00032                     11.09  \n",
       "C3L-00084                       NaN  \n",
       "C3L-00090                     11.51  \n",
       "\n",
       "[5 rows x 28084 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clin_and_tran = en.multi_join({\"awg clinical\":'', \"awg transcriptomics\":''})\n",
    "clin_and_tran.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Joining only specific columns:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>Age</th>\n",
       "      <th>Histologic_type</th>\n",
       "      <th>ZZZ3_awg_transcriptomics</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>64.0</td>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>11.47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00008</th>\n",
       "      <td>58.0</td>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>11.57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00032</th>\n",
       "      <td>50.0</td>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>11.09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00084</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Carcinosarcoma</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00090</th>\n",
       "      <td>75.0</td>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>11.51</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Name         Age Histologic_type  ZZZ3_awg_transcriptomics\n",
       "Patient_ID                                                \n",
       "C3L-00006   64.0    Endometrioid                     11.47\n",
       "C3L-00008   58.0    Endometrioid                     11.57\n",
       "C3L-00032   50.0    Endometrioid                     11.09\n",
       "C3L-00084    NaN  Carcinosarcoma                       NaN\n",
       "C3L-00090   75.0    Endometrioid                     11.51"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clin_and_tran = en.multi_join({\"awg clinical\": [\"Age\", \"Histologic_type\"], \"awg transcriptomics\": \"ZZZ3\"})\n",
    "clin_and_tran.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Join metadata to metadata\n",
    "\n",
    "Of course two metadata dataframes (e.g. clinical or derived_molecular) can also be joined together. Note how we passed a column name to select from the clinical dataframe, but passing an empty string `''` or an empty list `[]` for the column parameter for the derived_molecular dataframe caused the entire dataframe to be selected."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>Histologic_type</th>\n",
       "      <th>Estrogen_Receptor</th>\n",
       "      <th>Estrogen_Receptor_%</th>\n",
       "      <th>Progesterone_Receptor</th>\n",
       "      <th>Progesterone_Receptor_%</th>\n",
       "      <th>MLH1</th>\n",
       "      <th>MLH2</th>\n",
       "      <th>MSH6</th>\n",
       "      <th>PMS2</th>\n",
       "      <th>p53</th>\n",
       "      <th>...</th>\n",
       "      <th>Log2_variant_total</th>\n",
       "      <th>Log2_SNP_total</th>\n",
       "      <th>Log2_INDEL_total</th>\n",
       "      <th>Genomics_subtype</th>\n",
       "      <th>Mutation_signature_C&gt;A</th>\n",
       "      <th>Mutation_signature_C&gt;G</th>\n",
       "      <th>Mutation_signature_C&gt;T</th>\n",
       "      <th>Mutation_signature_T&gt;C</th>\n",
       "      <th>Mutation_signature_T&gt;A</th>\n",
       "      <th>Mutation_signature_T&gt;G</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Loss of nuclear expression</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>...</td>\n",
       "      <td>10.062046</td>\n",
       "      <td>9.984418</td>\n",
       "      <td>5.832890</td>\n",
       "      <td>MSI-H</td>\n",
       "      <td>8.300395</td>\n",
       "      <td>1.482213</td>\n",
       "      <td>72.529644</td>\n",
       "      <td>14.426877</td>\n",
       "      <td>1.383399</td>\n",
       "      <td>1.877470</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00008</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Loss of nuclear expression</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>...</td>\n",
       "      <td>8.861087</td>\n",
       "      <td>8.330917</td>\n",
       "      <td>7.169925</td>\n",
       "      <td>MSI-H</td>\n",
       "      <td>14.641745</td>\n",
       "      <td>2.803738</td>\n",
       "      <td>64.485981</td>\n",
       "      <td>15.264798</td>\n",
       "      <td>0.934579</td>\n",
       "      <td>1.869159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00032</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>...</td>\n",
       "      <td>5.321928</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>3.169925</td>\n",
       "      <td>CNV_low</td>\n",
       "      <td>16.129032</td>\n",
       "      <td>3.225806</td>\n",
       "      <td>70.967742</td>\n",
       "      <td>3.225806</td>\n",
       "      <td>3.225806</td>\n",
       "      <td>3.225806</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00084</th>\n",
       "      <td>Carcinosarcoma</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00090</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Intact nuclear expression</td>\n",
       "      <td>Cannot be determined</td>\n",
       "      <td>...</td>\n",
       "      <td>5.672425</td>\n",
       "      <td>5.523562</td>\n",
       "      <td>2.584963</td>\n",
       "      <td>CNV_low</td>\n",
       "      <td>17.777778</td>\n",
       "      <td>8.888889</td>\n",
       "      <td>62.222222</td>\n",
       "      <td>8.888889</td>\n",
       "      <td>2.222222</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 126 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "Name       Histologic_type     Estrogen_Receptor  Estrogen_Receptor_%  \\\n",
       "Patient_ID                                                              \n",
       "C3L-00006     Endometrioid  Cannot be determined                  NaN   \n",
       "C3L-00008     Endometrioid  Cannot be determined                  NaN   \n",
       "C3L-00032     Endometrioid  Cannot be determined                  NaN   \n",
       "C3L-00084   Carcinosarcoma                   NaN                  NaN   \n",
       "C3L-00090     Endometrioid  Cannot be determined                  NaN   \n",
       "\n",
       "Name       Progesterone_Receptor  Progesterone_Receptor_%  \\\n",
       "Patient_ID                                                  \n",
       "C3L-00006   Cannot be determined                      NaN   \n",
       "C3L-00008   Cannot be determined                      NaN   \n",
       "C3L-00032   Cannot be determined                      NaN   \n",
       "C3L-00084                    NaN                      NaN   \n",
       "C3L-00090   Cannot be determined                      NaN   \n",
       "\n",
       "Name                             MLH1                       MLH2  \\\n",
       "Patient_ID                                                         \n",
       "C3L-00006   Intact nuclear expression  Intact nuclear expression   \n",
       "C3L-00008   Intact nuclear expression  Intact nuclear expression   \n",
       "C3L-00032   Intact nuclear expression  Intact nuclear expression   \n",
       "C3L-00084                         NaN                        NaN   \n",
       "C3L-00090   Intact nuclear expression  Intact nuclear expression   \n",
       "\n",
       "Name                              MSH6                        PMS2  \\\n",
       "Patient_ID                                                           \n",
       "C3L-00006   Loss of nuclear expression   Intact nuclear expression   \n",
       "C3L-00008    Intact nuclear expression  Loss of nuclear expression   \n",
       "C3L-00032    Intact nuclear expression   Intact nuclear expression   \n",
       "C3L-00084                          NaN                         NaN   \n",
       "C3L-00090    Intact nuclear expression   Intact nuclear expression   \n",
       "\n",
       "Name                         p53  ... Log2_variant_total Log2_SNP_total  \\\n",
       "Patient_ID                        ...                                     \n",
       "C3L-00006   Cannot be determined  ...          10.062046       9.984418   \n",
       "C3L-00008   Cannot be determined  ...           8.861087       8.330917   \n",
       "C3L-00032   Cannot be determined  ...           5.321928       5.000000   \n",
       "C3L-00084                    NaN  ...                NaN            NaN   \n",
       "C3L-00090   Cannot be determined  ...           5.672425       5.523562   \n",
       "\n",
       "Name        Log2_INDEL_total  Genomics_subtype  Mutation_signature_C>A  \\\n",
       "Patient_ID                                                               \n",
       "C3L-00006           5.832890             MSI-H                8.300395   \n",
       "C3L-00008           7.169925             MSI-H               14.641745   \n",
       "C3L-00032           3.169925           CNV_low               16.129032   \n",
       "C3L-00084                NaN               NaN                     NaN   \n",
       "C3L-00090           2.584963           CNV_low               17.777778   \n",
       "\n",
       "Name        Mutation_signature_C>G  Mutation_signature_C>T  \\\n",
       "Patient_ID                                                   \n",
       "C3L-00006                 1.482213               72.529644   \n",
       "C3L-00008                 2.803738               64.485981   \n",
       "C3L-00032                 3.225806               70.967742   \n",
       "C3L-00084                      NaN                     NaN   \n",
       "C3L-00090                 8.888889               62.222222   \n",
       "\n",
       "Name        Mutation_signature_T>C  Mutation_signature_T>A  \\\n",
       "Patient_ID                                                   \n",
       "C3L-00006                14.426877                1.383399   \n",
       "C3L-00008                15.264798                0.934579   \n",
       "C3L-00032                 3.225806                3.225806   \n",
       "C3L-00084                      NaN                     NaN   \n",
       "C3L-00090                 8.888889                2.222222   \n",
       "\n",
       "Name        Mutation_signature_T>G  \n",
       "Patient_ID                          \n",
       "C3L-00006                 1.877470  \n",
       "C3L-00008                 1.869159  \n",
       "C3L-00032                 3.225806  \n",
       "C3L-00084                      NaN  \n",
       "C3L-00090                 0.000000  \n",
       "\n",
       "[5 rows x 126 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hist_and_derived_molecular = en.multi_join({\n",
    "    \"awg clinical\": \"Histologic_type\",\n",
    "    \"awg derived_molecular\": '' # Note that by using an empty string or list as the value, we join the entire dataframe\n",
    "})\n",
    "\n",
    "hist_and_derived_molecular.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Join many datatypes together\n",
    "\n",
    "If you need data from three or more dataframes, they can all simply be added to the joining dictionary. The only limit to the number of dataframes the joining dictionary parameter for `multi_join` can take is your imagination."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "cptac warning: The following columns were not found in the awg phosphoproteomics dataframe, so they were inserted into joined table, but filled with NaN: AURKA (<ipython-input-7-8c248f83a0d2>, line 2)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                        \r"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 78 samples for the PTEN gene (<ipython-input-7-8c248f83a0d2>, line 2)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>Name</th>\n",
       "      <th>AURKA_awg_proteomics</th>\n",
       "      <th>TP53_awg_proteomics</th>\n",
       "      <th>AURKA_awg_phosphoproteomics</th>\n",
       "      <th colspan=\"2\" halign=\"left\">TP53_awg_phosphoproteomics</th>\n",
       "      <th>Sample_ID</th>\n",
       "      <th>Sample_Tumor_Normal</th>\n",
       "      <th>Proteomics_Tumor_Normal</th>\n",
       "      <th>Country</th>\n",
       "      <th>Histologic_Grade_FIGO</th>\n",
       "      <th>...</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Tumor_Site</th>\n",
       "      <th>Tumor_Site_Other</th>\n",
       "      <th>Tumor_Focality</th>\n",
       "      <th>Tumor_Size_cm</th>\n",
       "      <th>Num_full_term_pregnancies</th>\n",
       "      <th>PTEN_Mutation</th>\n",
       "      <th>PTEN_Location</th>\n",
       "      <th>PTEN_Mutation_Status</th>\n",
       "      <th>Sample_Status</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Site</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>NaN</th>\n",
       "      <th>S315</th>\n",
       "      <th>T150</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>...</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.295</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S001</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>United States</td>\n",
       "      <td>FIGO grade 1</td>\n",
       "      <td>...</td>\n",
       "      <td>Female</td>\n",
       "      <td>Anterior endometrium</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Unifocal</td>\n",
       "      <td>2.9</td>\n",
       "      <td>1</td>\n",
       "      <td>[Missense_Mutation, Nonsense_Mutation]</td>\n",
       "      <td>[p.R130Q, p.R233*]</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00008</th>\n",
       "      <td>0.311</td>\n",
       "      <td>0.277</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.646</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S002</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>United States</td>\n",
       "      <td>FIGO grade 1</td>\n",
       "      <td>...</td>\n",
       "      <td>Female</td>\n",
       "      <td>Posterior endometrium</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Unifocal</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>[Missense_Mutation]</td>\n",
       "      <td>[p.G127R]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00032</th>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.871</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.800</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S003</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>United States</td>\n",
       "      <td>FIGO grade 2</td>\n",
       "      <td>...</td>\n",
       "      <td>Female</td>\n",
       "      <td>Other, specify</td>\n",
       "      <td>Anterior and Posterior endometrium</td>\n",
       "      <td>Unifocal</td>\n",
       "      <td>4.5</td>\n",
       "      <td>4 or more</td>\n",
       "      <td>[Nonsense_Mutation]</td>\n",
       "      <td>[p.W111*]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00084</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S004</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[Wildtype_Tumor]</td>\n",
       "      <td>[No_mutation]</td>\n",
       "      <td>Wildtype_Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00090</th>\n",
       "      <td>-0.798</td>\n",
       "      <td>-0.343</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S005</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "      <td>United States</td>\n",
       "      <td>FIGO grade 2</td>\n",
       "      <td>...</td>\n",
       "      <td>Female</td>\n",
       "      <td>Other, specify</td>\n",
       "      <td>Anterior and Posterior endometrium</td>\n",
       "      <td>Unifocal</td>\n",
       "      <td>3.5</td>\n",
       "      <td>4 or more</td>\n",
       "      <td>[Missense_Mutation]</td>\n",
       "      <td>[p.R130G]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 36 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "Name       AURKA_awg_proteomics TP53_awg_proteomics  \\\n",
       "Site                                                  \n",
       "Patient_ID                                            \n",
       "C3L-00006                   NaN               0.295   \n",
       "C3L-00008                 0.311               0.277   \n",
       "C3L-00032                   NaN              -0.871   \n",
       "C3L-00084                   NaN                 NaN   \n",
       "C3L-00090                -0.798              -0.343   \n",
       "\n",
       "Name       AURKA_awg_phosphoproteomics TP53_awg_phosphoproteomics       \\\n",
       "Site                               NaN                       S315 T150   \n",
       "Patient_ID                                                               \n",
       "C3L-00006                          NaN                        NaN  NaN   \n",
       "C3L-00008                          NaN                      0.646  NaN   \n",
       "C3L-00032                          NaN                     -0.800  NaN   \n",
       "C3L-00084                          NaN                        NaN  NaN   \n",
       "C3L-00090                          NaN                        NaN  NaN   \n",
       "\n",
       "Name       Sample_ID Sample_Tumor_Normal Proteomics_Tumor_Normal  \\\n",
       "Site                                                               \n",
       "Patient_ID                                                         \n",
       "C3L-00006       S001               Tumor                   Tumor   \n",
       "C3L-00008       S002               Tumor                   Tumor   \n",
       "C3L-00032       S003               Tumor                   Tumor   \n",
       "C3L-00084       S004               Tumor                   Tumor   \n",
       "C3L-00090       S005               Tumor                   Tumor   \n",
       "\n",
       "Name              Country Histologic_Grade_FIGO  ...  Gender  \\\n",
       "Site                                             ...           \n",
       "Patient_ID                                       ...           \n",
       "C3L-00006   United States          FIGO grade 1  ...  Female   \n",
       "C3L-00008   United States          FIGO grade 1  ...  Female   \n",
       "C3L-00032   United States          FIGO grade 2  ...  Female   \n",
       "C3L-00084             NaN                   NaN  ...     NaN   \n",
       "C3L-00090   United States          FIGO grade 2  ...  Female   \n",
       "\n",
       "Name                   Tumor_Site                    Tumor_Site_Other  \\\n",
       "Site                                                                    \n",
       "Patient_ID                                                              \n",
       "C3L-00006    Anterior endometrium                                 NaN   \n",
       "C3L-00008   Posterior endometrium                                 NaN   \n",
       "C3L-00032          Other, specify  Anterior and Posterior endometrium   \n",
       "C3L-00084                     NaN                                 NaN   \n",
       "C3L-00090          Other, specify  Anterior and Posterior endometrium   \n",
       "\n",
       "Name       Tumor_Focality Tumor_Size_cm Num_full_term_pregnancies  \\\n",
       "Site                                                                \n",
       "Patient_ID                                                          \n",
       "C3L-00006        Unifocal           2.9                         1   \n",
       "C3L-00008        Unifocal           3.5                         1   \n",
       "C3L-00032        Unifocal           4.5                 4 or more   \n",
       "C3L-00084             NaN           NaN                       NaN   \n",
       "C3L-00090        Unifocal           3.5                 4 or more   \n",
       "\n",
       "Name                                 PTEN_Mutation       PTEN_Location  \\\n",
       "Site                                                                     \n",
       "Patient_ID                                                               \n",
       "C3L-00006   [Missense_Mutation, Nonsense_Mutation]  [p.R130Q, p.R233*]   \n",
       "C3L-00008                      [Missense_Mutation]           [p.G127R]   \n",
       "C3L-00032                      [Nonsense_Mutation]           [p.W111*]   \n",
       "C3L-00084                         [Wildtype_Tumor]       [No_mutation]   \n",
       "C3L-00090                      [Missense_Mutation]           [p.R130G]   \n",
       "\n",
       "Name       PTEN_Mutation_Status Sample_Status  \n",
       "Site                                           \n",
       "Patient_ID                                     \n",
       "C3L-00006     Multiple_mutation         Tumor  \n",
       "C3L-00008       Single_mutation         Tumor  \n",
       "C3L-00032       Single_mutation         Tumor  \n",
       "C3L-00084        Wildtype_Tumor         Tumor  \n",
       "C3L-00090       Single_mutation         Tumor  \n",
       "\n",
       "[5 rows x 36 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One entry per table to join, keyed by \"<source> <datatype>\".\n",
    "joining_dictionary = {\n",
    "    \"awg proteomics\": [\"AURKA\", \"TP53\"],\n",
    "    \"awg phosphoproteomics\": [\"AURKA\", \"TP53\"],\n",
    "    \"awg clinical\": [],\n",
    "    \"awg somatic_mutation\": \"PTEN\",\n",
    "}\n",
    "en.multi_join(joining_dictionary).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`multi_join` does not necessarily need to join different dataframes. If you just want a small amount of information from a dataframe, this function is useful for that as well."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>Histologic_type</th>\n",
       "      <th>Histologic_Grade_FIGO</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>FIGO grade 1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00008</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>FIGO grade 1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00032</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>FIGO grade 2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00084</th>\n",
       "      <td>Carcinosarcoma</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00090</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>FIGO grade 2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Name       Histologic_type Histologic_Grade_FIGO\n",
       "Patient_ID                                      \n",
       "C3L-00006     Endometrioid          FIGO grade 1\n",
       "C3L-00008     Endometrioid          FIGO grade 1\n",
       "C3L-00032     Endometrioid          FIGO grade 2\n",
       "C3L-00084   Carcinosarcoma                   NaN\n",
       "C3L-00090     Endometrioid          FIGO grade 2"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Select two clinical columns only; no other table is joined here.\n",
    "histologic_type_and_grade = en.multi_join(\n",
    "    {\"awg clinical\": [\"Histologic_type\", \"Histologic_Grade_FIGO\"]}\n",
    ")\n",
    "histologic_type_and_grade.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Join omics to mutations\n",
    "\n",
    "Joining an -omics dataframe with the mutation data for a specified gene or genes is slightly different than other types of joins using `multi_join`. Because there may be multiple mutations for one gene in a single sample, the mutation type and location data are returned in lists by default, even if there is only one mutation. If there is no mutation for the gene in a particular sample, the list contains either \"Wildtype_Tumor\" or \"Wildtype_Normal\", depending on whether it's a tumor or normal sample. The mutation status column contains either \"Single_mutation\", \"Multiple_mutation\", \"Wildtype_Tumor\", or \"Wildtype_Normal\", for help with parsing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 69 samples for the PTEN gene (<ipython-input-9-0ffecace8e23>, line 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>AURKA_awg_proteomics</th>\n",
       "      <th>TP53_awg_proteomics</th>\n",
       "      <th>PTEN_Mutation</th>\n",
       "      <th>PTEN_Location</th>\n",
       "      <th>PTEN_Mutation_Status</th>\n",
       "      <th>Sample_Status</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.2950</td>\n",
       "      <td>[Missense_Mutation, Nonsense_Mutation]</td>\n",
       "      <td>[p.R130Q, p.R233*]</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00008</th>\n",
       "      <td>0.31100</td>\n",
       "      <td>0.2770</td>\n",
       "      <td>[Missense_Mutation]</td>\n",
       "      <td>[p.G127R]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00032</th>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.8710</td>\n",
       "      <td>[Nonsense_Mutation]</td>\n",
       "      <td>[p.W111*]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00090</th>\n",
       "      <td>-0.79800</td>\n",
       "      <td>-0.3430</td>\n",
       "      <td>[Missense_Mutation]</td>\n",
       "      <td>[p.R130G]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00098</th>\n",
       "      <td>3.11000</td>\n",
       "      <td>3.0100</td>\n",
       "      <td>[Wildtype_Tumor]</td>\n",
       "      <td>[No_mutation]</td>\n",
       "      <td>Wildtype_Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00136</th>\n",
       "      <td>-1.65000</td>\n",
       "      <td>-0.1480</td>\n",
       "      <td>[Missense_Mutation, Missense_Mutation]</td>\n",
       "      <td>[p.Y68C, p.R130G]</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00137</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.4410</td>\n",
       "      <td>[Frame_Shift_Ins, Nonsense_Mutation]</td>\n",
       "      <td>[p.H118Qfs*8, p.Y180*]</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00139</th>\n",
       "      <td>0.84800</td>\n",
       "      <td>-1.2200</td>\n",
       "      <td>[Wildtype_Tumor]</td>\n",
       "      <td>[No_mutation]</td>\n",
       "      <td>Wildtype_Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00143</th>\n",
       "      <td>-1.73000</td>\n",
       "      <td>-0.0825</td>\n",
       "      <td>[Missense_Mutation]</td>\n",
       "      <td>[p.R130G]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00145</th>\n",
       "      <td>-0.00513</td>\n",
       "      <td>-0.1810</td>\n",
       "      <td>[Missense_Mutation, Frame_Shift_Ins]</td>\n",
       "      <td>[p.H93R, p.E242*]</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Name        AURKA_awg_proteomics  TP53_awg_proteomics  \\\n",
       "Patient_ID                                              \n",
       "C3L-00006                    NaN               0.2950   \n",
       "C3L-00008                0.31100               0.2770   \n",
       "C3L-00032                    NaN              -0.8710   \n",
       "C3L-00090               -0.79800              -0.3430   \n",
       "C3L-00098                3.11000               3.0100   \n",
       "C3L-00136               -1.65000              -0.1480   \n",
       "C3L-00137                    NaN               0.4410   \n",
       "C3L-00139                0.84800              -1.2200   \n",
       "C3L-00143               -1.73000              -0.0825   \n",
       "C3L-00145               -0.00513              -0.1810   \n",
       "\n",
       "Name                                 PTEN_Mutation           PTEN_Location  \\\n",
       "Patient_ID                                                                   \n",
       "C3L-00006   [Missense_Mutation, Nonsense_Mutation]      [p.R130Q, p.R233*]   \n",
       "C3L-00008                      [Missense_Mutation]               [p.G127R]   \n",
       "C3L-00032                      [Nonsense_Mutation]               [p.W111*]   \n",
       "C3L-00090                      [Missense_Mutation]               [p.R130G]   \n",
       "C3L-00098                         [Wildtype_Tumor]           [No_mutation]   \n",
       "C3L-00136   [Missense_Mutation, Missense_Mutation]       [p.Y68C, p.R130G]   \n",
       "C3L-00137     [Frame_Shift_Ins, Nonsense_Mutation]  [p.H118Qfs*8, p.Y180*]   \n",
       "C3L-00139                         [Wildtype_Tumor]           [No_mutation]   \n",
       "C3L-00143                      [Missense_Mutation]               [p.R130G]   \n",
       "C3L-00145     [Missense_Mutation, Frame_Shift_Ins]       [p.H93R, p.E242*]   \n",
       "\n",
       "Name       PTEN_Mutation_Status Sample_Status  \n",
       "Patient_ID                                     \n",
       "C3L-00006     Multiple_mutation         Tumor  \n",
       "C3L-00008       Single_mutation         Tumor  \n",
       "C3L-00032       Single_mutation         Tumor  \n",
       "C3L-00090       Single_mutation         Tumor  \n",
       "C3L-00098        Wildtype_Tumor         Tumor  \n",
       "C3L-00136     Multiple_mutation         Tumor  \n",
       "C3L-00137     Multiple_mutation         Tumor  \n",
       "C3L-00139        Wildtype_Tumor         Tumor  \n",
       "C3L-00143       Single_mutation         Tumor  \n",
       "C3L-00145     Multiple_mutation         Tumor  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mutation type and location are returned as lists, one entry per mutation.\n",
    "selected_acet_and_PTEN_mut_mult = en.multi_join(\n",
    "    {\n",
    "        \"awg proteomics\": [\"AURKA\", \"TP53\"],\n",
    "        \"awg somatic_mutation\": \"PTEN\",\n",
    "    }\n",
    ")\n",
    "selected_acet_and_PTEN_mut_mult.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/robertoldroyd/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3437: UserWarning: No source specified for proteomics data. Source awg used, pass a source to the omics_source parameter to prevent this warning\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "/Users/robertoldroyd/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3437: UserWarning: No source specified for mutations data. Source awg used, pass a source to the mutations_source parameter to prevent this warning\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 69 samples for the PTEN gene (/Users/robertoldroyd/opt/anaconda3/lib/python3.8/site-packages/cptac/cancers/cancer.py, line 387)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>AURKA_awg_proteomics</th>\n",
       "      <th>TP53_awg_proteomics</th>\n",
       "      <th>PTEN_Mutation</th>\n",
       "      <th>PTEN_Location</th>\n",
       "      <th>PTEN_Mutation_Status</th>\n",
       "      <th>Sample_Status</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.2950</td>\n",
       "      <td>[Missense_Mutation, Nonsense_Mutation]</td>\n",
       "      <td>[p.R130Q, p.R233*]</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00008</th>\n",
       "      <td>0.31100</td>\n",
       "      <td>0.2770</td>\n",
       "      <td>[Missense_Mutation]</td>\n",
       "      <td>[p.G127R]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00032</th>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.8710</td>\n",
       "      <td>[Nonsense_Mutation]</td>\n",
       "      <td>[p.W111*]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00090</th>\n",
       "      <td>-0.79800</td>\n",
       "      <td>-0.3430</td>\n",
       "      <td>[Missense_Mutation]</td>\n",
       "      <td>[p.R130G]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00098</th>\n",
       "      <td>3.11000</td>\n",
       "      <td>3.0100</td>\n",
       "      <td>[Wildtype_Tumor]</td>\n",
       "      <td>[No_mutation]</td>\n",
       "      <td>Wildtype_Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00136</th>\n",
       "      <td>-1.65000</td>\n",
       "      <td>-0.1480</td>\n",
       "      <td>[Missense_Mutation, Missense_Mutation]</td>\n",
       "      <td>[p.Y68C, p.R130G]</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00137</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.4410</td>\n",
       "      <td>[Frame_Shift_Ins, Nonsense_Mutation]</td>\n",
       "      <td>[p.H118Qfs*8, p.Y180*]</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00139</th>\n",
       "      <td>0.84800</td>\n",
       "      <td>-1.2200</td>\n",
       "      <td>[Wildtype_Tumor]</td>\n",
       "      <td>[No_mutation]</td>\n",
       "      <td>Wildtype_Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00143</th>\n",
       "      <td>-1.73000</td>\n",
       "      <td>-0.0825</td>\n",
       "      <td>[Missense_Mutation]</td>\n",
       "      <td>[p.R130G]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00145</th>\n",
       "      <td>-0.00513</td>\n",
       "      <td>-0.1810</td>\n",
       "      <td>[Missense_Mutation, Frame_Shift_Ins]</td>\n",
       "      <td>[p.H93R, p.E242*]</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Name        AURKA_awg_proteomics  TP53_awg_proteomics  \\\n",
       "Patient_ID                                              \n",
       "C3L-00006                    NaN               0.2950   \n",
       "C3L-00008                0.31100               0.2770   \n",
       "C3L-00032                    NaN              -0.8710   \n",
       "C3L-00090               -0.79800              -0.3430   \n",
       "C3L-00098                3.11000               3.0100   \n",
       "C3L-00136               -1.65000              -0.1480   \n",
       "C3L-00137                    NaN               0.4410   \n",
       "C3L-00139                0.84800              -1.2200   \n",
       "C3L-00143               -1.73000              -0.0825   \n",
       "C3L-00145               -0.00513              -0.1810   \n",
       "\n",
       "Name                                 PTEN_Mutation           PTEN_Location  \\\n",
       "Patient_ID                                                                   \n",
       "C3L-00006   [Missense_Mutation, Nonsense_Mutation]      [p.R130Q, p.R233*]   \n",
       "C3L-00008                      [Missense_Mutation]               [p.G127R]   \n",
       "C3L-00032                      [Nonsense_Mutation]               [p.W111*]   \n",
       "C3L-00090                      [Missense_Mutation]               [p.R130G]   \n",
       "C3L-00098                         [Wildtype_Tumor]           [No_mutation]   \n",
       "C3L-00136   [Missense_Mutation, Missense_Mutation]       [p.Y68C, p.R130G]   \n",
       "C3L-00137     [Frame_Shift_Ins, Nonsense_Mutation]  [p.H118Qfs*8, p.Y180*]   \n",
       "C3L-00139                         [Wildtype_Tumor]           [No_mutation]   \n",
       "C3L-00143                      [Missense_Mutation]               [p.R130G]   \n",
       "C3L-00145     [Missense_Mutation, Frame_Shift_Ins]       [p.H93R, p.E242*]   \n",
       "\n",
       "Name       PTEN_Mutation_Status Sample_Status  \n",
       "Patient_ID                                     \n",
       "C3L-00006     Multiple_mutation         Tumor  \n",
       "C3L-00008       Single_mutation         Tumor  \n",
       "C3L-00032       Single_mutation         Tumor  \n",
       "C3L-00090       Single_mutation         Tumor  \n",
       "C3L-00098        Wildtype_Tumor         Tumor  \n",
       "C3L-00136     Multiple_mutation         Tumor  \n",
       "C3L-00137     Multiple_mutation         Tumor  \n",
       "C3L-00139        Wildtype_Tumor         Tumor  \n",
       "C3L-00143       Single_mutation         Tumor  \n",
       "C3L-00145     Multiple_mutation         Tumor  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same join as the previous cell, written with the dedicated helper function.\n",
    "# Both sources are passed explicitly so cptac does not fall back to a default\n",
    "# source and emit a 'No source specified' UserWarning for each table.\n",
    "selected_acet_and_PTEN_mut = en.join_omics_to_mutations(\n",
    "    omics_name=\"proteomics\",\n",
    "    omics_source=\"awg\",\n",
    "    mutations_genes=\"PTEN\",\n",
    "    mutations_source=\"awg\",\n",
    "    omics_genes=[\"AURKA\", \"TP53\"])\n",
    "\n",
    "selected_acet_and_PTEN_mut.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filtering multiple mutations\n",
    "\n",
    "The function has the ability to filter multiple mutations down to just one mutation. It allows you to specify particular mutation types or locations to prioritize, and also provides a default sorting hierarchy for all other mutations. The default hierarchy chooses truncation mutations over missense mutations, and silent mutations last of all. If there are multiple mutations of the same type, it chooses the mutation occurring earlier in the sequence. \n",
    "\n",
    "To filter all mutations based on this default hierarchy, simply pass an empty list to the optional `mutations_filter` parameter. Notice how in sample C3L-00006, the nonsense mutation was chosen over the missense mutation, because it's a type of truncation mutation, even though the missense mutation occurs earlier in the peptide sequence. In sample C3L-00137, both mutations were types of truncation mutations, so the function just chose the earlier one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 69 samples for the PTEN gene (<ipython-input-11-520a06c702d8>, line 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>AURKA_awg_proteomics</th>\n",
       "      <th>TP53_awg_proteomics</th>\n",
       "      <th>PTEN_Mutation</th>\n",
       "      <th>PTEN_Location</th>\n",
       "      <th>PTEN_Mutation_Status</th>\n",
       "      <th>Sample_Status</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.295</td>\n",
       "      <td>Nonsense_Mutation</td>\n",
       "      <td>p.R233*</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00137</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.441</td>\n",
       "      <td>Frame_Shift_Ins</td>\n",
       "      <td>p.H118Qfs*8</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Name        AURKA_awg_proteomics  TP53_awg_proteomics      PTEN_Mutation  \\\n",
       "Patient_ID                                                                 \n",
       "C3L-00006                    NaN                0.295  Nonsense_Mutation   \n",
       "C3L-00137                    NaN                0.441    Frame_Shift_Ins   \n",
       "\n",
       "Name       PTEN_Location PTEN_Mutation_Status Sample_Status  \n",
       "Patient_ID                                                   \n",
       "C3L-00006        p.R233*    Multiple_mutation         Tumor  \n",
       "C3L-00137    p.H118Qfs*8    Multiple_mutation         Tumor  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# An empty filter list applies the default mutation-priority hierarchy.\n",
    "PTEN_default_filter = en.multi_join(\n",
    "    {\n",
    "        \"awg proteomics\": [\"AURKA\", \"TP53\"],\n",
    "        \"awg somatic_mutation\": \"PTEN\",\n",
    "    },\n",
    "    mutations_filter=[],\n",
    ")\n",
    "PTEN_default_filter.loc[[\"C3L-00006\", \"C3L-00137\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To prioritize a particular type of mutation, or a particular location, include it in the `mutations_filter` list. Below, we tell the function to prioritize nonsense mutations over all other mutations. Notice how in sample C3L-00137, the nonsense mutation is now selected instead of the frameshift insertion, even though the nonsense mutation occurs later in the peptide sequence."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 69 samples for the PTEN gene (<ipython-input-12-e925d3d5980f>, line 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>AURKA_awg_proteomics</th>\n",
       "      <th>TP53_awg_proteomics</th>\n",
       "      <th>PTEN_Mutation</th>\n",
       "      <th>PTEN_Location</th>\n",
       "      <th>PTEN_Mutation_Status</th>\n",
       "      <th>Sample_Status</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.295</td>\n",
       "      <td>Nonsense_Mutation</td>\n",
       "      <td>p.R233*</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00137</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.441</td>\n",
       "      <td>Nonsense_Mutation</td>\n",
       "      <td>p.Y180*</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Name        AURKA_awg_proteomics  TP53_awg_proteomics      PTEN_Mutation  \\\n",
       "Patient_ID                                                                 \n",
       "C3L-00006                    NaN                0.295  Nonsense_Mutation   \n",
       "C3L-00137                    NaN                0.441  Nonsense_Mutation   \n",
       "\n",
       "Name       PTEN_Location PTEN_Mutation_Status Sample_Status  \n",
       "Patient_ID                                                   \n",
       "C3L-00006        p.R233*    Multiple_mutation         Tumor  \n",
       "C3L-00137        p.Y180*    Multiple_mutation         Tumor  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Values listed in mutations_filter are prioritized over all other mutations.\n",
    "PTEN_simple_filter = en.multi_join(\n",
    "    {\n",
    "        \"awg proteomics\": [\"AURKA\", \"TP53\"],\n",
    "        \"awg somatic_mutation\": \"PTEN\",\n",
    "    },\n",
    "    mutations_filter=[\"Nonsense_Mutation\"],\n",
    ")\n",
    "PTEN_simple_filter.loc[[\"C3L-00006\", \"C3L-00137\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can include multiple mutation types and/or locations in the `mutations_filter` list. Values earlier in the list will be prioritized over values later in the list. For example, with the filter we specify below, the function first selects sample C3L-00006's missense mutation over its nonsense mutation, because we put the location of C3L-00006's missense mutation as the first value in our filter list. We still included Nonsense_Mutation in the filter list, but it comes after the location of C3L-00006's missense mutation, which is why C3L-00006's missense mutation is still prioritized. However, on all other samples, unless they also have a mutation at that same location, the function will continue prioritizing nonsense mutations, as we see in sample C3L-00137."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 69 samples for the PTEN gene (<ipython-input-13-3cf83de88378>, line 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>AURKA_awg_proteomics</th>\n",
       "      <th>TP53_awg_proteomics</th>\n",
       "      <th>PTEN_Mutation</th>\n",
       "      <th>PTEN_Location</th>\n",
       "      <th>PTEN_Mutation_Status</th>\n",
       "      <th>Sample_Status</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.295</td>\n",
       "      <td>Missense_Mutation</td>\n",
       "      <td>p.R130Q</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00137</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.441</td>\n",
       "      <td>Nonsense_Mutation</td>\n",
       "      <td>p.Y180*</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Name        AURKA_awg_proteomics  TP53_awg_proteomics      PTEN_Mutation  \\\n",
       "Patient_ID                                                                 \n",
       "C3L-00006                    NaN                0.295  Missense_Mutation   \n",
       "C3L-00137                    NaN                0.441  Nonsense_Mutation   \n",
       "\n",
       "Name       PTEN_Location PTEN_Mutation_Status Sample_Status  \n",
       "Patient_ID                                                   \n",
       "C3L-00006        p.R130Q    Multiple_mutation         Tumor  \n",
       "C3L-00137        p.Y180*    Multiple_mutation         Tumor  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PTEN_complex_filter = en.multi_join({\"awg proteomics\": [\"AURKA\", \"TP53\"],\n",
    "                                    \"awg somatic_mutation\": \"PTEN\"}, \n",
    "                                    mutations_filter=[\"p.R130Q\", \"Nonsense_Mutation\"])\n",
    "PTEN_complex_filter.loc[[\"C3L-00006\", \"C3L-00137\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Join metadata to mutations\n",
    "\n",
    "Joining metadata to mutation data works exactly like joining other datatypes. Just like any time you are using somatic_mutation data, you can filter multiple mutations with the `mutations_filter` parameter. Here are some examples:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 78 samples for the PTEN gene (<ipython-input-14-176be404a675>, line 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>Histologic_type</th>\n",
       "      <th>PTEN_Mutation</th>\n",
       "      <th>PTEN_Location</th>\n",
       "      <th>PTEN_Mutation_Status</th>\n",
       "      <th>Sample_Status</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>[Missense_Mutation, Nonsense_Mutation]</td>\n",
       "      <td>[p.R130Q, p.R233*]</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00008</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>[Missense_Mutation]</td>\n",
       "      <td>[p.G127R]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00032</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>[Nonsense_Mutation]</td>\n",
       "      <td>[p.W111*]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00084</th>\n",
       "      <td>Carcinosarcoma</td>\n",
       "      <td>[Wildtype_Tumor]</td>\n",
       "      <td>[No_mutation]</td>\n",
       "      <td>Wildtype_Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00090</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>[Missense_Mutation]</td>\n",
       "      <td>[p.R130G]</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Name       Histologic_type                           PTEN_Mutation  \\\n",
       "Patient_ID                                                           \n",
       "C3L-00006     Endometrioid  [Missense_Mutation, Nonsense_Mutation]   \n",
       "C3L-00008     Endometrioid                     [Missense_Mutation]   \n",
       "C3L-00032     Endometrioid                     [Nonsense_Mutation]   \n",
       "C3L-00084   Carcinosarcoma                        [Wildtype_Tumor]   \n",
       "C3L-00090     Endometrioid                     [Missense_Mutation]   \n",
       "\n",
       "Name             PTEN_Location PTEN_Mutation_Status Sample_Status  \n",
       "Patient_ID                                                         \n",
       "C3L-00006   [p.R130Q, p.R233*]    Multiple_mutation         Tumor  \n",
       "C3L-00008            [p.G127R]      Single_mutation         Tumor  \n",
       "C3L-00032            [p.W111*]      Single_mutation         Tumor  \n",
       "C3L-00084        [No_mutation]       Wildtype_Tumor         Tumor  \n",
       "C3L-00090            [p.R130G]      Single_mutation         Tumor  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hist_and_PTEN = en.multi_join(\n",
    "    {\"awg clinical\": 'Histologic_type',\n",
    "    \"awg somatic_mutation\": \"PTEN\"})\n",
    "\n",
    "hist_and_PTEN.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
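    "Here is the same join with multiple mutations filtered, so that each sample has a single mutation value instead of a list (nonsense mutations are prioritized):"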
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "cptac warning: In joining the somatic_mutation table, no mutations were found for the following samples, so they were filled with Wildtype_Tumor or Wildtype_Normal: 78 samples for the PTEN gene (<ipython-input-15-c858f3802047>, line 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Name</th>\n",
       "      <th>Histologic_type</th>\n",
       "      <th>PTEN_Mutation</th>\n",
       "      <th>PTEN_Location</th>\n",
       "      <th>PTEN_Mutation_Status</th>\n",
       "      <th>Sample_Status</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patient_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C3L-00006</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>Nonsense_Mutation</td>\n",
       "      <td>p.R233*</td>\n",
       "      <td>Multiple_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00008</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>Missense_Mutation</td>\n",
       "      <td>p.G127R</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00032</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>Nonsense_Mutation</td>\n",
       "      <td>p.W111*</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00084</th>\n",
       "      <td>Carcinosarcoma</td>\n",
       "      <td>Wildtype_Tumor</td>\n",
       "      <td>No_mutation</td>\n",
       "      <td>Wildtype_Tumor</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C3L-00090</th>\n",
       "      <td>Endometrioid</td>\n",
       "      <td>Missense_Mutation</td>\n",
       "      <td>p.R130G</td>\n",
       "      <td>Single_mutation</td>\n",
       "      <td>Tumor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Name       Histologic_type      PTEN_Mutation PTEN_Location  \\\n",
       "Patient_ID                                                    \n",
       "C3L-00006     Endometrioid  Nonsense_Mutation       p.R233*   \n",
       "C3L-00008     Endometrioid  Missense_Mutation       p.G127R   \n",
       "C3L-00032     Endometrioid  Nonsense_Mutation       p.W111*   \n",
       "C3L-00084   Carcinosarcoma     Wildtype_Tumor   No_mutation   \n",
       "C3L-00090     Endometrioid  Missense_Mutation       p.R130G   \n",
       "\n",
       "Name       PTEN_Mutation_Status Sample_Status  \n",
       "Patient_ID                                     \n",
       "C3L-00006     Multiple_mutation         Tumor  \n",
       "C3L-00008       Single_mutation         Tumor  \n",
       "C3L-00032       Single_mutation         Tumor  \n",
       "C3L-00084        Wildtype_Tumor         Tumor  \n",
       "C3L-00090       Single_mutation         Tumor  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hist_and_PTEN = en.multi_join(\n",
    "    {\"awg clinical\": \"Histologic_type\",\n",
    "    \"awg somatic_mutation\": \"PTEN\"},\n",
    "    mutations_filter=[\"Nonsense_Mutation\"])\n",
    "\n",
    "hist_and_PTEN.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exporting dataframes\n",
    "\n",
    "If you wish to export a dataframe to a file, simply call the dataframe's `to_csv` method, passing the path where you want to save the file and the value separator you want (here, a tab, which produces a TSV file):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "hist_and_PTEN.to_csv(path_or_buf=\"histologic_type_and_PTEN_mutation.tsv\", sep='\\t')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}