{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# This is a notebook where I show you what data is available for analysis, roughly how it is formatted, and how to make a few plots\n", "## It is meant to be a \"jump start\" for your own exploration of the data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Resources\n", "### Figure80 article\n", "LINK TO BE ADDED\n", "### Paper\n", "https://science.sciencemag.org/content/366/6464/490\n", "### Preprint \n", "https://www.biorxiv.org/content/10.1101/675314v1\n", "### Github and archived Zenodo of the pipeline to get from raw reads to the processed data available here\n", "https://github.com/mjohnson11/TnSeq_Pipeline and https://zenodo.org/record/3402230#.Xc3brpNKhTY\n", "(this notebook is essentially a simplified version of the one in these repositories used to make the paper figures)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Requirements if using this locally:\n", "I think just a normal anaconda distribution of python should be fine for everything here, which you probably already have: https://www.anaconda.com/distribution/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Importing various useful libraries / setting up plotting" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from matplotlib import pyplot as pl\n", "import seaborn as sns\n", "sns.set_style(\"white\")\n", "sns.set_style(\"ticks\")\n", "colors = ['#FFB000', '#648FFF']\n", "%matplotlib notebook" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Reading data:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:26: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n" ] } ], "source": [ "## READING DATA\n", "def change_well_format(w):\n", " if '_' in w:\n", " plate = int(w[1:3])\n", " t = 'LK' + str(plate) + '-'\n", " n = int(w.split('_')[1])\n", " lets = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']\n", " l = lets[int(np.floor((n-1)/12))]\n", " return t + l + str(((n-1) % 12) + 1).zfill(2)\n", " else:\n", " return w\n", "\n", "def get_geno_matrix(seg_names):\n", " # Data from https://www.nature.com/articles/nature11867, can also be dowloaded here http://genomics-pubs.princeton.edu/YeastCross_BYxRM/\n", " d = pd.read_csv('files/BYxRM_GenoData.csv') \n", " map_genos = {'B': 0, 'R': 1}\n", " for w in d.keys():\n", " if change_well_format(w) in seg_names:\n", " d[change_well_format(w)] = d[w].map(map_genos)\n", " assert len([s for s in seg_names if s in d.columns]) == len(seg_names)\n", " return d[['marker'] + seg_names]\n", "\n", "\n", "# Reading information of segregant fitness in our focal environment (from Jerison et al. 2017)\n", "x_info = pd.read_csv('files/Clones_For_Tn96_Experiment.csv')\n", "seg_to_fit = {i[0]: i[1] for i in x_info.as_matrix(['segregant', 'initial fitness, YPD 30C'])}\n", "# Reading data files containing fitness effect information from the small library experiment (few (~100) mutations in many genetic backgrounds)\n", "tp_all = pd.read_csv('files/TP_data_by_edge.csv')\n", "# This excludes the neutral controls, a few mutations that were unintentionally included in this library, and a few controls that didn't end up getting good enough coverage to really analyze\n", "tp = tp_all.loc[tp_all['Type']=='Experiment']\n", "# Reading same thing for large library experiment (many mutations in a few genetic backgrounds)\n", "bt = pd.read_csv('files/BT_data_by_edge.csv')\n", "# Reading aggregate data on DFE (distribution of fitness effect) data statistics for each segregant\n", "tp_dfe = pd.read_csv('files/TP_DFE_statistics.csv')\n", "bt_dfe = pd.read_csv('files/BT_DFE_statistics.csv')\n", "# For bioinformatic reasons, the BT (large library) data has a longer edge sequence, changing so the two exps are comparable\n", "bt['Long.Edge'] = bt['Edge']\n", "bt['Edge'] = bt['Long.Edge'].str[:15]\n", "# Making a few dictionaries that point from the names I am used to to the dataframes\n", "dats = {'BT': bt, 'TP': tp, 'BT.DFE': bt_dfe, 'TP.DFE': tp_dfe}\n", "exps = {'BT': 'E1', 'TP': 'E2'}\n", "# Getting a list of segregants in each experiment by looking for columns like segregant.mean.s in the dataframe\n", "segs_all = {exp: [i.split('.')[0] for i in dats[exp] if '.mean.s' in i] for exp in exps}\n", "# Getting genotype information on these segregants using data from Bloom et al. 2013\n", "gm = get_geno_matrix(segs_all['TP'])\n", "# Making restricted lists of segregants that have at least 50 mutations with s measured for DFE comparisons\n", "segs_use = {exp: [s for s in segs_all[exp] if len(dats[exp].loc[pd.notnull(dats[exp][s + '.mean.s'])])>=50] for exp in exps}\n", "# Making some \n", "sorted_segs = {exp: sorted(segs_use[exp], key=lambda x: seg_to_fit[x]) for exp in exps}\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Let's just take a look at all this data, to see what's what\n", "for tp and bt, each row is a mutation\n", "for tp_dfe and bt_dfe, each row is a DFE statistic (or background fitness, which we can treat similarly)\n", "for gm, each row is a genotyped allele that is different between RM and BY (and takes each state in ~half the segregants)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Edge | \n", "Type | \n", "Edge.ID | \n", "Gene.Use | \n", "Gene_ORF | \n", "Gene_ORF.nearby | \n", "briefDescription | \n", "briefDescription.nearby | \n", "chromosome | \n", "description | \n", "... | \n", "full_model_p | \n", "full_model_aic | \n", "full_model_r2_95_conf_low | \n", "full_model_r2_95_conf_high | \n", "full_model_p_values | \n", "full_model_params | \n", "full_model_coeffs | \n", "qtls | \n", "resid.qtls | \n", "full.model.qtls | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "TGATCATCACGGGAC | \n", "Experiment | \n", "888 | \n", "in FLC2 | \n", "FLC2 | \n", "FLC2 | \n", "Putative calcium channel involved in calcium r... | \n", "Putative calcium channel involved in calcium r... | \n", "chr01 | \n", "Putative calcium channel involved in calcium r... | \n", "... | \n", "8.990535e-25 | \n", "-1049.391200 | \n", "0.484313 | \n", "0.664302 | \n", "3.3295688484889364e-32;1.8707009014831425e-05;... | \n", "x;locus_9649106_chr14_401893_C_T;locus_9717516... | \n", "-0.009667588479105496;-0.06737728192983725;0.3... | \n", "chr14_401893;chr14_374661;chr14_402771 | \n", "chr14_470303;chr14_420065;chr14_492376 | \n", "chr14_401893;chr14_374661;chr14_402771|chr14_4... | \n", "
1 | \n", "TCGAAAGCACAGTAG | \n", "Experiment | \n", "377 | \n", "nearby SRB2 | \n", "NaN | \n", "SRB2 | \n", "NaN | \n", "Subunit of the RNA polymerase II mediator complex | \n", "chr08 | \n", "NaN | \n", "... | \n", "5.640931e-01 | \n", "-83.796365 | \n", "0.000082 | \n", "0.396975 | \n", "0.00020013790535836575;0.5640930907526316 | \n", "x | \n", "-0.049382849134784586;-0.08592702839231847 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
2 | \n", "GTTGAACTGGTTGTT | \n", "Experiment | \n", "197 | \n", "in STM1 | \n", "STM1 | \n", "STM1 | \n", "Protein required for optimal translation under... | \n", "Protein required for optimal translation under... | \n", "chr12 | \n", "Protein required for optimal translation under... | \n", "... | \n", "2.500225e-07 | \n", "-803.596564 | \n", "0.103969 | \n", "0.347860 | \n", "1.4054000123711754e-50;0.9145877127045838;4.10... | \n", "x;locus_9680363_chr14_433150_G_A | \n", "-0.028546289780384954;-0.0024826738390553073;0... | \n", "chr14_433150;chr14_420065;chr14_485550 | \n", "chr14_433150;chr14_420065;chr14_492376 | \n", "chr14_433150;chr14_420065;chr14_485550 | \n", "
3 rows × 1696 columns
\n", "\n", " | Edge.ID | \n", "Edge | \n", "Gene.Use | \n", "Gene_ORF | \n", "Gene_ORF.nearby | \n", "briefDescription | \n", "briefDescription.nearby | \n", "chromosome | \n", "description | \n", "description.nearby | \n", "... | \n", "full_model_r2 | \n", "full_model_p | \n", "full_model_aic | \n", "full_model_p_values | \n", "full_model_params | \n", "full_model_coeffs | \n", "qtls | \n", "resid.qtls | \n", "full.model.qtls | \n", "Long.Edge | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "TCTCCAAGGGATACT | \n", "in YMR085W | \n", "YMR085W | \n", "YMR085W | \n", "Putative protein of unknown function | \n", "Putative protein of unknown function | \n", "chr13 | \n", "Putative protein of unknown function; YMR085W ... | \n", "Putative protein of unknown function; YMR085W ... | \n", "... | \n", "0.010223 | \n", "0.781072 | \n", "-80.547553 | \n", "0.26814250806982043;0.7810722126668491 | \n", "x | \n", "0.001490528225059142;0.006953626038516742 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "TCTCCAAGGGATACTTAACGTTATTCCTTT | \n", "
1 | \n", "1 | \n", "TGTGTCGATTTAGTG | \n", "in RKM3 | \n", "RKM3 | \n", "RKM3 | \n", "Ribosomal lysine methyltransferase | \n", "Ribosomal lysine methyltransferase | \n", "chr02 | \n", "Ribosomal lysine methyltransferase; specific f... | \n", "Ribosomal lysine methyltransferase; specific f... | \n", "... | \n", "0.030625 | \n", "0.567432 | \n", "-114.446981 | \n", "0.15944797128735522;0.5674322326468317 | \n", "x | \n", "0.001162120166529229;-0.009006462611979207 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "TGTGTCGATTTAGTGTTAAAGAATGACGTC | \n", "
2 | \n", "2 | \n", "TATGGTGCAGAAAAG | \n", "nearby YNL143C | \n", "NaN | \n", "MEP2|YNL143C | \n", "NaN | \n", "Ammonium permease involved in regulation of ps... | \n", "chr14 | \n", "NaN | \n", "Ammonium permease involved in regulation of ps... | \n", "... | \n", "0.144110 | \n", "0.132863 | \n", "-171.801874 | \n", "0.10689612722798757;0.13286265303302433 | \n", "x | \n", "0.0006108064927804045;0.012645873311052722 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "TATGGTGCAGAAAAGTGGCTCGGAATGAAC | \n", "
3 rows × 246 columns
\n", "\n", " | DFE.statistic | \n", "LK4-G06 | \n", "LK3-G08 | \n", "LK2-G07 | \n", "LK4-H03 | \n", "LK4-D11 | \n", "LK3-E06 | \n", "LK1-D12 | \n", "LK6-C05 | \n", "LK4-C09 | \n", "... | \n", "full_resid_seg_model_r2 | \n", "full_resid_seg_model_p | \n", "full_resid_seg_model_r2_95_conf_low | \n", "full_resid_seg_model_r2_95_conf_high | \n", "full_resid_seg_model_p_values | \n", "full_resid_seg_model_params | \n", "full_resid_seg_model_coeffs | \n", "qtls | \n", "resid.qtls | \n", "full.model.qtls | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "background.fitness | \n", "0.017536 | \n", "0.093806 | \n", "-0.019195 | \n", "0.022961 | \n", "-0.060829 | \n", "0.032745 | \n", "-0.057404 | \n", "-0.059662 | \n", "-0.087799 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "chr14_381891;chr14_368185;chr14_393024|chr15_1... | \n", "NaN | \n", "chr14_381891;chr14_368185;chr14_393024|chr15_1... | \n", "
1 | \n", "mean | \n", "-0.021606 | \n", "-0.038078 | \n", "-0.022692 | \n", "-0.033220 | \n", "-0.011229 | \n", "-0.026932 | \n", "-0.010239 | \n", "-0.010303 | \n", "-0.016691 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "chr14_381891;chr14_371959;chr14_393024|chr04_4... | \n", "chr14_459667;chr14_414983;chr14_481897 | \n", "chr14_381891;chr14_371959;chr14_393024|chr04_4... | \n", "
2 | \n", "median | \n", "-0.017210 | \n", "-0.022665 | \n", "-0.012889 | \n", "-0.025198 | \n", "-0.012115 | \n", "-0.016404 | \n", "-0.010974 | \n", "-0.009748 | \n", "-0.010328 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "chr14_376315;chr14_368185;chr14_393024 | \n", "chr14_433150;chr14_414983;chr14_485550 | \n", "chr14_376315;chr14_368185;chr14_393024|chr14_4... | \n", "
3 rows × 213 columns
\n", "\n", " | DFE.statistic | \n", "LK4-A04 | \n", "LK3-D08 | \n", "LK3-G02 | \n", "LK2-F11 | \n", "LK1-E09 | \n", "LK4-E02 | \n", "LK1-C09 | \n", "LK2-A12 | \n", "LK4-H11 | \n", "... | \n", "full_resid_seg_model_r2 | \n", "full_resid_seg_model_p | \n", "full_resid_seg_model_r2_95_conf_low | \n", "full_resid_seg_model_r2_95_conf_high | \n", "full_resid_seg_model_p_values | \n", "full_resid_seg_model_params | \n", "full_resid_seg_model_coeffs | \n", "qtls | \n", "resid.qtls | \n", "full.model.qtls | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "background.fitness | \n", "0.036559 | \n", "0.023625 | \n", "-0.021251 | \n", "0.035349 | \n", "0.001306 | \n", "-0.021838 | \n", "0.097835 | \n", "-0.029612 | \n", "-0.043350 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
1 | \n", "mean | \n", "-0.005015 | \n", "-0.005416 | \n", "-0.003760 | \n", "-0.004865 | \n", "-0.002509 | \n", "-0.003774 | \n", "-0.003754 | \n", "-0.002583 | \n", "-0.003060 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
2 | \n", "median | \n", "0.001086 | \n", "0.000173 | \n", "0.000602 | \n", "0.000504 | \n", "0.000646 | \n", "0.000713 | \n", "0.000439 | \n", "0.000787 | \n", "0.000182 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
3 rows × 85 columns
\n", "\n", " | marker | \n", "LK4-G06 | \n", "LK3-G08 | \n", "LK2-G07 | \n", "LK4-A04 | \n", "LK4-H03 | \n", "LK4-D11 | \n", "LK3-E06 | \n", "LK1-D10 | \n", "LK1-D12 | \n", "... | \n", "LK2-D09 | \n", "LK1-D06 | \n", "LK3-A01 | \n", "LK2-D05 | \n", "LK1-C11 | \n", "LK1-G03 | \n", "LK3-D04 | \n", "LK3-C06 | \n", "LK2-A01 | \n", "LK2-G10 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "27915_chr01_27915_T_C | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "... | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
1 | \n", "28323_chr01_28323_G_A | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "... | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
2 | \n", "28652_chr01_28652_G_T | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "... | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
3 rows × 163 columns
\n", "