{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import glob\n", "import os\n", "import yaml\n", "import re\n", "import copy\n", "from io import StringIO\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fps = ['Bs_S106_L001_R1_001.fastq.gz',\n", " 'Bs_S106_L001_R2_001.fastq.gz',\n", " 'Bs_S106_L002_R1_001.fastq.gz',\n", " 'Bs_S106_L002_R2_001.fastq.gz',\n", " 'Vf_S104_L001_R1_001.fastq.gz',\n", " 'Vf_S104_L001_R2_001.fastq.gz',\n", " 'Vf_S104_L002_R1_001.fastq.gz',\n", " 'Vf_S104_L002_R2_001.fastq.gz']\n", "\n", "exp_df = pd.DataFrame.from_dict({'Extension': {0: 'fastq.gz',\n", " 1: 'fastq.gz',\n", " 2: 'fastq.gz',\n", " 3: 'fastq.gz',\n", " 4: 'fastq.gz',\n", " 5: 'fastq.gz',\n", " 6: 'fastq.gz',\n", " 7: 'fastq.gz'},\n", " 'File': {0: 'Bs_S106_L001_R1_001.fastq.gz',\n", " 1: 'Bs_S106_L001_R2_001.fastq.gz',\n", " 2: 'Bs_S106_L002_R1_001.fastq.gz',\n", " 3: 'Bs_S106_L002_R2_001.fastq.gz',\n", " 4: 'Vf_S104_L001_R1_001.fastq.gz',\n", " 5: 'Vf_S104_L001_R2_001.fastq.gz',\n", " 6: 'Vf_S104_L002_R1_001.fastq.gz',\n", " 7: 'Vf_S104_L002_R2_001.fastq.gz'},\n", " 'Index': {0: 'S106',\n", " 1: 'S106',\n", " 2: 'S106',\n", " 3: 'S106',\n", " 4: 'S104',\n", " 5: 'S104',\n", " 6: 'S104',\n", " 7: 'S104'},\n", " 'Lane': {0: 'L001',\n", " 1: 'L001',\n", " 2: 'L002',\n", " 3: 'L002',\n", " 4: 'L001',\n", " 5: 'L001',\n", " 6: 'L002',\n", " 7: 'L002'},\n", " 'Read': {0: 'R1',\n", " 1: 'R2',\n", " 2: 'R1',\n", " 3: 'R2',\n", " 4: 'R1',\n", " 5: 'R2',\n", " 6: 'R1',\n", " 7: 'R2'},\n", " 'Run': {0: '001',\n", " 1: '001',\n", " 2: '001',\n", " 3: '001',\n", " 4: '001',\n", " 5: '001',\n", " 6: '001',\n", " 7: '001'},\n", " 'Sample': {0: 'Bs',\n", " 1: 'Bs',\n", " 2: 'Bs',\n", " 3: 'Bs',\n", " 4: 'Vf',\n", " 5: 'Vf',\n", " 6: 'Vf',\n", " 7: 'Vf'}})\n", "\n", "def parse_ilm_fps_to_df(fps,\n", " pattern = '^((.+?)_(S\\d+)_(L\\d+)_(R[12])_(\\d+)\\.(.+))$',\n", " pattern_names = ['File','Sample','Index','Lane','Read','Run','Extension']):\n", " \n", " p = re.compile(pattern)\n", "\n", " df = pd.DataFrame(columns = pattern_names)\n", "\n", " for f in fps:\n", " m = p.match(f)\n", "\n", " if m:\n", " df = df.append(dict(zip(pattern_names, m.groups())), ignore_index = True)\n", "\n", " return(df)\n", "\n", "obs_df = parse_ilm_fps_to_df(fps)\n", "pd.util.testing.assert_frame_equal(obs_df.sort_index(axis=1), exp_df.sort_index(axis=1))\n", "\n", "df = exp_df\n", "seq_dir = './example/reads'\n", "exp_dict = {'Bs': {'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',\n", " './example/reads/Bs_S106_L002_R1_001.fastq.gz'],\n", " 'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',\n", " './example/reads/Bs_S106_L002_R2_001.fastq.gz']},\n", " 'Vf': {'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',\n", " './example/reads/Vf_S104_L002_R1_001.fastq.gz'],\n", " 'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',\n", " './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}\n", "\n", "def get_sample_reads_df(df, seq_dir):\n", " sample_reads_dict = {}\n", " \n", " samples = list(set(df['Sample']))\n", " \n", " for s in samples:\n", " f_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R1'),'File'].values))\n", " r_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R2'),'File'].values))\n", "\n", " sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],\n", " 'reverse': 
[os.path.join(seq_dir, x) for x in r_fps]}\n", " \n", " return(sample_reads_dict)\n", "\n", "obs_dict = get_sample_reads_df(df, seq_dir)\n", "assert(exp_dict == obs_dict)\n", "\n", "def get_sample_paths(seq_dir):\n", " fps = os.listdir(seq_dir)\n", " \n", " files_df = parse_ilm_fps_to_df(fps)\n", " sample_reads_dict = get_sample_reads_df(files_df, seq_dir)\n", " \n", " return(sample_reads_dict)\n", "\n", "exp_db_dict = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',\n", " 'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',\n", " './example/reads/Bs_S106_L002_R1_001.fastq.gz'],\n", " 'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',\n", " './example/reads/Bs_S106_L002_R2_001.fastq.gz']},\n", " 'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',\n", " 'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',\n", " './example/reads/Vf_S104_L002_R1_001.fastq.gz'],\n", " 'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',\n", " './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}\n", "\n", "exp_db_dict_update = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',\n", " 'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',\n", " './example/reads/Bs_S106_L002_R1_001.fastq.gz'],\n", " 'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',\n", " './example/reads/Bs_S106_L002_R2_001.fastq.gz']},\n", " 'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',\n", " 'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',\n", " './example/reads/Vf_S104_L002_R1_001.fastq.gz'],\n", " 'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',\n", " './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}\n", "\n", "exp_db_dict_partial = {'Bs': {'filter_db': None,\n", " 'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',\n", " './example/reads/Bs_S106_L002_R1_001.fastq.gz'],\n", " 'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',\n", " './example/reads/Bs_S106_L002_R2_001.fastq.gz']},\n", " 'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',\n", " 'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',\n", " './example/reads/Vf_S104_L002_R1_001.fastq.gz'],\n", " 'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',\n", " './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}\n", "\n", "def add_filter_db(sample_fp_dict, db_fp, samples = None):\n", " \n", " if samples is None:\n", " samples = sample_fp_dict.keys()\n", " \n", " samples_dict = copy.deepcopy(sample_fp_dict)\n", " \n", " for s in samples_dict:\n", " if s in samples:\n", " samples_dict[s]['filter_db'] = db_fp\n", " elif 'filter_db' in samples_dict[s]:\n", " continue\n", " else:\n", " samples_dict[s]['filter_db'] = None\n", " \n", "\n", " \n", " return(samples_dict)\n", "\n", "obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens')\n", "\n", "assert(obs_dict == exp_db_dict)\n", "\n", "obs_dict = add_filter_db(exp_db_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])\n", "\n", "assert(obs_dict == exp_db_dict_update)\n", "\n", "obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])\n", "\n", "assert(obs_dict == exp_db_dict_partial)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Steps: \n", "\n", "1. Get samples\n", " - read in sample sheet\n", " - guess samples from sequence folder\n", "2. 
Get params\n", " - filter db per sample\n", " - samples for binning\n", " - samples for abundance profiling\n", " - tmp dir\n", " - envs\n", " - software\n", " - assembler\n", " - trimmer\n", " - params\n", " * atropos\n", " * maxbin \n", " * humann2\n", " * metaphlan\n", " * kraken\n", " * shogun?\n", "3. format yaml" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 1: get samples\n", "\n", "First, we need to read in a set of samples for analysis. \n", "\n", "There are two options for this: guess the sample names, or read them in from a sample sheet or manifest. \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Option 1: guess sample names from sequence output folder. \n", "\n", "Enter the correct sequencing directory below, and a list of some example filenames should pop up. Make sure these look correct. " ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 219216\r\n", "-rw-r--r--+ 1 jonsanders staff 8797141 Sep 12 16:52 S22205_S104_L001_R1_001.fastq.gz\r\n", "-rw-r--r--+ 1 jonsanders staff 9666850 Sep 12 16:52 S22205_S104_L001_R2_001.fastq.gz\r\n", "-rw-r--r--+ 1 jonsanders staff 8982873 Sep 12 16:52 S22207_S103_L001_R1_001.fastq.gz\r\n", "-rw-r--r--+ 1 jonsanders staff 9939119 Sep 12 16:52 S22207_S103_L001_R2_001.fastq.gz\r\n", "-rw-r--r--+ 1 jonsanders staff 9012384 Sep 12 16:52 S22282_S102_L001_R1_001.fastq.gz\r\n", "-rw-r--r--+ 1 jonsanders staff 9930763 Sep 12 16:52 S22282_S102_L001_R2_001.fastq.gz\r\n", "-rw-r--r--+ 1 jonsanders staff 8901974 Sep 12 16:52 S22400_S101_L001_R1_001.fastq.gz\r\n", "-rw-r--r--+ 1 jonsanders staff 9781744 Sep 12 16:52 S22400_S101_L001_R2_001.fastq.gz\r\n", "-rw-r--r--+ 1 jonsanders staff 8864399 Sep 12 16:52 S22401_S100_L001_R1_001.fastq.gz\r\n" ] } ], "source": [ "seq_dir = './test_data/test_reads/'\n", "!ls -l {seq_dir} | head" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If that looks correct, execute the following cell. 
It should result in a dictionary mapping samples to filepaths.\n", "\n", "To validate, the dictionary is converted to yaml and printed below: " ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "S22205:\n", "  forward:\n", "  - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz\n", "  reverse:\n", "  - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz\n", "S22207:\n", "  forward:\n", "  - ./test_data/test_reads/S22207_S103_L001_R1_001.fastq.gz\n", "  reverse:\n", "  - ./test_data/test_reads/S22207_S103_L001_R2_001.fastq.gz\n", "S22282:\n", "  forward:\n", "  - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz\n", "  reverse:\n", "  - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz\n", "S22400:\n", "  forward:\n", "  - ./test_data/test_reads/S22400_S101_L001_R1_001.fastq.gz\n", "  reverse:\n", "  - ./test_data/test_reads/S22400_S101_L001_R2_001.fastq.gz\n", "S22401:\n", "  forward:\n", "  - ./test_data/test_reads/S22401_S100_L001_R1_001.fastq.gz\n", "  reverse:\n", "  - ./test_data/test_reads/S22401_S100_L001_R2_001.fastq.gz\n", "S22402:\n", "  forward:\n", "  - ./test_data/test_reads/S22402_S105_L001_R1_001.fastq.gz\n", "  reverse:\n", "  - ./test_data/test_reads/S22402_S105_L001_R2_001.fastq.gz\n", "\n" ] } ], "source": [ "sample_paths = get_sample_paths(seq_dir)\n", "\n", "print(yaml.dump(sample_paths, default_flow_style = False))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Option 2: read sample names and prefixes from sample sheet\n", "\n", "Read in samples from the sample sheet used to demultiplex them. \n", "\n", "This is useful if, for example, you have nonstandard file naming, only want to process a subset of sequences in a run, or want to associate other names with the sequences. " ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { 
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LaneSample_IDSample_NameSample_PlateSample_WellI7_Index_IDindexI5_Index_IDindex2Sample_ProjectDescription
01S22205S22205Example Plate 1A1iTru7_101_01ACGTTACCiTru5_01_AACCGACAAExample Projectsample_S22205
11S22282S22282Example Plate 1B1iTru7_101_02CTGTGTTGiTru5_01_BAGTGGCAAExample Projectsample_S22282
\n", "
" ], "text/plain": [ " Lane Sample_ID Sample_Name Sample_Plate Sample_Well I7_Index_ID \\\n", "0 1 S22205 S22205 Example Plate 1 A1 iTru7_101_01 \n", "1 1 S22282 S22282 Example Plate 1 B1 iTru7_101_02 \n", "\n", " index I5_Index_ID index2 Sample_Project Description \n", "0 ACGTTACC iTru5_01_A ACCGACAA Example Project sample_S22205 \n", "1 CTGTGTTG iTru5_01_B AGTGGCAA Example Project sample_S22282 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read in sample sheet\n", "\n", "def read_sample_sheet(f, sep='\\t'):\n", " data = False\n", " data_lines = ''\n", " for line in f:\n", " if data:\n", " if line.startswith('[') or line.strip() == '':\n", " data = False\n", " continue\n", " data_lines += line\n", " elif line.startswith('[Data]'):\n", " data = True\n", " continue\n", " else:\n", " continue\n", " \n", " data_df = pd.read_csv(StringIO(data_lines), sep = '\\t', comment='#')\n", " return(data_df)\n", "\n", "with open('./test_data/example_sample_sheet.txt', 'r') as f:\n", " ss_df = read_sample_sheet(f)\n", "\n", "ss_df.head()\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FileSampleIndexLaneReadRunExtension
0S22205_S104_L001_R1_001.fastq.gzS22205S104L001R1001fastq.gz
1S22205_S104_L001_R2_001.fastq.gzS22205S104L001R2001fastq.gz
2S22207_S103_L001_R1_001.fastq.gzS22207S103L001R1001fastq.gz
3S22207_S103_L001_R2_001.fastq.gzS22207S103L001R2001fastq.gz
4S22282_S102_L001_R1_001.fastq.gzS22282S102L001R1001fastq.gz
5S22282_S102_L001_R2_001.fastq.gzS22282S102L001R2001fastq.gz
6S22400_S101_L001_R1_001.fastq.gzS22400S101L001R1001fastq.gz
7S22400_S101_L001_R2_001.fastq.gzS22400S101L001R2001fastq.gz
8S22401_S100_L001_R1_001.fastq.gzS22401S100L001R1001fastq.gz
9S22401_S100_L001_R2_001.fastq.gzS22401S100L001R2001fastq.gz
10S22402_S105_L001_R1_001.fastq.gzS22402S105L001R1001fastq.gz
11S22402_S105_L001_R2_001.fastq.gzS22402S105L001R2001fastq.gz
\n", "
" ], "text/plain": [ " File Sample Index Lane Read Run Extension\n", "0 S22205_S104_L001_R1_001.fastq.gz S22205 S104 L001 R1 001 fastq.gz\n", "1 S22205_S104_L001_R2_001.fastq.gz S22205 S104 L001 R2 001 fastq.gz\n", "2 S22207_S103_L001_R1_001.fastq.gz S22207 S103 L001 R1 001 fastq.gz\n", "3 S22207_S103_L001_R2_001.fastq.gz S22207 S103 L001 R2 001 fastq.gz\n", "4 S22282_S102_L001_R1_001.fastq.gz S22282 S102 L001 R1 001 fastq.gz\n", "5 S22282_S102_L001_R2_001.fastq.gz S22282 S102 L001 R2 001 fastq.gz\n", "6 S22400_S101_L001_R1_001.fastq.gz S22400 S101 L001 R1 001 fastq.gz\n", "7 S22400_S101_L001_R2_001.fastq.gz S22400 S101 L001 R2 001 fastq.gz\n", "8 S22401_S100_L001_R1_001.fastq.gz S22401 S100 L001 R1 001 fastq.gz\n", "9 S22401_S100_L001_R2_001.fastq.gz S22401 S100 L001 R2 001 fastq.gz\n", "10 S22402_S105_L001_R1_001.fastq.gz S22402 S105 L001 R1 001 fastq.gz\n", "11 S22402_S105_L001_R2_001.fastq.gz S22402 S105 L001 R2 001 fastq.gz" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "seq_dir = './test_data/test_reads/'\n", "\n", "fps = os.listdir(seq_dir)\n", "\n", "files_df = parse_ilm_fps_to_df(fps)\n", "\n", "files_df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sample_S22205:\n", " forward:\n", " - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz\n", " reverse:\n", " - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz\n", "sample_S22282:\n", " forward:\n", " - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz\n", " reverse:\n", " - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz\n", "\n" ] } ], "source": [ "def get_sample_reads_df_from_sample_sheet(ss_df, seq_dir, sample_col='Description', prefix_col='Sample_ID'):\n", " sample_reads_dict = {}\n", " \n", " samples = list(set(ss_df[sample_col]))\n", " \n", " fps = os.listdir(seq_dir)\n", "\n", " files_df = parse_ilm_fps_to_df(fps)\n", " \n", " for s in samples:\n", " # get the subset of sample sheet rows for that sample\n", " sample_rows = ss_df.loc[ss_df[sample_col] == s,]\n", " \n", " f_fps = []\n", " r_fps = []\n", " \n", " # iter across subset table and find file corresponding to each row\n", " for idx, row in sample_rows.iterrows():\n", " lane = 'L{0:03d}'.format(row['Lane']) \n", " \n", " f_fps.extend(files_df.loc[(files_df['Sample'] == row['Sample_ID']) & \n", " (files_df['Lane'] == lane) &\n", " (files_df['Read'] == 'R1'), 'File'].values)\n", " \n", " r_fps.extend(files_df.loc[(files_df['Sample'] == row['Sample_ID']) & \n", " (files_df['Lane'] == lane) &\n", " (files_df['Read'] == 'R2'), 'File'].values)\n", " \n", " f_fps = sorted(f_fps)\n", " r_fps = sorted(r_fps)\n", " \n", " sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],\n", " 'reverse': [os.path.join(seq_dir, x) for x in r_fps]}\n", " \n", " return(sample_reads_dict)\n", "\n", "sample_paths = get_sample_reads_df_from_sample_sheet(ss_df, seq_dir, sample_col='Description', prefix_col='Sample_ID')\n", "print(yaml.dump(sample_paths, default_flow_style = False))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 2: define parameters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " - filter db per sample\n", " - samples for binning\n", " - samples for abundance profiling\n", " - tmp dir\n", " - envs\n", " - software\n", " - assembler\n", " - trimmer\n", " - params\n", " * atropos\n", " * maxbin \n", " * humann2\n", " * metaphlan\n", " * 
kraken\n", " * shogun?" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "config_dict = {}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add filter databases per sample\n", "\n", "These are the filepaths to the databases used for host filtering. \n", "\n", "The method 'add host filter db' adds the provided filepath to the sample dictionary you created above.\n", "\n", "By default, all samples get same database.\n", "\n", "You can also provide a list of specific sample names to update if you want to set different filter dbs for some samples. To do this, just re-execute `add_filter_db` providing a list of samples to update; the existing `filter_db` paths will remain unchanged if they are not in this list. \n", "\n", "Example:\n", "\n", "```\n", "filter_db1 = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens'\n", "filter_db2 = '/home/jgsanders/ref_data/genomes/mouse/mouse'\n", "\n", "samples_dict = add_filter_db(sample_paths, filter_db1)\n", "samples_dict = add_filter_db(samples_dict, filter_db2, samples = ['Vf'])\n", "```\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sample_S22205:\n", " filter_db: ./test_data/phix\n", " forward:\n", " - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz\n", " reverse:\n", " - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz\n", "sample_S22282:\n", " filter_db: ./test_data/phix\n", " forward:\n", " - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz\n", " reverse:\n", " - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz\n", "\n" ] } ], "source": [ "filter_db = './test_data/phix'\n", "\n", "samples_dict = add_filter_db(sample_paths, filter_db)\n", "\n", "print(yaml.dump(samples_dict, default_flow_style = False))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add samples for binning\n", "\n", "These are the samples from which assembled contigs will be binned into putative draft genomes.\n", "\n", "By default, bin all samples." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "binning_samples = list(samples_dict.keys())\n", "\n", "config_dict['binning_samples'] = binning_samples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add samples for abundance profiling\n", "\n", "These are the samples from which abundance information will be used for binning of `binning_samples`\n", "\n", "By default, use all samples." 
] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# This line filters out BLANK* samples from abundance mapping\n", "abundance_samples = [x for x in samples_dict if not x.startswith('BLANK')]\n", "\n", "# This line uses all samples for abundance mapping\n", "abundance_samples = list(samples_dict.keys())\n", "\n", "config_dict['abundance_samples'] = abundance_samples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add trimmer to use\n", "\n", "These are the read trimmers to use for qc.\n", "\n", "Currently, the available options are:\n", "\n", "```\n", "- atropos\n", "- skewer\n", "```" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "config_dict['trimmer'] = 'atropos'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add assemblers to use\n", "\n", "These are the assemblers to use for assembly.\n", "\n", "Currently, the available options are:\n", "\n", "```\n", "- spades\n", "- metaspades\n", "- megahit\n", "```" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "assemblers = ['megahit','metaspades']\n", "\n", "config_dict['assemblers'] = assemblers" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add assembly to use for binning\n", "\n", "This is the assembler to use for binning.\n", "\n", "Currently, the available options are:\n", "\n", "```\n", "- spades\n", "- metaspades\n", "- megahit\n", "```" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "config_dict['mapping_assembler'] = 'metaspades'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add temporary directory\n", "\n", "This is the directory used during execution of rules. Should ideally be local scratch storage. " ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tmp_dir_root = './'\n", "\n", "config_dict['tmp_dir_root'] = tmp_dir_root" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add environments\n", "\n", "These are the conda environments to use for various rules. \n", "\n", "`env` is the primary `snakemake_assemble` environment. Change others as necessary for other rules." ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "envs = {'anvi': 'source activate oecophylla-anvi',\n", " 'humann2': 'source activate oecophylla-humann2',\n", " 'kraken': 'source activate oecophylla-kraken',\n", " 'shogun': 'source activate oecophylla-shogun',\n", " 'qc': 'source activate oecophylla-qc',\n", " 'raw': 'source activate oecophylla-qc'}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### anvio resources\n", "\n", "Sets paths to resources for anvio" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "resources = {'centrifuge_base': '/home/jgsanders/miniconda/envs/anvio2/centrifuge',\n", " 'centrifuge_models': '/home/jgsanders/miniconda/envs/anvio2/centrifuge/b+h+v/b+h+v'}\n", "config_dict['resources'] = resources" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add parameters definitions\n", "\n", "These are parameters passed to the various tools used." 
] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "params = {}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Atropos params\n", "\n", "Sets quality and adapter trimming parameters. \n", "\n", "Some suggested defaults: \n", "Nextera: `-a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT -q 15 --minimum-length 100 --pair-filter any`\n", "KapaHyperPlus: `-a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any`" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [], "source": [ "params['atropos'] = ' -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### maxbin params\n", "\n", "Sets binning parameters. " ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "params['maxbin'] = '-plotmarker'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Kraken params\n", "\n", "Sets parameters for Kraken taxonomic profiling. " ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "params['kraken'] = {'kraken_db': '/home/qiz173/Databases/Kraken/stdb'}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### SHOGUN params\n", "\n", "Sets parameters for Shogun taxonomic profiling. " ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "params['shogun'] = \"--utree_indx /home/qiz173/Databases/SHOGUN/annotated/utree/stdb.ctr\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### MetaPhlAn2 params\n", "\n", "Sets parameters for MetaPhlAn2 taxonomic profilling." ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "params['metaphlan'] = {'metaphlan_dir': '/home/jgsanders/git_sw/metaphlan2'}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### humann2\n", "\n", "Sets various parameters for HUMAnN2 funcitonal profiling. \n", "\n", "`humann2_aa_db`: Amino acid database for translated amino acid search\n", "`humann2_nt_db`: ChocoPhlAn database for nucleotide search\n", "`metaphan_dir`: path to metaphlan2 directory install, should also have the default metaphlan2 db\n", "`norms`: normalizations for which to output tables\n", "`other`: any other HUMAnN2 parameters to use" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "humann2 = {'humann2_aa_db': 'humann2_aa_db: /home/jgsanders/ref_data/humann2/uniref',\n", " 'humann2_nt_db': '/home/jgsanders/ref_data/humann2/chocophlan',\n", " 'metaphlan_dir': '/home/jgsanders/share/metaphlan2',\n", " 'norms': ['cpm','relab'],\n", " 'other': ''}\n", "\n", "params['humann2'] = humann2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### mash\n", "\n", "parameters for mash\n", "\n", "- `other`: mash command-line parameters for sketching\n", "- `refseq_db`: path to refseq db sketch; default /home/jgsanders/ref_data/mash/RefSeqSketchesDefaults.msh\n", "- `depth`: number of reads to make sketch from. forward and reverse reads will be independently subsampled to reach total. Leave blank (`''`) to skip subsampling. 
" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "mash = {'other': '-r -m 2 -k 21 -s 1000',\n", " 'refseq_db': '/home/jgsanders/ref_data/mash/RefSeqSketchesDefaults.msh',\n", " 'depth': '100000'}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 3: format yaml\n", "\n", "Now the entire configuration dictionary can be formatted as a yaml file and saved for use in the pipeline." ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "config_dict['params'] = params\n", "config_dict['samples'] = samples_dict\n", "config_dict['envs'] = envs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Format as yaml and write to filepath:" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [], "source": [ "config_yaml = yaml.dump(config_dict, default_flow_style = False)\n", "\n", "with open('config.yaml', 'w') as f:\n", " f.write(config_yaml)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is the complete config yaml:" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "abundance_samples:\n", "- sample_S22282\n", "- sample_S22205\n", "assemblers:\n", "- megahit\n", "- metaspades\n", "binning_samples:\n", "- sample_S22282\n", "- sample_S22205\n", "envs:\n", " anvi: source activate oecophylla-anvi\n", " humann2: source activate oecophylla-humann2\n", " kraken: source activate oecophylla-kraken\n", " qc: source activate oecophylla-qc\n", " raw: source activate oecophylla-qc\n", " shogun: source activate oecophylla-shogun\n", "mapping_assembler: metaspades\n", "params:\n", " atropos: ' -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT\n", " -q 15 --minimum-length 100 --pair-filter any'\n", " humann2:\n", " humann2_aa_db: 'humann2_aa_db: /home/jgsanders/ref_data/humann2/uniref'\n", " humann2_nt_db: /home/jgsanders/ref_data/humann2/chocophlan\n", " metaphlan_dir: /home/jgsanders/share/metaphlan2\n", " norms:\n", " - cpm\n", " - relab\n", " other: ''\n", " kraken:\n", " kraken_db: /home/qiz173/Databases/Kraken/stdb\n", " maxbin: -plotmarker\n", " metaphlan:\n", " metaphlan_dir: /home/jgsanders/git_sw/metaphlan2\n", " shogun: --utree_indx /home/qiz173/Databases/SHOGUN/annotated/utree/stdb.ctr\n", "resources:\n", " centrifuge_base: /home/jgsanders/miniconda/envs/anvio2/centrifuge\n", " centrifuge_models: /home/jgsanders/miniconda/envs/anvio2/centrifuge/b+h+v/b+h+v\n", "samples:\n", " sample_S22205:\n", " filter_db: ./test_data/phix\n", " forward:\n", " - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz\n", " reverse:\n", " - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz\n", " sample_S22282:\n", " filter_db: ./test_data/phix\n", " forward:\n", " - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz\n", " reverse:\n", " - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz\n", "tmp_dir_root: ./\n", "trimmer: atropos\n", "\n" ] } ], "source": [ "print(config_yaml)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 
null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }