{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Haystack Hotspots Pipeline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from __future__ import division\n", "import os\n", "import sys\n", "import subprocess as sb\n", "import numpy as np\n", "import pandas as pd\n", "import argparse\n", "import multiprocessing\n", "import glob\n", "from haystack_common import determine_path, check_file, check_required_packages, initialize_genome" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create logging file" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "logging.basicConfig(level=logging.INFO,\n", " format='%(levelname)-5s @ %(asctime)s:\\n\\t %(message)s \\n',\n", " datefmt='%a, %d %b %Y %H:%M:%S',\n", " stream=sys.stderr,\n", " filemode=\"w\")\n", "error = logging.critical\n", "warn = logging.warning\n", "debug = logging.debug\n", "info = logging.info\n", "\n", "HAYSTACK_VERSION = \"0.5.0\"\n", "do_not_recompute = None\n", "keep_intermediate_files= None" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create helper functions" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def quantile_normalization(A):\n", " AA = np.zeros_like(A)\n", " I = np.argsort(A, axis=0)\n", " AA[I, np.arange(A.shape[1])] = np.mean(A[I, np.arange(A.shape[1])], axis=1)[:, np.newaxis]\n", " return AA\n", "\n", "def smooth(x, window_len=200):\n", " s = np.r_[x[window_len - 1:0:-1], x, x[-1:-window_len:-1]]\n", " w = np.hanning(window_len)\n", " y = np.convolve(w / w.sum(), s, mode='valid')\n", " return y[int(window_len / 2):-int(window_len / 2) + 1]\n", "\n", "\n", "# write the IGV session file\n", "def rem_base_path(path, base_path):\n", " return path.replace(os.path.join(base_path, ''), '')\n", "\n", "\n", "def find_th_rpm(df_chip, th_rpm):\n", " return np.min(df_chip.apply(lambda x: np.percentile(x, th_rpm)))\n", "\n", "def log2_transform(x):\n", " return np.log2(x + 1)\n", "\n", "def angle_transform(x):\n", " return np.arcsin(np.sqrt(x) / 1000000.0)\n", "\n", "# def normalize_count(feature, scaling_factor):\n", "# feature.name = str(int(feature.name) * scaling_factor)\n", "# return feature\n", "def get_scaling_factor(bam_filename):\n", "\n", " cmd = 'sambamba view -c %s' % bam_filename\n", " # print cmd\n", " proc = sb.Popen(cmd, stdout=sb.PIPE,\n", " shell=True)\n", " (stdout, stderr) = proc.communicate()\n", " # print stdout,stderr\n", " scaling_factor = (1.0 / float(stdout.strip())) * 1000000\n", " # from pysam import AlignmentFile\n", " # infile = AlignmentFile(bam_filename, \"rb\")\n", " # numreads = infile.count(until_eof=True)\n", " # scaling_factor = (1.0 / float(numreads)) * 1000000\n", " return scaling_factor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create parse argument function" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def get_args():\n", " # mandatory\n", " # mandatory\n", " parser = argparse.ArgumentParser(description='HAYSTACK Parameters')\n", " parser.add_argument('samples_filename_or_bam_folder',\n", " type=str,\n", " help='A tab delimited file with in each row (1) a sample name, '\n", " '(2) the path to the corresponding bam or bigwig filename. 
'\n", " 'Alternatively it is possible to specify a folder containing some .bam files to analyze.')\n", " parser.add_argument('genome_name',\n", " type=str,\n", " help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')\n", "\n", " # optional\n", " parser.add_argument('--output_directory',\n", " type=str,\n", " help='Output directory (default: current directory)',\n", " default='')\n", " parser.add_argument('--bin_size',\n", " type=int,\n", " help='bin size to use(default: 500bp)',\n", " default=500)\n", " parser.add_argument('--chrom_exclude',\n", " type=str,\n", " help='Exclude chromosomes that contain given (regex) string. For example _random|chrX|chrY excludes random, X, and Y chromosome regions',\n", " default='_|chrX|chrY')\n", " parser.add_argument('--th_rpm',\n", " type=float,\n", " help='Percentile on the signal intensity to consider for the hotspots (default: 99)',\n", " default=99)\n", " parser.add_argument('--transformation',\n", " type=str,\n", " help='Variance stabilizing transformation among: none, log2, angle (default: angle)',\n", " default='angle',\n", " choices=['angle', 'log2', 'none'])\n", " parser.add_argument('--z_score_high',\n", " type=float,\n", " help='z-score value to select the specific regions(default: 1.5)',\n", " default=1.5)\n", " parser.add_argument('--z_score_low',\n", " type=float,\n", " help='z-score value to select the not specific regions (default: 0.25)',\n", " default=0.25)\n", " parser.add_argument('--read_ext',\n", " type=int,\n", " help='Read extension in bps (default: 200)',\n", " default=200)\n", " parser.add_argument('--max_regions_percentage',\n", " type=float,\n", " help='Upper bound on the %% of the regions selected (default: 0.1, 0.0=0%% 1.0=100%%)',\n", " default=0.1)\n", " parser.add_argument('--name',\n", " help='Define a custom output filename for the report',\n", " default='')\n", " parser.add_argument('--blacklist',\n", " type=str,\n", " help='Exclude blacklisted regions. Blacklisted regions are not excluded by default. '\n", " 'Use hg19 to blacklist regions for the human genome build 19, '\n", " 'otherwise provide the filepath for a bed file with blacklisted regions.',\n", " default='')\n", " parser.add_argument('--depleted',\n", " help='Look for cell type specific regions with depletion of signal instead of enrichment',\n", " action='store_true')\n", " parser.add_argument('--disable_quantile_normalization',\n", " help='Disable quantile normalization (default: False)',\n", " action='store_true')\n", " parser.add_argument('--do_not_recompute',\n", " help='Keep any file previously precalculated',\n", " action='store_true')\n", " parser.add_argument('--do_not_filter_bams',\n", " help='Use BAM files as provided. Do not remove reads that are unmapped, mate unmapped,'\n", " ' not primary aligned or low MAPQ reads, reads failing qc and optical duplicates',\n", " action='store_true')\n", " parser.add_argument('--input_is_bigwig',\n", " help='Use the bigwig format instead of the bam format for the input. '\n", " 'Note: The files must have extension .bw',\n", " action='store_true')\n", " parser.add_argument('--keep_intermediate_files',\n", " help='keep intermediate bedgraph files ',\n", " action='store_true')\n", " parser.add_argument('--n_processes',\n", " type=int,\n", " help='Specify the number of processes to use. 
The default is #cores available.',\n", " default=multiprocessing.cpu_count())\n", " parser.add_argument('--version',\n", " help='Print version and exit.',\n", " action='version',\n", " version='Version %s' % HAYSTACK_VERSION)\n", " return parser" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create pipeline functions" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def get_data_filepaths(samples_filename_or_bam_folder, input_is_bigwig):\n", " # check folder or sample filename\n", " if not os.path.exists(samples_filename_or_bam_folder):\n", " error(\"The file or folder %s doesn't exist. Exiting.\" %\n", " samples_filename_or_bam_folder)\n", " sys.exit(1)\n", " if os.path.isfile(samples_filename_or_bam_folder):\n", " data_filenames = []\n", " sample_names = []\n", " with open( samples_filename_or_bam_folder) as infile:\n", " for line in infile:\n", " if not line.strip():\n", " continue\n", " if line.startswith('#'): # skip optional header line\n", " info('Skipping header/comment line:%s' % line)\n", " continue\n", " fields = line.strip().split()\n", " n_fields = len(fields)\n", " if n_fields == 2:\n", " sample_names.append(fields[0])\n", " data_filenames.append(fields[1])\n", " else:\n", " error('The samples file format is wrong!')\n", " sys.exit(1)\n", " # dir_path = os.path.dirname(os.path.realpath(samples_filename_or_bam_folder))\n", " # data_filenames = [os.path.join(dir_path, filename)\n", " # for filename in data_filenames]\n", " else:\n", " if input_is_bigwig:\n", " extension_to_check = '.bw'\n", " info('Input is set BigWig (.bw)')\n", " else:\n", " extension_to_check = '.bam'\n", " info('Input is set compressed SAM (.bam)')\n", "\n", " data_filenames = glob.glob(os.path.join(samples_filename_or_bam_folder,\n", " '*' + extension_to_check))\n", " if not data_filenames:\n", " error('No bam/bigwig files to analyze in %s. 
Exiting.'\n", "                  % samples_filename_or_bam_folder)\n", "            sys.exit(1)\n", "        sample_names = [os.path.basename(data_filename).replace(extension_to_check, '')\n", "                        for data_filename in data_filenames]\n", "    # check all the files before starting\n", "    info('Checking samples files location...')\n", "    for data_filename in data_filenames:\n", "        check_file(data_filename)\n", "    return sample_names, data_filenames\n", "\n", "\n", "def create_tiled_genome(genome_name,\n", "                        output_directory,\n", "                        chr_len_filename,\n", "                        bin_size,\n", "                        chrom_exclude,\n", "                        blacklist):\n", "\n", "    from re import search\n", "    genome_directory = determine_path('genomes')\n", "    annotations_directory = determine_path('gene_annotations')\n", "\n", "    genome_sorted_bins_file = os.path.join(output_directory, '%s.%dbp.bins.sorted.bed'\n", "                                           % (os.path.basename(genome_name), bin_size))\n", "\n", "    if not (os.path.exists(genome_sorted_bins_file) and do_not_recompute):\n", "\n", "        info('Creating bins of %dbp in %s' % (bin_size, genome_sorted_bins_file))\n", "\n", "        if chrom_exclude:\n", "            chr_len_filtered_filename = os.path.join(output_directory,\n", "                                                     \"%s_chr_lengths_filtered.txt\" % genome_name)\n", "\n", "            with open(chr_len_filtered_filename, 'wb') as f:\n", "                f.writelines(line for line in open(chr_len_filename)\n", "                             if not search(chrom_exclude, line.split()[0]))\n", "        else:\n", "            chr_len_filtered_filename = chr_len_filename\n", "\n", "        cmd = 'bedtools makewindows -g \"%s\" -w %s | bedtools sort -i stdin > \"%s\" ' % (chr_len_filtered_filename,\n", "                                                                                       bin_size,\n", "                                                                                       genome_sorted_bins_file)\n", "\n", "        sb.call(cmd, shell=True)\n", "\n", "    if blacklist == '':\n", "        info('Tiled genome file created will not be blacklisted')\n", "    elif blacklist == 'hg19':\n", "        info('Using hg19 blacklist file %s to filter out the regions' % blacklist)\n", "        blacklist_filepath = os.path.join(annotations_directory,\n", "                                          'hg19_blacklisted_regions.bed')\n", "        check_file(blacklist_filepath)\n", "\n", "        genome_sorted_bins_file_temp = genome_sorted_bins_file.replace('.bed', '.filtered.bed')\n", "\n", "        cmd = 'bedtools intersect -a \"%s\" -b \"%s\" -v > %s ' % (genome_sorted_bins_file,\n", "                                                               blacklist_filepath,\n", "                                                               genome_sorted_bins_file_temp)\n", "        genome_sorted_bins_file = genome_sorted_bins_file_temp\n", "\n", "        sb.call(cmd, shell=True)\n", "\n", "    elif os.path.isfile(blacklist):\n", "        info('Using blacklist file %s to filter out the regions' % blacklist)\n", "        blacklist_filepath = blacklist\n", "        check_file(blacklist_filepath)\n", "\n", "        genome_sorted_bins_file_temp = genome_sorted_bins_file.replace('.bed', '.filtered.bed')\n", "\n", "        cmd = 'bedtools intersect -a \"%s\" -b \"%s\" -v > %s ' % (genome_sorted_bins_file,\n", "                                                               blacklist_filepath,\n", "                                                               genome_sorted_bins_file_temp)\n", "        genome_sorted_bins_file = genome_sorted_bins_file_temp\n", "\n", "        sb.call(cmd, shell=True)\n", "\n", "    else:\n", "        error('Incorrect blacklist option provided. 
'\n", " 'It is neither a file nor a genome')\n", " sys.exit(1)\n", " return genome_sorted_bins_file\n", "\n", "### if bigwig\n", "def copy_bigwigs(data_filenames, sample_names, tracks_directory):\n", " import shutil\n", " bigwig_filenames = [os.path.join(tracks_directory,\n", " '%s.bw' % sample_name)\n", " for sample_name in sample_names]\n", "\n", " for data_filename, bigwig_filename in zip(data_filenames,\n", " bigwig_filenames):\n", " shutil.copy2(data_filename, bigwig_filename)\n", "\n", " return bigwig_filenames\n", "\n", "def to_binned_tracks_if_bigwigs(data_filenames,\n", " intermediate_directory,\n", " binned_sample_names,\n", " genome_sorted_bins_file):\n", "\n", " binned_rpm_filenames = [os.path.join(intermediate_directory,\n", " '%s.rpm' % binned_sample_name)\n", " for binned_sample_name in binned_sample_names]\n", "\n", "\n", " binned_bedgraph_filenames = [os.path.join(intermediate_directory,\n", " '%s.rpm.bedgraph' % binned_sample_name)\n", " for binned_sample_name in binned_sample_names]\n", "\n", " for data_filename, binned_rpm_filename, binned_bedgraph_filename in zip(data_filenames,\n", " binned_rpm_filenames,\n", " binned_bedgraph_filenames):\n", "\n", " if not (os.path.exists(binned_rpm_filename) and do_not_recompute):\n", " info('Processing:%s' % data_filename)\n", "\n", "\n", " awk_command = \"awk \\'{printf(\\\"%s\\\\t%d\\\\t%d\\\\tp_%s\\\\n\\\", $1,$2,$3,NR)}\\'\"\n", "\n", " cmd = awk_command + ' < \"%s\" | bigWigAverageOverBed \"%s\" /dev/stdin /dev/stdout -bedOut=\"%s\" | cut -f5 > \"%s\"' % (genome_sorted_bins_file,\n", " data_filename,\n", " binned_bedgraph_filename,\n", " binned_rpm_filename)\n", "\n", " sb.call(cmd, shell=True)\n", "\n", " if not keep_intermediate_files:\n", " info('Deleting %s' % binned_bedgraph_filename)\n", " try:\n", " os.remove(binned_bedgraph_filename)\n", " except:\n", " pass\n", "\n", " return binned_rpm_filenames\n", "#######\n", "# convert bam files to genome-wide rpm tracks\n", "def to_filtered_deduped_bams(bam_filenames,\n", " output_directory,\n", " n_processes):\n", "\n", " filtered_bam_directory = os.path.join(output_directory, 'FILTERED_BAMS')\n", "\n", " if not os.path.exists(filtered_bam_directory):\n", " os.makedirs(filtered_bam_directory)\n", "\n", " bam_filtered_nodup_filenames = [os.path.join(\n", " filtered_bam_directory,\n", " '%s.filtered.nodup%s' % (os.path.splitext(os.path.basename(bam_filename))))\n", " for bam_filename in bam_filenames]\n", "\n", " for bam_filename, bam_filtered_nodup_filename in zip(bam_filenames,\n", " bam_filtered_nodup_filenames):\n", "\n", " if not (os.path.exists(bam_filtered_nodup_filename) and do_not_recompute):\n", "\n", " info('Processing:%s' % bam_filename)\n", " bam_temp_filename = os.path.join(os.path.dirname(bam_filtered_nodup_filename),\n", " '%s.temp%s' % os.path.splitext(\n", " os.path.basename(bam_filtered_nodup_filename)))\n", " info('Removing unmapped, mate unmapped, not primary alignment,'\n", " ' low MAPQ reads, and reads failing qc')\n", "\n", " cmd = 'sambamba view -f bam -l 0 -t %d -F ' \\\n", " '\"not (unmapped or mate_is_unmapped or' \\\n", " ' failed_quality_control or duplicate' \\\n", " ' or secondary_alignment)and mapping_quality >= 30\"' \\\n", " ' \"%s\" -o \"%s\"' % (n_processes,\n", " bam_filename,\n", " bam_temp_filename)\n", "\n", " # cmd = 'sambamba view -f bam -l 0 -t %d -F \"not failed_quality_control\" \"%s\" -o \"%s\"' % (n_processes,\n", " # bam_filename,\n", " # bam_temp_filename)\n", " sb.call(cmd, shell=True)\n", " info('Removing optical 
duplicates')\n", " cmd = 'sambamba markdup -l 5 -t %d \"%s\" \"%s\" ' % (\n", " n_processes,\n", " bam_temp_filename,\n", " bam_filtered_nodup_filename)\n", " sb.call(cmd, shell=True)\n", "\n", " try:\n", " os.remove(bam_temp_filename)\n", " os.remove(bam_temp_filename + '.bai')\n", " except:\n", " pass\n", "\n", " return bam_filtered_nodup_filenames\n", "\n", "def to_normalized_extended_reads_tracks(bam_filenames,\n", " sample_names,\n", " tracks_directory,\n", " chr_len_filename,\n", " read_ext):\n", "\n", " bedgraph_filenames = [os.path.join(tracks_directory, '%s.bedgraph' % sample_name)\n", " for sample_name in sample_names]\n", " bigwig_filenames = [filename.replace('.bedgraph', '.bw')\n", " for filename in bedgraph_filenames]\n", "\n", " for bam_filename, bedgraph_filename, bigwig_filename in zip(bam_filenames,\n", " bedgraph_filenames,\n", " bigwig_filenames):\n", "\n", " if not (os.path.exists(bedgraph_filename) and do_not_recompute):\n", " info('Processing:%s' % bam_filename)\n", "\n", " info('Computing Scaling Factor...')\n", " scaling_factor = get_scaling_factor(bam_filename)\n", " info('Scaling Factor: %e' % scaling_factor)\n", " info('Converting %s to bed and extending read length...' %bam_filename)\n", " info('Normalizing counts by scaling factor...')\n", "\n", "\n", " cmd = 'sambamba view -f bam \"%s\" | bamToBed | slopBed -r \"%s\" -l 0 -s -i stdin -g \"%s\" | ' \\\n", " 'genomeCoverageBed -g \"%s\" -i stdin -bg -scale %.64f| bedtools sort -i stdin > \"%s\"' % (\n", " bam_filename, read_ext, chr_len_filename, chr_len_filename, scaling_factor, bedgraph_filename)\n", " sb.call(cmd, shell=True)\n", "\n", " # BedTool(bam_filename). \\\n", " # bam_to_bed(). \\\n", " # slop(r=read_ext,\n", " # l=0,\n", " # s=True,\n", " # g=chr_len_filename). 
\\\n", " # genome_coverage(bg=True,\n", " # scale=scaling_factor,\n", " # g=chr_len_filename).\\\n", " # sort().\\\n", " # saveas(bedgraph_filename)\n", "\n", " if not keep_intermediate_files:\n", " info('Deleting %s' % bam_filename)\n", " try:\n", " os.remove(bam_filename)\n", " except:\n", " pass\n", "\n", " if keep_intermediate_files:\n", " if not (os.path.exists(bigwig_filename) and do_not_recompute):\n", " info('Converting BedGraph to BigWig')\n", " cmd = 'bedGraphToBigWig \"%s\" \"%s\" \"%s\"' % (bedgraph_filename,\n", " chr_len_filename,\n", " bigwig_filename)\n", " sb.call(cmd, shell=True)\n", "\n", " info('Converted %s to bigwig' % bedgraph_filename)\n", "\n", " return bedgraph_filenames, bigwig_filenames\n", "\n", "\n", "def to_binned_tracks(bedgraph_filenames,\n", " binned_sample_names,\n", " tracks_directory,\n", " intermediate_directory,\n", " chr_len_filename,\n", " genome_sorted_bins_file):\n", "\n", " bedgraph_binned_filenames = [os.path.join(tracks_directory,\n", " '%s.bedgraph' % binned_sample_name)\n", " for binned_sample_name in binned_sample_names]\n", "\n", " binned_rpm_filenames = [os.path.join(intermediate_directory,\n", " '%s.rpm' % binned_sample_name)\n", " for binned_sample_name in binned_sample_names]\n", "\n", " bigwig_binned_filenames = [filename.replace('.bedgraph', '.bw')\n", " for filename in bedgraph_binned_filenames]\n", "\n", " for bedgraph_filename, bedgraph_binned_filename,\\\n", " binned_rpm_filename, bigwig_binned_filename in zip(bedgraph_filenames,\n", " bedgraph_binned_filenames,\n", " binned_rpm_filenames,\n", " bigwig_binned_filenames):\n", "\n", " if not (os.path.exists(binned_rpm_filename) and do_not_recompute):\n", " info('Making constant binned rpm values file: %s' % binned_rpm_filename)\n", "\n", " cmd = 'bedtools map -a \"%s\" -b \"%s\" -c 4 -o mean -null 0.0 > \"%s\"' % (genome_sorted_bins_file,\n", " bedgraph_filename,\n", " bedgraph_binned_filename)\n", " sb.call(cmd, shell=True)\n", "\n", "\n", " # bedgraph = BedTool(genome_sorted_bins_file). \\\n", " # map(b=bedgraph_filename,\n", " # c=4,\n", " # o='mean',\n", " # null=0.0).\\\n", " # saveas(bedgraph_binned_filename)\n", "\n", " cmd = 'cut -f4 \"%s\" > \"%s\" ' % (bedgraph_binned_filename, binned_rpm_filename)\n", " sb.call(cmd, shell=True)\n", "\n", " # bedgraph.to_dataframe()['name'].\\\n", " # to_csv(binned_rpm_filename,\n", " # index=False)\n", "\n", " # bedgraph = BedTool(genome_sorted_bins_file). \\\n", " # intersect(bed_extended, c=True). \\\n", " # each(normalize_count, scaling_factor). 
\\\n", " # saveas(bedgraph_filename)\n", "\n", " info('Binned Bedgraph saved...')\n", "\n", " if not (os.path.exists(bigwig_binned_filename) and do_not_recompute):\n", " info('Converting BedGraph to BigWig')\n", " cmd = 'bedGraphToBigWig \"%s\" \"%s\" \"%s\"' % (bedgraph_binned_filename,\n", " chr_len_filename,\n", " bigwig_binned_filename)\n", " sb.call(cmd, shell=True)\n", " info('Converted %s to bigwig' % bedgraph_binned_filename)\n", " if not keep_intermediate_files:\n", " info('Deleting %s' % bedgraph_binned_filename)\n", " try:\n", " os.remove(bedgraph_binned_filename)\n", " except:\n", " pass\n", "\n", " return binned_rpm_filenames\n", "\n", "def load_binned_rpm_tracks(binned_sample_names, binned_rpm_filenames):\n", " # load all the tracks\n", " info('Loading the processed tracks')\n", " df_chip = {}\n", " for binned_sample_name, binned_rpm_filename in zip(binned_sample_names,\n", " binned_rpm_filenames):\n", "\n", " df_chip[binned_sample_name] = pd.read_csv(binned_rpm_filename,\n", " squeeze=True,\n", " header=None)\n", "\n", " info('Loading %s from file %s' % (binned_sample_name,\n", " binned_rpm_filename))\n", "\n", " df_chip = pd.DataFrame(df_chip)\n", "\n", " return df_chip\n", "\n", "def to_binned_normalized_tracks(df_chip,\n", " coordinates_bin,\n", " binned_sample_names,\n", " chr_len_filename,\n", " tracks_directory):\n", "\n", " df_chip_normalized = pd.DataFrame(quantile_normalization(df_chip.values),\n", " columns=df_chip.columns,\n", " index=df_chip.index)\n", "\n", " bedgraph_binned_normalized_filenames = [os.path.join(tracks_directory,\n", " '%s_quantile_normalized.bedgraph' % binned_sample_name)\n", " for binned_sample_name in binned_sample_names]\n", " bigwig_binned_normalized_filenames = [filename.replace('.bedgraph', '.bw')\n", " for filename in bedgraph_binned_normalized_filenames]\n", " # write quantile normalized tracks\n", " for binned_sample_name, bedgraph_binned_normalized_filename, bigwig_binned_normalized_filename in zip(\n", " binned_sample_names,\n", " bedgraph_binned_normalized_filenames,\n", " bigwig_binned_normalized_filenames):\n", "\n", " if not (os.path.exists(bedgraph_binned_normalized_filename) and do_not_recompute):\n", " info('Writing binned track: %s' % bigwig_binned_normalized_filename)\n", " joined_df = pd.DataFrame.join(coordinates_bin, df_chip_normalized[binned_sample_name])\n", " joined_df.to_csv(bedgraph_binned_normalized_filename,\n", " sep='\\t',\n", " header=False,\n", " index=False)\n", "\n", " if not (os.path.exists(bigwig_binned_normalized_filename) and do_not_recompute):\n", " cmd = 'bedGraphToBigWig \"%s\" \"%s\" \"%s\"' % (bedgraph_binned_normalized_filename,\n", " chr_len_filename,\n", " bigwig_binned_normalized_filename)\n", " sb.call(cmd, shell=True)\n", " info('Converted %s to bigwig' % bedgraph_binned_normalized_filename)\n", " if not keep_intermediate_files:\n", " info('Deleting %s' % bedgraph_binned_normalized_filename)\n", " try:\n", " os.remove(bedgraph_binned_normalized_filename)\n", " except:\n", " pass\n", "\n", " return df_chip_normalized, bigwig_binned_normalized_filenames\n", "\n", "def find_hpr_coordinates(df_chip,\n", " coordinates_bin,\n", " th_rpm,\n", " transformation,\n", " max_regions_percentage,\n", " output_directory):\n", "\n", " from scipy.stats import zscore\n", " import matplotlib as mpl\n", " mpl.use('Agg')\n", " import pylab as pl\n", "\n", " # th_rpm=args.th_rpm\n", " # transformation=args.transformation\n", " # max_regions_percentage=args.max_regions_percentage\n", " # 
th_rpm=np.min(df_chip.apply(lambda x: np.percentile(x,th_rpm)))\n", "    th_rpm_est = find_th_rpm(df_chip, th_rpm)\n", "    info('Estimated th_rpm:%s' % th_rpm_est)\n", "\n", "    # keep only the bins where at least one sample exceeds the estimated threshold\n", "    df_chip_not_empty = df_chip.loc[(df_chip > th_rpm_est).any(1), :]\n", "\n", "    if transformation == 'log2':\n", "        df_chip_not_empty = df_chip_not_empty.applymap(log2_transform)\n", "        info('Using log2 transformation')\n", "\n", "    elif transformation == 'angle':\n", "        df_chip_not_empty = df_chip_not_empty.applymap(angle_transform)\n", "        info('Using angle transformation')\n", "\n", "    else:\n", "        info('Using no transformation')\n", "\n", "    # index of dispersion (IOD): variance over mean across samples, per bin\n", "    iod_values = df_chip_not_empty.var(1) / df_chip_not_empty.mean(1)\n", "\n", "    # calculate the inflection point a la superenhancers: the cutoff is where the\n", "    # smoothed slope of the ranked score curve is closest to 1, restricted to the\n", "    # top max_regions_percentage fraction of bins\n", "    scores = iod_values\n", "    min_s = np.min(scores)\n", "    max_s = np.max(scores)\n", "\n", "    N_POINTS = len(scores)\n", "    x = np.linspace(0, 1, N_POINTS)\n", "    y = sorted((scores - min_s) / (max_s - min_s))\n", "    m = smooth((np.diff(y) / np.diff(x)), 50)\n", "    m = m - 1\n", "    m[m <= 0] = np.inf\n", "    m[:int(len(m) * (1 - max_regions_percentage))] = np.inf\n", "    idx_th = np.argmin(m) + 1\n", "\n", "    th_iod = sorted(iod_values)[idx_th]\n", "\n", "    hpr_idxs = iod_values > th_iod\n", "\n", "    hpr_iod_scores = iod_values[hpr_idxs]\n", "\n", "    info('Selected %f%% regions (%d)' % (sum(hpr_idxs) / float(len(hpr_idxs)) * 100, sum(hpr_idxs)))\n", "    coordinates_bin['iod'] = iod_values\n", "    # we remove the regions \"without\" signal in any of the cell types\n", "    coordinates_bin.dropna(inplace=True)\n", "\n", "    df_chip_hpr_zscore = df_chip_not_empty.loc[hpr_idxs, :].apply(zscore, axis=1)\n", "\n", "    # plot the selection\n", "    pl.figure()\n", "    pl.title('Selection of the HPRs')\n", "    pl.plot(x, y, 'r', lw=3)\n", "    pl.plot(x[idx_th], y[idx_th], '*', markersize=20)\n", "    x_ext = np.linspace(-0.1, 1.2, N_POINTS)\n", "    y_line = (m[idx_th] + 1.0) * (x_ext - x[idx_th]) + y[idx_th]\n", "    pl.plot(x_ext, y_line, '--k', lw=3)\n", "    pl.xlim(0, 1.1)\n", "    pl.ylim(0, 1)\n", "    pl.xlabel('Fraction of bins')\n", "    pl.ylabel('Score normalized')\n", "    pl.savefig(os.path.join(output_directory,\n", "                            'SELECTION_OF_VARIABILITY_HOTSPOT.pdf'))\n", "    pl.close()\n", "\n", "    return hpr_idxs, coordinates_bin, df_chip_hpr_zscore, hpr_iod_scores\n", "\n", "\n", "def hpr_to_bigwig(coordinates_bin,\n", "                  tracks_directory,\n", "                  chr_len_filename):\n", "\n", "    bedgraph_iod_track_filename = os.path.join(tracks_directory,\n", "                                               'VARIABILITY.bedgraph')\n", "\n", "    bw_iod_track_filename = os.path.join(tracks_directory,\n", "                                         'VARIABILITY.bw')\n", "\n", "    # create a track for IGV\n", "\n", "    if not (os.path.exists(bw_iod_track_filename) and do_not_recompute):\n", "\n", "        info('Generating variability track in bigwig format in:%s'\n", "             % bw_iod_track_filename)\n", "\n", "        coordinates_bin.to_csv(bedgraph_iod_track_filename,\n", "                               sep='\\t',\n", "                               header=False,\n", "                               index=False)\n", "        sb.call('bedGraphToBigWig \"%s\" \"%s\" \"%s\"' % (bedgraph_iod_track_filename,\n", "                                                     chr_len_filename,\n", "                                                     bw_iod_track_filename),\n", "                shell=True)\n", "        try:\n", "            os.remove(bedgraph_iod_track_filename)\n", "        except OSError:\n", "            pass\n", "    return bw_iod_track_filename\n", "\n", "def hpr_to_bedgraph(hpr_idxs,\n", "                    coordinates_bin,\n", "                    tracks_directory,\n", "                    output_directory):\n", "\n", "    bedgraph_hpr_filename = os.path.join(tracks_directory,\n", "                                         'SELECTED_VARIABILITY_HOTSPOT.bedgraph')\n", "\n", "    bed_hpr_filename = os.path.join(output_directory,\n", "                                    'SELECTED_VARIABILITY_HOTSPOT.bed')\n", "\n", "    to_write = coordinates_bin.loc[hpr_idxs[hpr_idxs].index]\n", "    to_write.dropna(inplace=True)\n", "    to_write['bpstart'] = to_write['bpstart'].astype(int)\n", "    to_write['bpend'] = to_write['bpend'].astype(int)\n", "\n", "    to_write.to_csv(bedgraph_hpr_filename,\n", "                    sep='\\t',\n", "                    header=False,\n", "                    index=False)\n", "\n", "    if not (os.path.exists(bed_hpr_filename) and do_not_recompute):\n", "\n", "        info('Writing the HPRs in: \"%s\"' % bed_hpr_filename)\n", "        sb.call('sort -k1,1 -k2,2n \"%s\" |'\n", "                ' bedtools merge -i stdin > \"%s\"' % (bedgraph_hpr_filename,\n", "                                                     bed_hpr_filename),\n", "                shell=True)\n", "\n", "    return bed_hpr_filename\n", "\n", "def write_specific_regions(coordinates_bin,\n", "                           df_chip_hpr_zscore,\n", "                           specific_regions_directory,\n", "                           depleted,\n", "                           z_score_low,\n", "                           z_score_high):\n", "    # write target\n", "    info('Writing Specific Regions for each cell line...')\n", "    coord_zscore = coordinates_bin.copy()\n", "    for col in df_chip_hpr_zscore:\n", "        regions_specific_filename = 'Regions_specific_for_%s_z_%.2f.bedgraph' % (\n", "            os.path.basename(col).replace('.rpm', ''), z_score_high)\n", "        specific_output_filename = os.path.join(specific_regions_directory,\n", "                                                regions_specific_filename)\n", "        specific_output_bed_filename = specific_output_filename.replace('.bedgraph', '.bed')\n", "\n", "        if not (os.path.exists(specific_output_bed_filename) and do_not_recompute):\n", "            if depleted:\n", "                # use the negated threshold in place of mutating z_score_high,\n", "                # so the sign stays stable across the loop iterations\n", "                coord_zscore['z-score'] = df_chip_hpr_zscore.loc[\n", "                    df_chip_hpr_zscore.loc[:, col] < -z_score_high, col]\n", "            else:\n", "                coord_zscore['z-score'] = df_chip_hpr_zscore.loc[\n", "                    df_chip_hpr_zscore.loc[:, col] > z_score_high, col]\n", "            coord_zscore.dropna().to_csv(specific_output_filename,\n", "                                         sep='\\t',\n", "                                         header=False,\n", "                                         index=False)\n", "\n", "            info('Writing:%s' % specific_output_bed_filename)\n", "            sb.call('sort -k1,1 -k2,2n \"%s\" |'\n", "                    ' bedtools merge -i stdin > \"%s\"' % (specific_output_filename,\n", "                                                         specific_output_bed_filename),\n", "                    shell=True)\n", "    # write background\n", "    info('Writing Background Regions for each cell line...')\n", "    coord_zscore = coordinates_bin.copy()\n", "    for col in df_chip_hpr_zscore:\n", "\n", "        bg_output_filename = os.path.join(specific_regions_directory,\n", "                                          'Background_for_%s_z_%.2f.bedgraph' % (\n", "                                              os.path.basename(col).replace('.rpm', ''),\n", "                                              z_score_low))\n", "\n", "        bg_output_bed_filename = bg_output_filename.replace('.bedgraph', '.bed')\n", "\n", "        if not (os.path.exists(bg_output_bed_filename) and do_not_recompute):\n", "\n", "            if depleted:\n", "                coord_zscore['z-score'] = df_chip_hpr_zscore.loc[\n", "                    df_chip_hpr_zscore.loc[:, col] > -z_score_low, col]\n", "            else:\n", "                coord_zscore['z-score'] = df_chip_hpr_zscore.loc[\n", "                    df_chip_hpr_zscore.loc[:, col] < z_score_low, col]\n", "\n", "            coord_zscore.dropna().to_csv(bg_output_filename,\n", "                                         sep='\\t',\n", "                                         header=False,\n", "                                         index=False)\n", "\n", "            info('Writing:%s' % bg_output_bed_filename)\n", "            sb.call(\n", "                'sort -k1,1 -k2,2n \"%s\" |'\n", "                ' bedtools merge -i stdin > \"%s\"' % (bg_output_filename,\n", "                                                     bg_output_bed_filename),\n", "                shell=True)\n", "\n", "\n",
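"# Build the IGV session file (OPEN_ME_WITH_IGV.xml): one Resource/Track pair per\n", "# binned sample track, plus heatmap tracks for the variability (IOD) score, the\n", "# selected hotspots, and the cell-type specific regions. Heatmap color limits\n", "# come from the mean +/- 2 standard deviations of the hotspot IOD scores.\n", "def create_igv_track_file(hpr_iod_scores,\n", "                          bed_hpr_filename,\n", "                          bw_iod_track_filename,\n", "                          genome_name,\n", "                          output_directory,\n", "                          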
binned_sample_names,\n", " sample_names,\n", " disable_quantile_normalization):\n", "\n", " import xml.etree.cElementTree as ET\n", " import glob\n", "\n", " igv_session_filename = os.path.join(output_directory, 'OPEN_ME_WITH_IGV.xml')\n", " info('Creating an IGV session file (.xml) in: %s' % igv_session_filename)\n", "\n", " session = ET.Element(\"Session\")\n", " session.set(\"genome\", genome_name)\n", " session.set(\"hasGeneTrack\", \"true\")\n", " session.set(\"version\", \"7\")\n", " resources = ET.SubElement(session, \"Resources\")\n", " panel = ET.SubElement(session, \"Panel\")\n", "\n", " resource_items = []\n", " track_items = []\n", "\n", " min_h = np.mean(hpr_iod_scores) - 2 * np.std(hpr_iod_scores)\n", " max_h = np.mean(hpr_iod_scores) + 2 * np.std(hpr_iod_scores)\n", " mid_h = np.mean(hpr_iod_scores)\n", " # write the tracks\n", " for binned_sample_name in binned_sample_names:\n", " if disable_quantile_normalization:\n", " track_full_path = os.path.join(output_directory, 'TRACKS', '%s.bw' % binned_sample_name)\n", " else:\n", " track_full_path = os.path.join(output_directory, 'TRACKS',\n", " '%s_quantile_normalized.bw' % binned_sample_name)\n", "\n", " track_filename = rem_base_path(track_full_path, output_directory)\n", "\n", " if os.path.exists(track_full_path):\n", " resource_items.append(ET.SubElement(resources, \"Resource\"))\n", " resource_items[-1].set(\"path\", track_filename)\n", " track_items.append(ET.SubElement(panel, \"Track\"))\n", " track_items[-1].set('color', \"0,0,178\")\n", " track_items[-1].set('id', track_filename)\n", " track_items[-1].set(\"name\", binned_sample_name)\n", "\n", " resource_items.append(ET.SubElement(resources, \"Resource\"))\n", " resource_items[-1].set(\"path\", rem_base_path(bw_iod_track_filename, output_directory))\n", "\n", " track_items.append(ET.SubElement(panel, \"Track\"))\n", " track_items[-1].set('color', \"178,0,0\")\n", " track_items[-1].set('id', rem_base_path(bw_iod_track_filename, output_directory))\n", " track_items[-1].set('renderer', \"HEATMAP\")\n", " track_items[-1].set(\"colorScale\",\n", " \"ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0\" % (\n", " mid_h, min_h, mid_h, max_h))\n", " track_items[-1].set(\"name\", 'VARIABILITY')\n", "\n", " resource_items.append(ET.SubElement(resources, \"Resource\"))\n", " resource_items[-1].set(\"path\", rem_base_path(bed_hpr_filename, output_directory))\n", " track_items.append(ET.SubElement(panel, \"Track\"))\n", " track_items[-1].set('color', \"178,0,0\")\n", " track_items[-1].set('id', rem_base_path(bed_hpr_filename, output_directory))\n", " track_items[-1].set('renderer', \"HEATMAP\")\n", " track_items[-1].set(\"colorScale\",\n", " \"ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0\" % (\n", " mid_h, min_h, mid_h, max_h))\n", " track_items[-1].set(\"name\", 'HOTSPOTS')\n", "\n", " for sample_name in sample_names:\n", " track_full_path = \\\n", " glob.glob(os.path.join(output_directory, 'SPECIFIC_REGIONS',\n", " 'Regions_specific_for_%s*.bedgraph' % sample_name))[0]\n", " specific_track_filename = rem_base_path(track_full_path, output_directory)\n", " if os.path.exists(track_full_path):\n", " resource_items.append(ET.SubElement(resources, \"Resource\"))\n", " resource_items[-1].set(\"path\", specific_track_filename)\n", "\n", " track_items.append(ET.SubElement(panel, \"Track\"))\n", " track_items[-1].set('color', \"178,0,0\")\n", " track_items[-1].set('id', specific_track_filename)\n", " track_items[-1].set('renderer', \"HEATMAP\")\n", " 
track_items[-1].set(\"colorScale\",\n", " \"ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0\" % (\n", " mid_h, min_h, mid_h, max_h))\n", " track_items[-1].set(\"name\", 'REGION SPECIFIC FOR %s' % sample_name)\n", "\n", " tree = ET.ElementTree(session)\n", " tree.write(igv_session_filename, xml_declaration=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Start Pipeline" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "[H A Y S T A C K H O T S P O T]\n", "\n", "-SELECTION OF VARIABLE REGIONS-\n", "\n", "Version 0.5.0\n", "\n" ] } ], "source": [ "print '\\n[H A Y S T A C K H O T S P O T]'\n", "print('\\n-SELECTION OF VARIABLE REGIONS-\\n')\n", "print 'Version %s\\n' % HAYSTACK_VERSION" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 1: Initialize Pipeline" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 16:30:26:\n", "\t {'blacklist': 'hg19', 'samples_filename_or_bam_folder': '/mnt/hd2/TEST_DATASET/samples_names_hotspots.txt', 'disable_quantile_normalization': False, 'n_processes': 8, 'name': '', 'max_regions_percentage': 0.1, 'th_rpm': 99, 'z_score_low': 0.25, 'output_directory': '/mnt/hd2/test_data/OUTPUT_RESULTS', 'read_ext': 200, 'bin_size': 200, 'keep_intermediate_files': False, 'depleted': False, 'genome_name': 'hg19', 'z_score_high': 1.5, 'input_is_bigwig': False, 'do_not_recompute': False, 'transformation': 'angle', 'chrom_exclude': 'chrX|chrY'} \n", "\n" ] } ], "source": [ "# step 1.1\n", "parser = get_args()\n", "input_args=['/mnt/hd2/TEST_DATASET/samples_names_hotspots.txt',\n", " 'hg19',\n", " '--output_directory',\n", " '/mnt/hd2/test_data/OUTPUT_RESULTS',\n", " '--bin_size',\n", " '200',\n", " '--blacklist',\n", " 'hg19']\n", "\n", "args = parser.parse_args(input_args)\n", "info(vars(args))\n", "global do_not_recompute\n", "global keep_intermediate_files\n", "do_not_recompute = args.do_not_recompute\n", "keep_intermediate_files= args.keep_intermediate_files" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# step 1.2\n", "check_required_packages()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# step 1.3: create directories\n", "if args.name:\n", " directory_name = 'HAYSTACK_HOTSPOTS_on_%s' % args.name\n", "else:\n", " directory_name = 'HAYSTACK_HOTSPOTS'\n", "if args.output_directory:\n", " output_directory = os.path.join(args.output_directory,\n", " directory_name)\n", "else:\n", " output_directory = directory_name\n", "if not os.path.exists(output_directory):\n", " os.makedirs(output_directory)\n", "\n", "tracks_directory = os.path.join(output_directory,\n", " 'TRACKS')\n", "if not os.path.exists(tracks_directory):\n", " os.makedirs(tracks_directory)\n", "intermediate_directory = os.path.join(output_directory,\n", " 'INTERMEDIATE')\n", "if not os.path.exists(intermediate_directory):\n", " os.makedirs(intermediate_directory)\n", "specific_regions_directory = os.path.join(output_directory,\n", " 'SPECIFIC_REGIONS')\n", "if not os.path.exists(specific_regions_directory):\n", " os.makedirs(specific_regions_directory)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 16:30:30:\n", "\t 
Skipping header/comment line:#Haystack samples description file\n", " \n", "\n", "INFO @ Wed, 16 Aug 2017 16:30:30:\n", "\t Skipping header/comment line:#NAME\tCHIP_SEQ_BAM_FILENAME\t\n", " \n", "\n", "INFO @ Wed, 16 Aug 2017 16:30:30:\n", "\t Skipping header/comment line:#Note that any line starting with '#' or empty is ignored\n", " \n", "\n", "INFO @ Wed, 16 Aug 2017 16:30:30:\n", "\t Checking samples files location... \n", "\n" ] } ], "source": [ "# step 1.4: get filepaths\n", "sample_names, data_filenames = get_data_filepaths(args.samples_filename_or_bam_folder,\n", " args.input_is_bigwig)\n", "binned_sample_names = ['%s.%dbp' % (sample_name, args.bin_size)\n", " for sample_name in sample_names]" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/mnt/hd2/TEST_DATASET/K562H3k27ac_sorted_rmdup.bam',\n", " '/mnt/hd2/TEST_DATASET/Gm12878H3k27ac_sorted_rmdup.bam',\n", " '/mnt/hd2/TEST_DATASET/Hepg2H3k27ac_sorted_rmdup.bam',\n", " '/mnt/hd2/TEST_DATASET/H1hescH3k27ac_sorted_rmdup.bam',\n", " '/mnt/hd2/TEST_DATASET/HsmmH3k27ac_sorted_rmdup.bam',\n", " '/mnt/hd2/TEST_DATASET/NhlfH3k27ac_sorted_rmdup.bam']" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_filenames" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 16:30:45:\n", "\t Initializing Genome:hg19 \n", "\n", "INFO @ Wed, 16 Aug 2017 16:30:45:\n", "\t genome_directory: /mnt/hd2/Dropbox (Partners HealthCare)/PROJECTS/2017_07_HAYSTACK/haystack/haystack_data/genomes \n", "\n", "INFO @ Wed, 16 Aug 2017 16:30:45:\n", "\t File /mnt/hd2/Dropbox (Partners HealthCare)/PROJECTS/2017_07_HAYSTACK/haystack/haystack_data/genomes/hg19.2bit exists, skipping download \n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/mnt/hd2/Dropbox (Partners HealthCare)/PROJECTS/2017_07_HAYSTACK/haystack/haystack_data\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 16:30:45:\n", "\t File /mnt/hd2/Dropbox (Partners HealthCare)/PROJECTS/2017_07_HAYSTACK/haystack/haystack_data/genomes/hg19_chr_lengths.txt exists, skipping generation \n", "\n", "INFO @ Wed, 16 Aug 2017 16:30:45:\n", "\t File /mnt/hd2/Dropbox (Partners HealthCare)/PROJECTS/2017_07_HAYSTACK/haystack/haystack_data/genomes/hg19_meme_bg exists, skipping generation \n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Genome initializated\n" ] } ], "source": [ "_, chr_len_filename, _= initialize_genome(args.genome_name,answer='')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 2: Create tiled genome" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 16:30:57:\n", "\t Creating bins of 200bp in /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/hg19.200bp.bins.sorted.bed \n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/mnt/hd2/Dropbox (Partners HealthCare)/PROJECTS/2017_07_HAYSTACK/haystack/haystack_data\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 16:31:32:\n", "\t Using hg19 blacklist file hg19 to filter out the regions \n", "\n" ] } ], "source": [ "genome_sorted_bins_file = create_tiled_genome(args.genome_name,\n", " output_directory,\n", " chr_len_filename,\n", " args.bin_size,\n", " args.chrom_exclude,\n", " 
args.blacklist)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 3: Convert files to genome-wide rpm tracks" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 16:35:08:\n", "\t Processing:/mnt/hd2/TEST_DATASET/K562H3k27ac_sorted_rmdup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:35:08:\n", "\t Removing unmapped, mate unmapped, not primary alignment, low MAPQ reads, and reads failing qc \n", "\n", "INFO @ Wed, 16 Aug 2017 16:35:33:\n", "\t Removing optical duplicates \n", "\n", "INFO @ Wed, 16 Aug 2017 16:36:17:\n", "\t Processing:/mnt/hd2/TEST_DATASET/Gm12878H3k27ac_sorted_rmdup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:36:17:\n", "\t Removing unmapped, mate unmapped, not primary alignment, low MAPQ reads, and reads failing qc \n", "\n", "INFO @ Wed, 16 Aug 2017 16:36:40:\n", "\t Removing optical duplicates \n", "\n", "INFO @ Wed, 16 Aug 2017 16:37:18:\n", "\t Processing:/mnt/hd2/TEST_DATASET/Hepg2H3k27ac_sorted_rmdup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:37:18:\n", "\t Removing unmapped, mate unmapped, not primary alignment, low MAPQ reads, and reads failing qc \n", "\n", "INFO @ Wed, 16 Aug 2017 16:37:39:\n", "\t Removing optical duplicates \n", "\n", "INFO @ Wed, 16 Aug 2017 16:38:12:\n", "\t Processing:/mnt/hd2/TEST_DATASET/H1hescH3k27ac_sorted_rmdup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:38:12:\n", "\t Removing unmapped, mate unmapped, not primary alignment, low MAPQ reads, and reads failing qc \n", "\n", "INFO @ Wed, 16 Aug 2017 16:39:01:\n", "\t Removing optical duplicates \n", "\n", "INFO @ Wed, 16 Aug 2017 16:40:45:\n", "\t Processing:/mnt/hd2/TEST_DATASET/HsmmH3k27ac_sorted_rmdup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:40:45:\n", "\t Removing unmapped, mate unmapped, not primary alignment, low MAPQ reads, and reads failing qc \n", "\n", "INFO @ Wed, 16 Aug 2017 16:41:19:\n", "\t Removing optical duplicates \n", "\n", "INFO @ Wed, 16 Aug 2017 16:42:31:\n", "\t Processing:/mnt/hd2/TEST_DATASET/NhlfH3k27ac_sorted_rmdup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:42:31:\n", "\t Removing unmapped, mate unmapped, not primary alignment, low MAPQ reads, and reads failing qc \n", "\n", "INFO @ Wed, 16 Aug 2017 16:43:02:\n", "\t Removing optical duplicates \n", "\n", "INFO @ Wed, 16 Aug 2017 16:43:43:\n", "\t Processing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/K562H3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:43:43:\n", "\t Computing Scaling Factor... \n", "\n", "INFO @ Wed, 16 Aug 2017 16:43:49:\n", "\t Scaling Factor: 6.807407e-02 \n", "\n", "INFO @ Wed, 16 Aug 2017 16:43:49:\n", "\t Converting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/K562H3k27ac_sorted_rmdup.filtered.nodup.bam to bed and extending read length... \n", "\n", "INFO @ Wed, 16 Aug 2017 16:43:49:\n", "\t Normalizing counts by scaling factor... \n", "\n", "INFO @ Wed, 16 Aug 2017 16:49:51:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/K562H3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:49:52:\n", "\t Processing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/Gm12878H3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:49:52:\n", "\t Computing Scaling Factor... 
\n", "\n", "INFO @ Wed, 16 Aug 2017 16:49:57:\n", "\t Scaling Factor: 7.779452e-02 \n", "\n", "INFO @ Wed, 16 Aug 2017 16:49:57:\n", "\t Converting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/Gm12878H3k27ac_sorted_rmdup.filtered.nodup.bam to bed and extending read length... \n", "\n", "INFO @ Wed, 16 Aug 2017 16:49:57:\n", "\t Normalizing counts by scaling factor... \n", "\n", "INFO @ Wed, 16 Aug 2017 16:52:04:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/Gm12878H3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:52:04:\n", "\t Processing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/Hepg2H3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:52:04:\n", "\t Computing Scaling Factor... \n", "\n", "INFO @ Wed, 16 Aug 2017 16:52:11:\n", "\t Scaling Factor: 8.590669e-02 \n", "\n", "INFO @ Wed, 16 Aug 2017 16:52:11:\n", "\t Converting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/Hepg2H3k27ac_sorted_rmdup.filtered.nodup.bam to bed and extending read length... \n", "\n", "INFO @ Wed, 16 Aug 2017 16:52:11:\n", "\t Normalizing counts by scaling factor... \n", "\n", "INFO @ Wed, 16 Aug 2017 16:54:05:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/Hepg2H3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:54:05:\n", "\t Processing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/H1hescH3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 16:54:05:\n", "\t Computing Scaling Factor... \n", "\n", "INFO @ Wed, 16 Aug 2017 16:54:22:\n", "\t Scaling Factor: 3.883596e-02 \n", "\n", "INFO @ Wed, 16 Aug 2017 16:54:22:\n", "\t Converting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/H1hescH3k27ac_sorted_rmdup.filtered.nodup.bam to bed and extending read length... \n", "\n", "INFO @ Wed, 16 Aug 2017 16:54:22:\n", "\t Normalizing counts by scaling factor... \n", "\n", "INFO @ Wed, 16 Aug 2017 17:13:18:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/H1hescH3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 17:13:19:\n", "\t Processing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/HsmmH3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 17:13:19:\n", "\t Computing Scaling Factor... \n", "\n", "INFO @ Wed, 16 Aug 2017 17:13:39:\n", "\t Scaling Factor: 5.674800e-02 \n", "\n", "INFO @ Wed, 16 Aug 2017 17:13:39:\n", "\t Converting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/HsmmH3k27ac_sorted_rmdup.filtered.nodup.bam to bed and extending read length... \n", "\n", "INFO @ Wed, 16 Aug 2017 17:13:39:\n", "\t Normalizing counts by scaling factor... \n", "\n", "INFO @ Wed, 16 Aug 2017 17:21:54:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/HsmmH3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 17:21:55:\n", "\t Processing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/NhlfH3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 17:21:55:\n", "\t Computing Scaling Factor... 
\n", "\n", "INFO @ Wed, 16 Aug 2017 17:22:08:\n", "\t Scaling Factor: 6.976885e-02 \n", "\n", "INFO @ Wed, 16 Aug 2017 17:22:08:\n", "\t Converting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/NhlfH3k27ac_sorted_rmdup.filtered.nodup.bam to bed and extending read length... \n", "\n", "INFO @ Wed, 16 Aug 2017 17:22:08:\n", "\t Normalizing counts by scaling factor... \n", "\n", "INFO @ Wed, 16 Aug 2017 17:24:38:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/FILTERED_BAMS/NhlfH3k27ac_sorted_rmdup.filtered.nodup.bam \n", "\n", "INFO @ Wed, 16 Aug 2017 17:24:38:\n", "\t Making constant binned rpm values file: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/K562.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 17:25:29:\n", "\t Binned Bedgraph saved... \n", "\n", "INFO @ Wed, 16 Aug 2017 17:25:29:\n", "\t Converting BedGraph to BigWig \n", "\n", "INFO @ Wed, 16 Aug 2017 17:25:52:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/K562.200bp.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 17:25:52:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/K562.200bp.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 17:25:52:\n", "\t Making constant binned rpm values file: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/GM12878.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 17:26:29:\n", "\t Binned Bedgraph saved... \n", "\n", "INFO @ Wed, 16 Aug 2017 17:26:29:\n", "\t Converting BedGraph to BigWig \n", "\n", "INFO @ Wed, 16 Aug 2017 17:26:53:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/GM12878.200bp.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 17:26:53:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/GM12878.200bp.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 17:26:53:\n", "\t Making constant binned rpm values file: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/HEPG2.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 17:27:28:\n", "\t Binned Bedgraph saved... \n", "\n", "INFO @ Wed, 16 Aug 2017 17:27:28:\n", "\t Converting BedGraph to BigWig \n", "\n", "INFO @ Wed, 16 Aug 2017 17:27:51:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HEPG2.200bp.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 17:27:51:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HEPG2.200bp.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 17:27:51:\n", "\t Making constant binned rpm values file: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/H1hesc.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 17:28:46:\n", "\t Binned Bedgraph saved... \n", "\n", "INFO @ Wed, 16 Aug 2017 17:28:46:\n", "\t Converting BedGraph to BigWig \n", "\n", "INFO @ Wed, 16 Aug 2017 17:29:06:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/H1hesc.200bp.bedgraph to bigwig \n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 17:29:06:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/H1hesc.200bp.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 17:29:06:\n", "\t Making constant binned rpm values file: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/HSMM.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 17:29:48:\n", "\t Binned Bedgraph saved... 
\n", "\n", "INFO @ Wed, 16 Aug 2017 17:29:48:\n", "\t Converting BedGraph to BigWig \n", "\n", "INFO @ Wed, 16 Aug 2017 17:30:11:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HSMM.200bp.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 17:30:11:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HSMM.200bp.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 17:30:12:\n", "\t Making constant binned rpm values file: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/NHLF.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 17:30:53:\n", "\t Binned Bedgraph saved... \n", "\n", "INFO @ Wed, 16 Aug 2017 17:30:53:\n", "\t Converting BedGraph to BigWig \n", "\n", "INFO @ Wed, 16 Aug 2017 17:31:16:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/NHLF.200bp.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 17:31:16:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/NHLF.200bp.bedgraph \n", "\n" ] } ], "source": [ "if args.input_is_bigwig:\n", " bigwig_filenames = copy_bigwigs(data_filenames,\n", " sample_names,\n", " tracks_directory)\n", " binned_rpm_filenames = to_binned_tracks_if_bigwigs(bigwig_filenames,\n", " intermediate_directory,\n", " binned_sample_names,\n", " genome_sorted_bins_file)\n", "else:\n", " if args.do_not_filter_bams:\n", " bam_filtered_nodup_filenames= data_filenames\n", " else:\n", " bam_filtered_nodup_filenames=to_filtered_deduped_bams(data_filenames,\n", " output_directory,\n", " args.n_processes)\n", "\n", " bedgraph_filenames, bigwig_filenames=\\\n", " to_normalized_extended_reads_tracks(bam_filtered_nodup_filenames,\n", " sample_names,\n", " tracks_directory,\n", " chr_len_filename,\n", " args.read_ext)\n", "\n", " binned_rpm_filenames = to_binned_tracks(bedgraph_filenames,\n", " binned_sample_names,\n", " tracks_directory,\n", " intermediate_directory,\n", " chr_len_filename,\n", " genome_sorted_bins_file)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 4: Create dataframe and perform quantile normalization" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 18:20:02:\n", "\t Loading the processed tracks \n", "\n", "INFO @ Wed, 16 Aug 2017 18:20:03:\n", "\t Loading K562.200bp from file /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/K562.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 18:20:04:\n", "\t Loading GM12878.200bp from file /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/GM12878.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 18:20:05:\n", "\t Loading HEPG2.200bp from file /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/HEPG2.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 18:20:06:\n", "\t Loading H1hesc.200bp from file /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/H1hesc.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 18:20:07:\n", "\t Loading HSMM.200bp from file /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/HSMM.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 18:20:08:\n", "\t Loading NHLF.200bp from file /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/INTERMEDIATE/NHLF.200bp.rpm \n", "\n", "INFO @ Wed, 16 Aug 2017 18:20:11:\n", "\t Normalizing the data... 
\n", "\n", "INFO @ Wed, 16 Aug 2017 18:20:17:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/K562.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:20:54:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/K562.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:20:54:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/K562.200bp_quantile_normalized.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 18:20:54:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/GM12878.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:21:31:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/GM12878.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:21:31:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/GM12878.200bp_quantile_normalized.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 18:21:31:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HEPG2.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:22:09:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HEPG2.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:22:09:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HEPG2.200bp_quantile_normalized.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 18:22:09:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/H1hesc.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:22:43:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/H1hesc.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:22:43:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/H1hesc.200bp_quantile_normalized.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 18:22:44:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HSMM.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:23:19:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HSMM.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:23:19:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HSMM.200bp_quantile_normalized.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 18:23:19:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/NHLF.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:23:55:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/NHLF.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:23:55:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/NHLF.200bp_quantile_normalized.bedgraph \n", "\n" ] } ], "source": [ "df_chip = load_binned_rpm_tracks(binned_sample_names,\n", " binned_rpm_filenames)\n", "\n", "coordinates_bin = pd.read_csv(genome_sorted_bins_file,\n", " names=['chr_id',\n", " 'bpstart',\n", " 'bpend'],\n", " sep='\\t',\n", " header=None,\n", " usecols=[0, 1, 2])\n", "\n", "if not args.disable_quantile_normalization:\n", " info('Normalizing the data...')\n", "\n", " df_chip, bigwig_binned_normalized_filenames = \\\n", " to_binned_normalized_tracks(df_chip,\n", " 
coordinates_bin,\n", "                                              binned_sample_names,\n", "                                              chr_len_filename,\n", "                                              tracks_directory)" ] },
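{ "cell_type": "markdown", "metadata": {}, "source": [ "Aside: quantile normalization makes every sample share the same signal distribution by replacing each sample's ranked values with the average ranked profile. A minimal sketch on a made-up matrix, mirroring the rank-then-average idea behind the `quantile_normalization` helper defined above (illustrative only, not part of the pipeline):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative only: rank-then-average quantile normalization on toy data.\n", "import numpy as np\n", "\n", "A = np.array([[5., 4., 3.],\n", "              [2., 1., 4.],\n", "              [3., 4., 6.],\n", "              [4., 2., 8.]])\n", "\n", "AA = np.zeros_like(A)\n", "I = np.argsort(A, axis=0)  # per-column ranks\n", "# replace each column's sorted values with the mean sorted profile\n", "AA[I, np.arange(A.shape[1])] = np.mean(A[I, np.arange(A.shape[1])], axis=1)[:, np.newaxis]\n", "print(AA)  # every column now contains the same set of values" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "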
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GM12878.200bpH1hesc.200bpHEPG2.200bpHSMM.200bpK562.200bpNHLF.200bp
100000.0726610.4185580.4794850.2570870.1159710.365177
100010.0726610.1349160.1112280.1673630.1874330.506813
100020.1199960.0737390.1364920.1332980.1442260.354775
100030.1029080.1258660.1364920.2044320.0849890.225098
100040.1029080.0745100.0735850.2139140.1944430.321748
100050.1199960.0915200.1112280.0804370.1442260.258250
100060.1199960.0421410.1112280.0739550.0583430.167291
100070.0726610.0421410.1364920.0097090.0583430.058343
100080.0726610.1114590.1613160.0739550.2445270.185959
100090.0726610.2758130.3307250.1673630.3989870.315885
\n", "
" ], "text/plain": [ " GM12878.200bp H1hesc.200bp HEPG2.200bp HSMM.200bp K562.200bp \\\n", "10000 0.072661 0.418558 0.479485 0.257087 0.115971 \n", "10001 0.072661 0.134916 0.111228 0.167363 0.187433 \n", "10002 0.119996 0.073739 0.136492 0.133298 0.144226 \n", "10003 0.102908 0.125866 0.136492 0.204432 0.084989 \n", "10004 0.102908 0.074510 0.073585 0.213914 0.194443 \n", "10005 0.119996 0.091520 0.111228 0.080437 0.144226 \n", "10006 0.119996 0.042141 0.111228 0.073955 0.058343 \n", "10007 0.072661 0.042141 0.136492 0.009709 0.058343 \n", "10008 0.072661 0.111459 0.161316 0.073955 0.244527 \n", "10009 0.072661 0.275813 0.330725 0.167363 0.398987 \n", "\n", " NHLF.200bp \n", "10000 0.365177 \n", "10001 0.506813 \n", "10002 0.354775 \n", "10003 0.225098 \n", "10004 0.321748 \n", "10005 0.258250 \n", "10006 0.167291 \n", "10007 0.058343 \n", "10008 0.185959 \n", "10009 0.315885 " ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_chip[10000:10010]" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chr_idbpstartbpend
0chr11000010200
1chr11020010400
2chr11160011800
3chr11180012000
4chr11200012200
\n", "
" ], "text/plain": [ " chr_id bpstart bpend\n", "0 chr1 10000 10200\n", "1 chr1 10200 10400\n", "2 chr1 11600 11800\n", "3 chr1 11800 12000\n", "4 chr1 12000 12200" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coordinates_bin.head()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 18:23:55:\n", "\t Normalizing the data... \n", "\n", "INFO @ Wed, 16 Aug 2017 18:24:01:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/K562.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:24:36:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/K562.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:24:36:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/K562.200bp_quantile_normalized.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 18:24:38:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/GM12878.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:25:14:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/GM12878.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:25:14:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/GM12878.200bp_quantile_normalized.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 18:25:14:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HEPG2.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:25:52:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HEPG2.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:25:52:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HEPG2.200bp_quantile_normalized.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 18:25:52:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/H1hesc.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:26:25:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/H1hesc.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:26:25:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/H1hesc.200bp_quantile_normalized.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 18:26:25:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HSMM.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:01:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HSMM.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:01:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/HSMM.200bp_quantile_normalized.bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:01:\n", "\t Writing binned track: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/NHLF.200bp_quantile_normalized.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:38:\n", "\t Converted /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/NHLF.200bp_quantile_normalized.bedgraph to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:38:\n", "\t Deleting /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/NHLF.200bp_quantile_normalized.bedgraph \n", "\n" ] } ], "source": 
[ "if not args.disable_quantile_normalization:\n", " info('Normalizing the data...')\n", "\n", " df_chip, bigwig_binned_normalized_filenames = to_binned_normalized_tracks(df_chip,\n", " coordinates_bin,\n", " binned_sample_names,\n", " chr_len_filename,\n", " tracks_directory)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GM12878.200bpH1hesc.200bpHEPG2.200bpHSMM.200bpK562.200bpNHLF.200bp
100000.0726610.4185580.4794850.2570870.1159710.365177
100010.0726610.1349160.1112280.1673630.1874330.506813
100020.1199960.0737390.1364920.1332980.1442260.354775
100030.1029080.1258660.1364920.2044320.0849890.225098
100040.1029080.0745100.0735850.2139140.1944430.321748
100050.1199960.0915200.1112280.0804370.1442260.258250
100060.1199960.0421410.1112280.0739550.0583430.167291
100070.0726610.0421410.1364920.0097090.0583430.058343
100080.0726610.1114590.1613160.0739550.2445270.185959
100090.0726610.2758130.3307250.1673630.3989870.315885
\n", "
" ], "text/plain": [ " GM12878.200bp H1hesc.200bp HEPG2.200bp HSMM.200bp K562.200bp \\\n", "10000 0.072661 0.418558 0.479485 0.257087 0.115971 \n", "10001 0.072661 0.134916 0.111228 0.167363 0.187433 \n", "10002 0.119996 0.073739 0.136492 0.133298 0.144226 \n", "10003 0.102908 0.125866 0.136492 0.204432 0.084989 \n", "10004 0.102908 0.074510 0.073585 0.213914 0.194443 \n", "10005 0.119996 0.091520 0.111228 0.080437 0.144226 \n", "10006 0.119996 0.042141 0.111228 0.073955 0.058343 \n", "10007 0.072661 0.042141 0.136492 0.009709 0.058343 \n", "10008 0.072661 0.111459 0.161316 0.073955 0.244527 \n", "10009 0.072661 0.275813 0.330725 0.167363 0.398987 \n", "\n", " NHLF.200bp \n", "10000 0.365177 \n", "10001 0.506813 \n", "10002 0.354775 \n", "10003 0.225098 \n", "10004 0.321748 \n", "10005 0.258250 \n", "10006 0.167291 \n", "10007 0.058343 \n", "10008 0.185959 \n", "10009 0.315885 " ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_chip[10000:10010]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 5: Determine HP regions" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 18:27:38:\n", "\t Determine High Plastic Regions (HPR) \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:39:\n", "\t Estimated th_rpm:1.24605886357 \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:46:\n", "\t Using angle transformation \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:48:\n", "\t Selected 9.001698% regions (44150) \n", "\n" ] } ], "source": [ "info('Determine High Plastic Regions (HPR)')\n", "hpr_idxs, coordinates_bin, df_chip_hpr_zscore, hpr_iod_scores =\\\n", " find_hpr_coordinates(df_chip,\n", " coordinates_bin,\n", " args.th_rpm,\n", " args.transformation,\n", " args.max_regions_percentage,\n", " output_directory)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GM12878.200bpH1hesc.200bpHEPG2.200bpHSMM.200bpK562.200bpNHLF.200bp
4371-0.497677-0.444716-0.357257-0.4948862.233648-0.439112
4446-0.504576-0.4111532.234961-0.411709-0.453761-0.453761
4447-0.534564-0.4242472.232858-0.427933-0.361750-0.484365
5147-0.980266-0.165562-0.8583191.215587-0.7207071.509266
5869-0.2570622.175382-0.488582-0.719592-0.044464-0.665682
\n", "
" ], "text/plain": [ " GM12878.200bp H1hesc.200bp HEPG2.200bp HSMM.200bp K562.200bp \\\n", "4371 -0.497677 -0.444716 -0.357257 -0.494886 2.233648 \n", "4446 -0.504576 -0.411153 2.234961 -0.411709 -0.453761 \n", "4447 -0.534564 -0.424247 2.232858 -0.427933 -0.361750 \n", "5147 -0.980266 -0.165562 -0.858319 1.215587 -0.720707 \n", "5869 -0.257062 2.175382 -0.488582 -0.719592 -0.044464 \n", "\n", " NHLF.200bp \n", "4371 -0.439112 \n", "4446 -0.453761 \n", "4447 -0.484365 \n", "5147 1.509266 \n", "5869 -0.665682 " ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_chip_hpr_zscore.head()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 18:27:55:\n", "\t HPR to bigwig \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:55:\n", "\t Generating variability track in bigwig format in:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/TRACKS/VARIABILITY.bw \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:56:\n", "\t HPR to bedgraph \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:56:\n", "\t Writing the HPRs in: \"/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SELECTED_VARIABILITY_HOTSPOT.bed\" \n", "\n" ] } ], "source": [ "info('HPR to bigwig')\n", "bw_iod_track_filename = hpr_to_bigwig(coordinates_bin,\n", " tracks_directory,\n", " chr_len_filename)\n", "info('HPR to bedgraph')\n", "bed_hpr_filename = hpr_to_bedgraph(hpr_idxs,\n", " coordinates_bin,\n", " tracks_directory,\n", " output_directory)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO @ Wed, 16 Aug 2017 18:27:56:\n", "\t Save files \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:56:\n", "\t Writing Specific Regions for each cell line... \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:56:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Regions_specific_for_GM12878.200bp_z_1.50.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:56:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Regions_specific_for_H1hesc.200bp_z_1.50.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:57:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Regions_specific_for_HEPG2.200bp_z_1.50.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:57:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Regions_specific_for_HSMM.200bp_z_1.50.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:57:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Regions_specific_for_K562.200bp_z_1.50.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:57:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Regions_specific_for_NHLF.200bp_z_1.50.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:57:\n", "\t Writing Background Regions for each cell line... 
\n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:57:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Background_for_GM12878.200bp_z_0.25.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:57:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Background_for_H1hesc.200bp_z_0.25.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:57:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Background_for_HEPG2.200bp_z_0.25.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:58:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Background_for_HSMM.200bp_z_0.25.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:58:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Background_for_K562.200bp_z_0.25.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:58:\n", "\t Writing:/mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/SPECIFIC_REGIONS/Background_for_NHLF.200bp_z_0.25.bed \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:58:\n", "\t Creating an IGV session file (.xml) in: /mnt/hd2/test_data/OUTPUT_RESULTS/HAYSTACK_HOTSPOTS/OPEN_ME_WITH_IGV.xml \n", "\n", "INFO @ Wed, 16 Aug 2017 18:27:58:\n", "\t Workflow completed! \n", "\n" ] } ], "source": [ "info('Save files')\n", "write_specific_regions(coordinates_bin,\n", " df_chip_hpr_zscore,\n", " specific_regions_directory,\n", " args.depleted,\n", " args.z_score_low,\n", " args.z_score_high)\n", "create_igv_track_file(hpr_iod_scores,\n", " bed_hpr_filename,\n", " bw_iod_track_filename,\n", " args.genome_name,\n", " output_directory,\n", " binned_sample_names,\n", " sample_names,\n", " args.disable_quantile_normalization)\n", "\n", "info('Workflow completed!')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.4" } }, "nbformat": 4, "nbformat_minor": 1 }