#!/usr/bin/env perl # These are safe to load. Core modules distributed with Perl itself. use strict; use warnings; use Getopt::Long; use File::Temp; use File::Basename; use Cwd qw (abs_path cwd); use Term::ANSIColor qw(:constants); use Data::Dumper; Getopt::Long::Configure('prefix=--'); my ($LASTCHANGEDBY) = q$LastChangedBy: konganti $ =~ m/.+?\:(.+)/; my ($LASTCHANGEDDATE) = q$LastChangedDate: 2019-10-04 14:52:27 -0500 (Fri, 04 October 2019) $ =~ m/.+?\:(.+)/; my ($VERSION) = q$LastChangedRevision: 1.3.1 $ =~ m/.+?\:\s*(.*)\s*.*/; my $AUTHORFULLNAME = 'Kranti Konganti'; my ($help, $quiet, $setup, $get_uq_sc_opts, $fetch_sc_opts, $categorize_sc_opts, $cuffcmp_opts, $deps, $out, $start_cpc, $start_rnafold, $rm_int_plots, $start_infernal, $num_cpu, $local_lib, $inf_cov, $skip_cpc_core, $skip_rnafold_core, $skip_cmscan_core, $lncRNApipe_ver_info, $no_update_check, $skip_get_uq, $lncRNApipe_succ, $num_cpu_by_2, $skip_cpc, $debug, $sp_list, $pipe_config, $conf_run_id, $setup_compiler, $blast_strand, $cpanm_args, $donot_wait4jobIDs); my $kill_jobs_thru_conf = my $show_lncrna_counts = []; my $is_valid_option = GetOptions('help:s' => \$help, 'quiet' => \$quiet, 'setup:s' => \$setup, 'run=s' => \$out, 'cuffcompare|cuffcmp=s' => \$cuffcmp_opts, 'cat-ncRNAs=s' => \$categorize_sc_opts, 'fetch-seq:s' => \$fetch_sc_opts, 'get-uq-feat=s' => \$get_uq_sc_opts, 'cpc' => \$start_cpc, 'cpc-strand-reverse' => \$blast_strand, 'rnafold:s' => \$start_rnafold, 'rm-int-plots' => \$rm_int_plots, 'infernal:s' => \$start_infernal, 'cpu=i' => \$num_cpu, 'skip-cpc-core' => \$skip_cpc_core, 'skip-rnafold-core' => \$skip_rnafold_core, 'skip-cmscan-core' => \$skip_cmscan_core, 'coverage-infernal|cov-inf=f' => \$inf_cov, 'local-lib=s' => \$local_lib, 'version' => \$lncRNApipe_ver_info, 'no-update-chk|noup|d' => \$no_update_check, 'skip-get|skip-get-uq' => \$skip_get_uq, 'skip-cpc' => \$skip_cpc, 'send-run-report=s' => \$debug, 'list=s' => \$sp_list, 'configuration=s' => \$pipe_config, 'kill-conf-jobs=s@' => \$kill_jobs_thru_conf, 'setup-compiler=s' => \$setup_compiler, 'dont-wait-4-job-ids' => \$donot_wait4jobIDs, 'show-counts=s@' => \$show_lncrna_counts); # Initialize known defaults my $CAT_SC = 'categorize_ncRNAs.pl'; my $FETCH_SC = 'fetch_seq_from_ucsc.pl'; my $GET_UQ_SC = 'get_unique_features.pl'; my $CPC_SH_SC = 'CPC.sh'; my $ANN_INF_SC = 'annotate_final_ncRNAs.sh'; my $RELPLOT_MOD_SC = 'relplot_mod.pl'; my $SEND_MAIL_SC = 'send_lncRNApipe_log.pl'; my $GEN_INFERNAL_HITS_SC = "gen_infernalHits_gtf.pl"; my $HGCONF = 'hg.conf.sh'; my $USER_HOME = dirname(abs_path($0)); $inf_cov = 10 if (!defined $inf_cov); $skip_get_uq = 1 if (defined $skip_get_uq || (!defined $get_uq_sc_opts && defined $fetch_sc_opts)); if (defined $num_cpu) { $num_cpu_by_2 = sprintf("%.0f", $num_cpu / 2); } else { $num_cpu_by_2 = 1; } if (!$is_valid_option) { die "\nInvalid / No Option(s) Provided:$! See $0 --help for valid options.\n\n"; } elsif (defined $lncRNApipe_ver_info) { my $io = run_lncRNApipe('version'); $io->this_script_info('lncRNApipe', $VERSION, $AUTHORFULLNAME, $LASTCHANGEDBY, $LASTCHANGEDDATE, 'gnu'); exit 0; } elsif (defined $setup) { if (defined $setup_compiler) { die "\nCompiler argument should be in the format COMPILER_VAR_NAME=COMPILER and your\n" . 
"compiler should be in your UNIX \$PATH.\n\nEx: --setup-compiler CC=g++\n\n" if ($setup_compiler !~ m/^\w+=.+/); $cpanm_args = "--configure-args $setup_compiler"; } else { $cpanm_args = ''; } setup_lncRNApipe(`pwd`) } elsif (defined $out || (defined $help && $help eq '')) { run_lncRNApipe(); } elsif (defined $debug) { my $depconf_fh = read_conf('/.lncRNApipe.depconf'); my $pl_dep_paths_fh = read_conf('/.lncRNApipe.PERLLIBS'); my $pl_inc_string = source_env($pl_dep_paths_fh); get_deps($depconf_fh); system("perl $pl_inc_string $deps->{'email'} $debug"); } elsif (defined $help && $help ne '') { my $depconf_fh = read_conf('/.lncRNApipe.depconf'); my $pl_dep_paths_fh = read_conf('/.lncRNApipe.PERLLIBS'); my $pl_inc_string = source_env($pl_dep_paths_fh); get_deps($depconf_fh); system("$deps->{'cuffcompare'} -h") if ($help =~ m/cuff|cuffcompare/i); system("perl $pl_inc_string $deps->{'cat'} -h") if ($help =~ m/cat|cat-ncRNAs/i); system("perl $pl_inc_string $deps->{'get'} -h") if ($help =~ m/get|get-uq-feat/i); system("perl $pl_inc_string $deps->{'fetch'} -h") if ($help =~ m/fetch|fetch-seq/i); system("$deps->{'RNAfold'} --detailed-help") if ($help =~ m/rnafold|rna|fold/i); system("$deps->{'cmscan'} -h") if ($help =~m /infernal|inf/i); print "\nSee http://cpc.cbi.pku.edu.cn/docs/install_guide.jsp for help documentation.\n\n" if ($help =~ m/cpc/i); close $depconf_fh; close $pl_dep_paths_fh; } elsif (defined $sp_list) { my $io = run_lncRNApipe('version'); list($io); exit 0; } elsif (defined $show_lncrna_counts && scalar @$show_lncrna_counts > 0) { my $io = run_lncRNApipe('version'); foreach my $tr (@$show_lncrna_counts) { print $tr, "\n"; disp_ncRNA_counts($io, $tr, 'Total identified'); } } elsif (defined $pipe_config) { my $io = run_lncRNApipe('version'); $io->error('Cannot find the configuration file [ ' . $pipe_config . ' ] or the file is empty!') if (!-s $pipe_config || !-e $pipe_config); # Where were our modules installed. my $depconf_fh = read_conf('/.lncRNApipe.depconf'); my $pl_dep_paths_fh = read_conf('/.lncRNApipe.PERLLIBS'); my $pl_inc_string = source_env($pl_dep_paths_fh); get_deps($depconf_fh); # Will fail if not installed properly. require YAML; YAML->import( qw( LoadFile ) ); my $lncRNApipe_config = LoadFile($pipe_config); #$io->c_time("Read parameters: \n\n" . Data::Dumper->Dump([$lncRNApipe_config], [qw(lncRNApipe_config)])); # Now run lncRNApipe from config file run_lncRNApipe($lncRNApipe_config); } else { ################################################################################################ # # # Check for updates first, unless disabled as some serious bugs may have been fixed # # ================================================================================= # # # ################################################################################################ print "\nHELP!\n--------\n\n\t$0 --h cuff\n\t$0 --h cat-ncRNAs\n\t$0 --h get-uq-feat\n\t$0 --h fetch\n\t$0 --h cpc\n\t$0 --h rnafold\n\t$0 --h inf\n\n"; check_for_updates(); print "\n$0 called with no command line options. 
See $0 --help for options.\n\n"; } ################################################################################################ # # # Run the pipeline # # ================ # # # ################################################################################################ sub run_lncRNApipe { my $shall_I_run = shift; my $outdir; my $pl_dep_paths_fh = read_conf('/.lncRNApipe.PERLLIBS'); my $pl_inc_string = source_env($pl_dep_paths_fh); $shall_I_run = '' if (!defined $shall_I_run); # We will quit, if the setup was unsuccessful. require IO::Routine; IO::Routine->import(); my $io = IO::Routine->new($help, $quiet); return $io if ($shall_I_run eq 'version'); my $s_time = $io->start_timer; binmode(STDOUT, ":utf8"); $io->verify_options([$out, $is_valid_option]) if (defined $help && $help eq ''); # Validate options here after help documentation is printed $io->c_time('Validating options...'); $io->c_time("Starting \x{2632}\x{2634} lncRNApipe Pipeline..."); my $dep_tools_fh = $io->open_file('<', $USER_HOME . '/.lncRNApipe.depconf'); get_deps($dep_tools_fh); # Check blast strand options for CPC. if (defined $blast_strand) { #$io->error("The binary blastall takes only 1, 2, or 3 for -S argument, wherein\n" . # "1 means search the plus strand of the query sequence\n" . # "2 means search the minus strand of the query sequence\n" . # "3 means earch both strands of the query sequence\n\n" . # "You entered: $blast_strand !"); $blast_strand = '-r'; } elsif (!defined $blast_strand) { #$io->warning("Will run CPC using blastall [ with default option: -S 1 ]." . # "\nUse --blast-strand option with lncRNApipe to change this.", 'INFO!!'); $blast_strand = ''; } # User has requested to run the pipeline from assembly stage. Handle it here. run_lncRNApipe_from_conf($io, $shall_I_run, $s_time) if (ref($shall_I_run) eq 'HASH'); if (!defined $out) { $outdir = $io->validate_create_path($out, 'do not create', 'lncRNApipe pipeline output directory'); } else { $outdir = $io->validate_create_path($out, 'create', 'lncRNApipe pipeline output directory') } $io->exist_sys_cmd(['mkdir']); my $cuffcmp_dir = $outdir . 'cuffcompare'; # Run cuffcompare ################################################################################ do { module_header($io, 'Module 1: Running cuffcompare...'); $io->execute_system_command("mkdir -p $cuffcmp_dir", 'Making output directory for cuffcompare [ ' . $cuffcmp_dir . ' ]') if (!-d $cuffcmp_dir); # Check if -i is provided with cuffcompare and adjust accordingly. (my $cuffcmp_opts_trs_in_file) = ($cuffcmp_opts =~ m/-i\s+([^\s]*)/); if ($cuffcmp_opts_trs_in_file) { my $i_fh = $io->open_file('<', $cuffcmp_opts_trs_in_file); my $i_files = do {local $/; <$i_fh>}; my $i_file_str = join(' ', split(/\n|\r/, $i_files)); $cuffcmp_opts =~ s/-i\s+[^\s]*/$i_file_str/; } $io->execute_system_command($deps->{'cuffcompare'} . " -T -o $cuffcmp_dir/lncRNApipe_cuffcmp " . join('-', split(/\-/, $cuffcmp_opts)), "Command call:\n" . "-------------\n" . $deps->{'cuffcompare'} . " -T -o $cuffcmp_dir/lncRNApipe_cuffcmp " . join('-', split(/\-/, $cuffcmp_opts))); } if (defined($cuffcmp_opts) && $cuffcmp_opts ne ''); # Find out if local FASTA mentioned with cuffcompare before running fetch_seq_from_ucsc.pl ######## my $local_ref_seq = ''; # Use local reference FASTA instead of fetching, if provided with cuffcompare # This needs to be outside do {}, to guarantee parsing, even if -fetch-seq does not have any options. 
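# For example (illustrative values only), a run such as:
#   perl lncRNApipe --run out/ --cuffcompare '-r annot.gtf -s genome.fa transcripts.gtf' --cat-ncRNAs '...' --fetch-seq ''
# makes the regex below pick up genome.fa from cuffcompare's -s option, and, provided that
# file exists and is non-empty, it is reused as "-local genome.fa" for fetch_seq_from_ucsc.pl
# so sequences are pulled from the same reference cuffcompare saw.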
($local_ref_seq) = ($cuffcmp_opts =~ m/-s\s+([^\s]*)/) if ($cuffcmp_opts); if ($local_ref_seq && -s $local_ref_seq && -e $local_ref_seq) { $io->warning('Using local FASTA reference [ ' . $io->file_basename($local_ref_seq, 'suffix') . ' ] to fetch sequences to maintain consistency.' . "\n** Requires Bio::SeqIO module to be installed and available **", 'INFO!!'); $fetch_sc_opts = "-local $local_ref_seq"; } elsif( ($local_ref_seq eq '' || !-s $local_ref_seq || !-e $local_ref_seq) && (defined $fetch_sc_opts && $fetch_sc_opts eq '')) { $io->error("We need to know where to fetch FASTA sequences from [ -local or -db or -ncbi ] ??\n\nSee $0 --h fetch for options."); } # Run categorize_ncRNAs.pl ######################################################################## do { module_header($io, 'Module 2: Running categorize_ncRNAs.pl'); my $is_failed = ''; my $cat_dir = $outdir . 'categorize_ncRNAs'; my $cat_class_zero = $cat_dir . '/.cat.extractPat.zero'; if (!-e "$cuffcmp_dir/lncRNApipe_cuffcmp.tracking" || !-s "$cuffcmp_dir/lncRNApipe_cuffcmp.tracking") { print "\nCannot find Cuffcompare tracking [ $cuffcmp_dir/lncRNApipe_cuffcmp.tracking ] file...\n"; $lncRNApipe_succ = pipeline_status($io, 'ERROR'); } $io->execute_system_command("mkdir -p $cat_dir", 'Making output directory for ' . $io->file_basename($deps->{'cat'}, 'suffix') . ' [ ' . $cat_dir . ' ]') if (!-d $cat_dir); if (defined $cuffcmp_opts && $cuffcmp_opts ne '') { $lncRNApipe_succ = pipeline_status($io, 'ERROR!', "Required options not mentioned for -cat module.\nSee '" . "$0 --h cat" . "' for required options.") if ($categorize_sc_opts !~ m/sample/); my ($tr_files, $tr_size) = get_num_tr_files($cuffcmp_opts); my $annot_for_cat = ''; if ($categorize_sc_opts !~ m/-ann.*?\s+.+/) { my $annot = get_annot($cuffcmp_opts); $io->c_time('Converting annotation file supplied with cuffcompare to gene prediction format [ GTF -> genePred ]'); $annot_for_cat = '-annot ' . $cat_dir . '/' . $io->file_basename($annot) . '.txt'; my $gtfToGenePred_failed = $is_failed = $io->execute_get_sys_cmd_output($deps->{'bin-gtfToGenePred'} . ' -genePredExt -geneNameAsName2 ' . $annot . ' ' . $cat_dir . '/' . $io->file_basename($annot) . '.txt', "Command call:\n" . "-------------\n" . $deps->{'bin-gtfToGenePred'} . ' -genePredExt -geneNameAsName2 ' . $annot . ' ' . $cat_dir . '/' . $io->file_basename($annot) . '.txt'); $gtfToGenePred_failed =~ s/[\s\r\n]+//g; if ($gtfToGenePred_failed =~ m/exonFramesfieldisbeingadded/i) { $io->warning('Trying without -genePredExt ...', 'INFO!!'); $gtfToGenePred_failed = $is_failed = $io->execute_get_sys_cmd_output($deps->{'bin-gtfToGenePred'} . ' -geneNameAsName2 ' . $annot . ' ' . $cat_dir . '/' . $io->file_basename($annot) . '.txt', "Command call:\n" . "-------------\n" . $deps->{'bin-gtfToGenePred'} . ' -geneNameAsName2 ' . $annot . ' ' . $cat_dir . '/' . $io->file_basename($annot) . '.txt'); } $io->error('gtfToGenePred quit with the following error:' . "\n\n$is_failed\nBailing out!"), $lncRNApipe_succ = 0 if ($gtfToGenePred_failed !~ m/couldnotcapture|noversioninformation.+?gtfToGenePred\)$/i); } $categorize_sc_opts .= " -cpu $num_cpu" if (defined $num_cpu); $is_failed = $io->execute_get_sys_cmd_output('perl' . $pl_inc_string . $deps->{'cat'} . ' ' . join('-', split(/\-|\--/, $categorize_sc_opts)) . " -cuffcmp $cuffcmp_dir/lncRNApipe_cuffcmp.tracking -out $cat_dir " . "-bin $deps->{'bin-gtfToGenePred'} " . $annot_for_cat . ' ' . $tr_files, "Command call:\n" . "-------------\n" . $deps->{'cat'} . ' ' . 
join('-', split(/\-|\--/, $categorize_sc_opts)) . " -cuffcmp $cuffcmp_dir/lncRNApipe_cuffcmp.tracking -out $cat_dir " . "-bin $deps->{'bin-gtfToGenePred'} " . $annot_for_cat . ' ' . $tr_files); } else { $lncRNApipe_succ = pipeline_status($io, 'ERROR!', "Required options not mentioned for -cat module.\nSee '" . "$0 --h cat" . "' for required options.") if ($categorize_sc_opts !~ m/cuff|annot|out|sample/); $deps->{'cat'} .= " -cpu $num_cpu" if (defined $num_cpu); $is_failed = $io->execute_get_sys_cmd_output('perl' . $pl_inc_string . $deps->{'cat'} . ' ' . "-cuffcmp $cuffcmp_dir/lncRNApipe_cuffcmp.tracking -out $cat_dir " . "-bin $deps->{'bin-gtfToGenePred'} " . join('-', split(/\-|\--/, $categorize_sc_opts)), "Command call:\n" . "-------------\n" . $deps->{'cat'} . ' ' . "-cuffcmp $cuffcmp_dir/lncRNApipe_cuffcmp.tracking -out $cat_dir " . "-bin $deps->{'bin-gtfToGenePred'} " . join('-', split(/\-|\--/, $categorize_sc_opts))); } $lncRNApipe_succ = pipeline_status($io, $is_failed); # Rare case exit code. When categorize_ncRNAs.pl could not find cuffcompare class codes requested by user or pipeline. if (-e $cat_class_zero) { $io->warning("We could not find any cuffcompare class codes as requested [ in $cuffcmp_dir/lncRNApipe_cuffcmp.tracking ] ...", 'INFO!!'); $lncRNApipe_succ = 1; undef $fetch_sc_opts; undef $start_cpc; undef $start_rnafold; undef $start_infernal; undef $skip_cpc; } } if (defined($categorize_sc_opts) && $categorize_sc_opts ne ''); # Run get_unique_features.pl ###################################################################### my $known_or_unique_suffix_keyword = 'unique'; my $novel_or_known_suffix_keyword = 'novel'; do { my $cpu = do_parallel($num_cpu) if (defined $num_cpu); $lncRNApipe_succ = get_pipeline_status_code($cpu) if (defined $num_cpu); module_header($io, 'Module 3: Running get_unique_features.pl'); my $is_failed = ''; my $cat_dir = $outdir . 'get_unique_features'; $io->execute_system_command("mkdir -p $cat_dir", 'Making output directory for ' . $io->file_basename($deps->{'get'}, 'suffix') . ' [ ' . $cat_dir . ' ]') if (!-d $cat_dir); opendir (lncRNApipe_categorize_mod, $outdir . 'categorize_ncRNAs') || $io->error("Cannot open directory $outdir" . 'categorize_ncRNAs to read lncRNApipe class files.'); push my @lncRNApipe_class_files, grep {/putative\.class\.lncRNAs\.gtf$/} readdir lncRNApipe_categorize_mod; if ($#lncRNApipe_class_files < 0) { $lncRNApipe_succ = pipeline_status($io, 'ERROR', "Cannot find output in $cat_dir ..."); } $lncRNApipe_succ = pipeline_status($io, 'ERROR!', "Required options not mentioned for -get module.\nSee '" . "$0 --h get" . "' for required options.") if ($get_uq_sc_opts !~ m/sf|cf/); if ($get_uq_sc_opts =~ m/-known/i) { $known_or_unique_suffix_keyword = $novel_or_known_suffix_keyword = 'known'; } foreach my $lncRNApipe_class_file (@lncRNApipe_class_files) { $cpu->start and next if (defined $num_cpu); $lncRNApipe_class_file = $outdir . 'categorize_ncRNAs/' . $lncRNApipe_class_file; my $unique_ncRNAs = $cat_dir . '/' . $io->file_basename($lncRNApipe_class_file) . ".$known_or_unique_suffix_keyword.gtf"; $unique_ncRNAs = $cat_dir . '/' . $io->file_basename($lncRNApipe_class_file) . ".$known_or_unique_suffix_keyword.gtf"; unlink $unique_ncRNAs if (-e $unique_ncRNAs); my $sff_opt = ''; if ($get_uq_sc_opts !~ m/(-sff|--source-file-format)/) { $io->c_time("Assuming supplied known ncRNAs to be in BED format...\n" . 'You can change this option with -sff. 
Ex: -sff gtf'); $sff_opt = '-sff bed'; } $is_failed = $io->execute_get_sys_cmd_output('perl' . $pl_inc_string . $deps->{'get'} . ' ' . '-q -cf ' . $lncRNApipe_class_file . " -cff gtf $sff_opt ". join('-', split(/\-|\--/, $get_uq_sc_opts)) . ' -unique -out ' . $unique_ncRNAs, "Command call:\n" . "-------------\n" . $deps->{'get'} . ' ' . '-q -cf ' . $lncRNApipe_class_file . " -cff gtf $sff_opt ". join('-', split(/\-|\--/, $get_uq_sc_opts)) . ' -unique -out ' . $unique_ncRNAs); # This module does not output anything on success. if ($is_failed !~ m/could not capture/i) { $lncRNApipe_succ = pipeline_status($io, $is_failed); } else { $lncRNApipe_succ = 1; } if (-e $unique_ncRNAs && !-s $unique_ncRNAs) { $io->warning("No unique features found. In fact, this is not an error.\n" . 'It just means that any putative ncRNAs that have been extracted from --cat-ncRNAs module do not have any overlaps with known ncRNAs.'); $io->warning('Proceeding with files from "categorize_ncRNAs" module using Symbolic link ...', 'INFO!!'); $io->execute_system_command("ln -s -f $lncRNApipe_class_file $unique_ncRNAs"); } else { disp_ncRNA_counts($io, $unique_ncRNAs, "Total number of putative $known_or_unique_suffix_keyword ncRNAs"); } $cpu->finish(0, {'lncRNApipe_succ' => $lncRNApipe_succ}) if (defined $num_cpu); } $cpu->wait_all_children if (defined $num_cpu); close lncRNApipe_categorize_mod; } if (defined($get_uq_sc_opts) && $get_uq_sc_opts ne ''); # Run fetch_seq_from_ucsc.pl ###################################################################### do { my $cpu = do_parallel($num_cpu) if (defined $num_cpu); $lncRNApipe_succ = get_pipeline_status_code($cpu) if (defined $num_cpu); my $uq_tr_feat_dir = 'get_unique_features'; my $tr_feat_here_suffix = 'putative\.class\.lncRNAs\.' . $known_or_unique_suffix_keyword . '\.gtf$'; module_header($io, 'Module 4: Running fetch_seq_from_ucsc.pl'); my $is_failed = ''; my $cat_dir = $outdir . 'fetch_seq_from_ucsc'; $io->execute_system_command("mkdir -p $cat_dir", 'Making output directory for ' . $io->file_basename($deps->{'fetch'}, 'suffix') . ' [ ' . $cat_dir . ' ]') if (!-d $cat_dir); if (defined $skip_get_uq) { $uq_tr_feat_dir = 'categorize_ncRNAs'; $tr_feat_here_suffix = 'putative\.class\.lncRNAs\.gtf$'; } opendir (lncRNApipe_get_unique_mod, $outdir . $uq_tr_feat_dir) || $io->error("Cannot open directory $outdir" . $uq_tr_feat_dir . ' to read unique ncRNA features.'); push my @lncRNApipe_unique_files, grep {/$tr_feat_here_suffix$/} readdir lncRNApipe_get_unique_mod; if ($#lncRNApipe_unique_files < 0) { $lncRNApipe_succ = pipeline_status($io, 'ERROR', "Cannot find output in $cat_dir ..."); } foreach my $lncRNApipe_unique_file (@lncRNApipe_unique_files) { $cpu->start and next if (defined $num_cpu); $lncRNApipe_unique_file = $outdir . $uq_tr_feat_dir . '/' . $lncRNApipe_unique_file; $is_failed = $io->execute_get_sys_cmd_output('perl' . $pl_inc_string . $deps->{'fetch'} . ' ' . join('-', split(/\-|\--/, $fetch_sc_opts)) . ' -tmap ' . $lncRNApipe_unique_file . " -out $cat_dir -ff gtf -id 'transcript_id\\s+\\\"(.+?)\\\"' -skip '\\ttranscript\\t'", "Command call:\n" . "-------------\n" . $deps->{'fetch'} . ' ' . join('-', split(/\-|\--/, $fetch_sc_opts)) . ' -tmap ' . $lncRNApipe_unique_file . 
" -out $cat_dir -ff gtf -id 'transcript_id\\s+\\\"(.+?)\\\"' -skip '\\ttranscript\\t'"); $lncRNApipe_succ = pipeline_status($io, $is_failed); $cpu->finish(0, {'lncRNApipe_succ' => $lncRNApipe_succ}) if (defined $num_cpu); } $cpu->wait_all_children if (defined $num_cpu); close lncRNApipe_get_unique_mod; } if (defined $fetch_sc_opts); # Finally Run CPC, RNAfold and Infernal ############################################################## do { my $cpu = do_parallel($num_cpu) if (defined $num_cpu); $lncRNApipe_succ = get_pipeline_status_code($cpu) if (defined $num_cpu); opendir (lncRNApipe_fetch_seq_mod, $outdir . 'fetch_seq_from_ucsc') || $io->error("Cannot open directory $outdir" . 'fetch_seq_from_ucsc to read unique ncRNA features.'); push my @lncRNApipe_unique_seq_files, grep {/putative\.class\.lncRNAs.*?\.fa$/} readdir lncRNApipe_fetch_seq_mod; #my $blastall_path = sub { # my @file_parts = $io->file_basename(shift, 'all'); # chop $file_parts[1] if ($file_parts[1] =~ m/\/$/); # return $file_parts[1] #}; $io->exist_sys_cmd(['cut']); my $lncRNApipe_final = $outdir . 'lncRNApipe.final'; my $cat_dir = $outdir . 'CPC'; $io->execute_system_command("mkdir -p $lncRNApipe_final", 'Making output directory to store final list of ncRNAs [ ' . $io->file_basename($lncRNApipe_final) . ' ]') if (!-d $lncRNApipe_final); foreach my $lncRNApipe_unique_seq_file (@lncRNApipe_unique_seq_files) { $cpu->start and next if (defined $num_cpu); $num_cpu_by_2 = sprintf("%.0f", $num_cpu / scalar(@lncRNApipe_unique_seq_files)) if (defined $num_cpu); $num_cpu_by_2 = 1 if ($num_cpu_by_2 <= 0); $lncRNApipe_unique_seq_file = $outdir . 'fetch_seq_from_ucsc/' . $lncRNApipe_unique_seq_file; my $lncRNApipe_unique_seq_noncoding_file = $outdir . 'fetch_seq_from_ucsc/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.noncoding.FASTA'; my $CPC_out_file = $cat_dir . '/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.CPC.predict.txt'; my $cpc_work_dir = $cat_dir . '/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.CPC.out' ; my $lncRNApipe_final_trs = $lncRNApipe_final . '/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.final.gtf'; my $lncRNApipe_allInfernal_trs = $lncRNApipe_final . '/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.allInfernalHits.gtf'; my $RNAfold_dir = $lncRNApipe_final . '/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.pre.RNAfold'; my $RNAfold_dir_final = $lncRNApipe_final . '/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.final.RNAfold'; my $RNAfold_mfe = $RNAfold_dir . '/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.mfe'; my $unique_gtf = $outdir . 'get_unique_features/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.gtf'; my $pre_infernal = $lncRNApipe_final . '/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.pre.infernal'; my $final_infernal = $lncRNApipe_final . '/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.final.infernal'; if ($skip_get_uq || !-e $unique_gtf) { $unique_gtf = $outdir . 'categorize_ncRNAs/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.gtf'; } # Run CPC #################################################################################### do { module_header($io, 'Module 5: Running CPC' . ' [ on ' . $io->file_basename($lncRNApipe_unique_seq_file, 'suffix') . ' ] ...'); unlink $lncRNApipe_final_trs if (-e $lncRNApipe_final_trs); $ENV{'CPC_HOME'} = $deps->{'cpc_home'} if (!exists $ENV{'CPC_HOME'}); $ENV{'PATH'} = $deps->{'cpc_home'} . '/bin:' . 
$ENV{'PATH'} if (dirname((split /\:/, $ENV{'PATH'})[0]) ne $deps->{'cpc_home'}); if (!-d $ENV{'CPC_HOME'} || $ENV{'CPC_HOME'} eq '') { $io->error("Environment variable CPC_HOME must be set pointing to CPC directory before running CPC.\n" . 'See CPC installation instructions (http://cpc.cbi.pku.edu.cn/docs/install_guide.jsp)'); } $io->execute_system_command("mkdir -p $cat_dir", 'Making output directory for ' . $io->file_basename($deps->{'cpc'}, 'suffix') . ' [ ' . $cat_dir . ' ]') if (!-d $cat_dir); #$io->execute_system_command("mkdir -p $cpc_work_dir") if (!-d $cpc_work_dir); my $cpc_cpu = 1; $cpc_cpu = $num_cpu_by_2 if (defined $num_cpu); if (!defined $skip_cpc_core) { $io->execute_system_command("activate $deps->{cpc_home}/envs/cpc2"); $io->execute_system_command("$deps->{cpc_home}/envs/cpc2/bin/python $deps->{'cpc_home'}/cpc2/bin/CPC2.py -i " . $lncRNApipe_unique_seq_file . ' ' . $blast_strand . " -o $CPC_out_file"); $io->execute_system_command("deactivate"); # $io->execute_system_command('BLAST_STRAND=' . $blast_strand . ' NUM_CPU=' . $cpc_cpu . ' ' . $deps->{'cpc'} . # ' "' . $lncRNApipe_unique_seq_file . ' ' . $CPC_out_file . ' ' . $cpc_work_dir . ' ' . # $cpc_work_dir . '" ' . $blastall_path->($deps->{'blastall'}), # "Command call:\n" . # "-------------\n" . 'BLAST_STRAND=' . $blast_strand . ' NUM_CPU=' . $cpc_cpu . ' ' . $deps->{'cpc'} . # ' "' . $lncRNApipe_unique_seq_file . ' ' . $CPC_out_file . ' ' . $cpc_work_dir . ' ' . # $cpc_work_dir . '" ' . $blastall_path->($deps->{'blastall'})); } } if (defined $start_cpc); # To resume the remaining 2 modules, we need to check for CPC prediction file and adjust the file. if (!defined $skip_cpc) { if (-e $CPC_out_file && -s $CPC_out_file) { $io->c_time('Getting noncoding "only" transcripts [ from ' . $io->file_basename($unique_gtf, 'suffix') . " ] ..."); $io->execute_system_command("grep -v '#' $CPC_out_file | grep noncoding | cut -f 1 | " . 'while read trid; do grep -P "${trid//\./\\\.}\W" ' . "$unique_gtf; done &> $lncRNApipe_final_trs"); $io->c_time('Creating FASTA for noncoding "only" transcripts [ from ' . $io->file_basename($unique_gtf, 'suffix') . " ] ..."); $io->execute_system_command("grep -v '#' $CPC_out_file | grep noncoding | cut -f 1 | ". 'while read trid; do grep -A 1 -P "${trid//\./\\\.}\W" ' . "$lncRNApipe_unique_seq_file; done &> $lncRNApipe_unique_seq_noncoding_file"); $lncRNApipe_unique_seq_file = $lncRNApipe_unique_seq_noncoding_file; } else { $lncRNApipe_succ = 0; $io->error("CPC prediction step seems to have failed... \n" . "See --skip-cpc if you do not want to run CPC altogether\n\nBailing out!"); } } elsif (defined $skip_cpc) { $io->c_time('Creating a place holder [ --skip-cpc requested ] ...'); $CPC_out_file = $lncRNApipe_final . '/.' . $io->file_basename($lncRNApipe_unique_seq_file) . '.ph'; $io->execute_system_command('grep -oP ">.+?\\s+" ' . $lncRNApipe_unique_seq_file . " | sed -e 's/>//g' | sed -e 's/\\s+\$//g' " . '| while read trid; do echo -e "${trid}\tnoncoding"; done > ' . $CPC_out_file); $io->execute_system_command("cp $unique_gtf $lncRNApipe_final_trs"); } # Do RNAfold and draw entropy plot ############################################################ do { module_header($io, 'Module 6: Running RNAfold predictions [ on ' . $io->file_basename($lncRNApipe_unique_seq_file, 'suffix') . ' ] ...'); $io->execute_system_command("mkdir -p $RNAfold_dir") if (!-d $RNAfold_dir); chdir $RNAfold_dir; $io->c_time('Making directory for RNAfold predictions [ ' . $io->file_basename($RNAfold_dir_final, 'suffix') . 
' ] ...'); $io->execute_system_command("mkdir -p $RNAfold_dir_final") if (!-d $RNAfold_dir_final); $io->warning('CPC was not run! Will run RNAfold on all transcript models.') if (defined $skip_cpc); if (!defined $skip_rnafold_core) { $io->c_time('Running RNAfold with -p flag [ on ' . $io->file_basename($lncRNApipe_unique_seq_file, 'suffix') . ' ]. This may take very long time ...'); $io->execute_system_command($deps->{'RNAfold'} . ' -p ' . $start_rnafold . ' < ' . $lncRNApipe_unique_seq_file . ' &> ' . $RNAfold_mfe, "Command call:\n" . "-------------\n" . $deps->{'RNAfold'} . ' -p ' . $start_rnafold . ' < ' . $lncRNApipe_unique_seq_file . ' &> ' . $RNAfold_mfe); } $io->c_time('Now generating RNAfold color plots ...'); opendir (RNAfold_pl, $RNAfold_dir) || $io->error("Cannot open directory $RNAfold_dir to generate plots"); push my @RNA_relPlots, grep {/\_dp\.ps$/} readdir RNAfold_pl; $io->error('Cannot find _dp PostScript files. Bailing out!') if (scalar(@RNA_relPlots) == 0); foreach my $dp_ (@RNA_relPlots) { $dp_ = $RNAfold_dir . '/' . $dp_; my $ss_ = $RNAfold_dir . '/' . $io->file_basename($dp_, 'suffix'); $ss_ =~ s/\_dp\.ps$/\_ss\.ps/i; my $ps = $ss_; $ps =~ s/\_ss\.ps$/\_fstruct\.ps/; my $pre_ps = $ps; $io->error('Cannot find corresponding _dp or _ss PostScript file(s) [ for transcript ' . $io->file_basename($ps) . ' ] or any content in it. Bailing out!') if (!-e $ss_ || !-e $dp_ || !-s $ss_ || !-s $dp_); $ss_ = esc_tr_id($ss_); $dp_ = esc_tr_id($dp_); $ps = esc_tr_id($pre_ps); $io->execute_system_command($deps->{'relplot'} . ' ' . $ss_ . ' ' . $dp_ . ' > ' . $ps, "Command call:\n" . "-------------\n" . $deps->{'relplot'} . ' ' . $ss_ . ' ' . $dp_ . ' > ' . $ps); $io->c_time('Cleaning up *_ss.ps and *_dp.ps files ...'), $io->execute_system_command("rm $dp_"), $io->execute_system_command("rm $ss_") if ($rm_int_plots); $io->error('RNAfold prediction step failed. Cannot generate final PostScript file from _ss.ps and _dp.ps.' . "\nBailing out!") if (!-e $pre_ps || !-s $pre_ps); $io->execute_system_command("mv $ps $RNAfold_dir_final/."); } close RNAfold_pl; if (-e $CPC_out_file && -s $CPC_out_file) { $io->c_time('Filtering transcripts [ ' . $io->file_basename($unique_gtf, 'suffix') . ' ] ...'); $io->execute_system_command("grep noncoding $CPC_out_file | cut -f 1 | " . "while read trid; do find $RNAfold_dir -type f -name \${trid//\\./\\\\.}_fstruct.ps" . " -exec mv -t $RNAfold_dir_final" . "/\ {} \\;; done;"); } $io->c_time('Moving around files and cleaning up directories ...'); $io->execute_system_command("mv $RNAfold_dir/*.mfe $RNAfold_dir_final/.") if (-e $RNAfold_mfe); # All possible checks complete, RNAfold has finished. $lncRNApipe_succ = 1; } if (defined $start_rnafold); # Start Infernal and edit final GTF ############################################################# do { my $tbl = $final_infernal . '/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.txt'; my $raw = $final_infernal . '/' . $io->file_basename($lncRNApipe_unique_seq_file) . '.cmscan'; $io->execute_system_command("mkdir -p $final_infernal") if (!-d $final_infernal); module_header($io, 'Module 7: Running cmscan from Infernal [ on ' . $io->file_basename($lncRNApipe_unique_seq_file, 'suffix') . ' ] ...'); $deps->{'cmscan'} .= " --cpu $num_cpu_by_2" if (defined $num_cpu); if (!defined $skip_cmscan_core) { $io->execute_system_command($deps->{'cmscan'} . ' ' . ' --tblout ' . $tbl . ' ' . $deps->{'rfam_cm'} . ' ' . $lncRNApipe_unique_seq_file . ' &> ' . $raw, "Command call:\n" . "-------------\n" . $deps->{'cmscan'} . 
' ' . ' --tblout ' . $tbl . ' ' . $deps->{'rfam_cm'} . ' ' . $lncRNApipe_unique_seq_file . ' &> ' . $raw); } $io->warning('CPC was not run! Will run Infernal on all transcript models.') if (defined $skip_cpc); if (-e $CPC_out_file && -s $CPC_out_file) { $io->exist_sys_cmd(['sort', 'head', 'awk', 'uniq', 'mv', 'bc']); $io->c_time('Updating final GTF file with Infernal annotations...'); my $tmp_annot = $io->open_file('>', $lncRNApipe_final_trs . '.tmp'); unlink $lncRNApipe_allInfernal_trs if (-e $lncRNApipe_allInfernal_trs); my $infernal_run = $io->execute_get_sys_cmd_output("INF_GTF=$lncRNApipe_allInfernal_trs FINAL_GTF=$lncRNApipe_final_trs" . " CM_TXT_OUT=$tbl CPC_TXT_OUT=$CPC_out_file COV=$inf_cov" . ' ' . $deps->{'ann-inf'}, "Command call:\n" . "-------------\n" . "INF_GTF=$lncRNApipe_allInfernal_trs FINAL_GTF=$lncRNApipe_final_trs" . " CM_TXT_OUT=$tbl CPC_TXT_OUT=$CPC_out_file COV=$inf_cov" . ' ' . $deps->{'ann-inf'}); $io->execute_system_command('perl' . $pl_inc_string . $deps->{'gen-inf'} . ' -q ' . '--tbl ' . $tbl . ' --final ' . $lncRNApipe_final_trs . ' &> ' . $lncRNApipe_allInfernal_trs, "Command call:\n" . "-------------\n" . $deps->{'gen-inf'} . ' -q ' . ' --tbl ' . $tbl . ' --final ' . $lncRNApipe_final_trs . ' &> ' . $lncRNApipe_allInfernal_trs); $lncRNApipe_succ = pipeline_status($io, 'ERROR!', 'Cannot get annotation from Infernal run') if (!$infernal_run || $infernal_run =~ m/could not capture/i || !-e $tbl || !-e $raw || !-s $tbl || !-s $raw || !-s $lncRNApipe_allInfernal_trs); print $tmp_annot $infernal_run; close $tmp_annot; $io->c_time('Moving around files...'); $io->execute_system_command("mv $lncRNApipe_final_trs" . '.tmp ' . $lncRNApipe_final_trs); } } if (defined $start_infernal); chomp (my $final_nc_tr_count = $io->execute_get_sys_cmd_output('grep -oP \'transcript_id\\s+\\".+?\\"\' ' . $lncRNApipe_final_trs . ' | grep -oP \'\\".+\\"\' | sed -e \'s/\\"//g\' | sort -n | uniq | wc -l')); if (!$final_nc_tr_count) { $io->execute_system_command( "cat $cat_dir/*.CPC.predict.txt | head", "Truncated output from $cat_dir/*.CPC.predict.txt"); $io->error("Could not get final putative lncRNA count.\n" . "Try re-running the final module. Take a look at --skip-cpc-core, --skip-rnafold-core and --skip-cmscan-core options." . "\nThis may also mean that the putative lncRNAs that were used with CPC may have all been flagged as \"coding\"."); } else { # Since we will have some transcripts by now that are filtered and passed all stages of the pipeline $lncRNApipe_succ = 1; } disp_ncRNA_counts($io, $lncRNApipe_final_trs, "Final putative $novel_or_known_suffix_keyword lncRNA count"); $cpu->finish(0, {'lncRNApipe_test' => $lncRNApipe_succ}) if (defined $num_cpu); unlink $CPC_out_file if (defined $skip_cpc); } $cpu->wait_all_children if (defined $cpu); close lncRNApipe_fetch_seq_mod; } if (defined $start_cpc || defined $start_rnafold || defined $start_infernal || defined $skip_cpc); close $dep_tools_fh; close $pl_dep_paths_fh; # \x{2633} if ($lncRNApipe_succ) { $io->warning('INFERNAL authors state the following:' . "\n\n" . 'We recommend that you use Rfam/Infernal for vertebrate genome annotation with extreme caution! ' . "\n" . 'Nevertheless, Rfam/Infernal does tell us about important sequence similarities that are effectively undetectable by other means. ' . "\n" . 'However, in complex eukaryotic genomes, it is important to treat hits as sequence similarity ' . "\n" . 
'information (much as you might treat BLAST hits), rather than as evidence of bona fide ncRNA genes.', 'INFO!!'); $io->c_time("\x{2632}\x{2634} lncRNApipe Pipeline finished!"); } else { $io->c_time("\x{2632}\x{2634} lncRNApipe Pipeline aborted(?)"); } $io->end_timer($s_time); } ################################################################################################ # # # Install the pipeline # # ==================== # # # ################################################################################################ sub setup_lncRNApipe { chomp(my $install_dir = shift); chomp(my $dl_util = `which wget 2>&1`); binmode(STDOUT, ":utf8"); if ($install_dir ne 'get_dl_UTIL') { print "\n\x{1f473} Hi! I am Vayu and I will attempt to install lncRNApipe for you."; print "\nChecking for platform independent prerequisites and some of the GNU Core Utilities on UNIX based machines...\n"; } # Find out if user wants to download master or devel $setup = 'master' if (defined $setup && $setup eq ''); # Wget or Curl if ($dl_util !~ m/.+?wget$/i) { chomp ($dl_util = `which curl 2>&1`); succ_or_fail(0, 'Curl|Wget') if ($dl_util !~ m/.+?curl$/i); succ_or_fail(1, 'curl'); $dl_util .= ' -LksO '; } elsif ($dl_util =~ m/.+?(wget)$/i) { succ_or_fail(1, $1); $dl_util .= ' --no-check-certificate '; } else{ succ_or_fail(0, 'Curl|Wget'); } # Return dl util only if requested return $dl_util if ($install_dir eq 'get_dl_UTIL'); # BASH check_util('bash'); # Unzip check_util('unzip'); # Make check_util('make'); # Echo check_util('echo'); # Uname check_util('uname'); # bunzip2 check_util('bunzip2'); # gunzip check_util('gunzip'); # rm check_util('rm'); # cp check_util('cp'); # mkdir check_util('mkdir'); # touch check_util('touch'); # mv check_util('mv'); # find check_util('find'); # bc check_util('bc'); # head check_util('head'); # sort check_util('sort'); # cat check_util('cat'); # ln check_util('ln'); # wc check_util('wc'); # ps check_util('ps'); # Mandatory by Single UNIX Specification. Just checking for sanity check_util('awk'); print "\n\nWe definetely need the custom module, IO::Routine.\n\n" . "Attempting to fetch IO::Routine from Perl-for-Bioinformatics repository on github...\n"; if (-d "$install_dir/.build") { system("rm -rf $install_dir/.build"); } if (-d '.build') { system("rm -rf .build"); } if (-d "$install_dir/.lncRNApipe.depbin") { system("rm -rf $install_dir/.lncRNApipe.depbin"); } if (-d "$install_dir/.lncRNApipe.depconf") { system("rm -rf $install_dir/.lncRNApipe.depconf"); } system("mkdir $install_dir/.build"); succ_or_fail(0, '.build dir') if (!-d "$install_dir/.build"); system("rm $install_dir/$setup.zip") if (-e "$install_dir/$setup.zip"); my $dl_util_slave = ''; if ($dl_util =~ m/.*?wget\s+/i) { $dl_util_slave = $dl_util . "-O $setup.zip"; } else { $dl_util_slave = $dl_util; } # Since the sandboxed binaries are large, let the $dl_util print download status. print "\n$dl_util_slave https://github.com/biocoder/Perl-for-Bioinformatics/archive/$setup.zip\n\n"; system("$dl_util_slave https://github.com/biocoder/Perl-for-Bioinformatics/archive/$setup.zip"); system("mv $install_dir/$setup.zip $install_dir/.build/$setup.zip"); succ_or_fail(0, '$setup.zip') if (!-e "$install_dir/.build/$setup.zip"); print "Inflating $setup.zip...\n"; print "\nunzip -d $install_dir/.build $install_dir/.build/$setup.zip\n"; system("unzip -d $install_dir/.build $install_dir/.build/$setup.zip > /dev/null 2>&1"); # Use cpanm to install required perl modules. 
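# A standalone cpanm is fetched from the cpanminus repository (same branch as --setup),
# staged under .build/ and made executable; each CPAN dependency is then installed locally
# into $pm_install_dir/PERLLIBS via cpanm_status(). --local-lib, when given, overrides the
# install target, and any --setup-compiler argument is passed through as --configure-args.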
system("$dl_util https://raw.githubusercontent.com/miyagawa/cpanminus/$setup/cpanm > /dev/null 2>&1"); succ_or_fail(0, 'cpanm') if (!-e 'cpanm'); system("mv cpanm $install_dir/.build"); system("chmod 755 $install_dir/.build/cpanm"); my $pm_install_dir = $install_dir; if (defined $local_lib) { chop $local_lib if ($local_lib =~ m/\/$/); $pm_install_dir = $local_lib; } #cpanm_status($pm_install_dir, 'Module::Load', $install_dir); # This is a core module. print "\nInstalling required Perl Modules...\n"; cpanm_status($pm_install_dir, 'Set::IntervalTree', $install_dir, $cpanm_args); cpanm_status($pm_install_dir, 'LWP::Simple', $install_dir, $cpanm_args); cpanm_status($pm_install_dir, 'XML::XPath', $install_dir, $cpanm_args); cpanm_status($pm_install_dir, 'XML::XPath::XMLParser', $install_dir, $cpanm_args); cpanm_status($pm_install_dir, 'Parallel::ForkManager', $install_dir, $cpanm_args); cpanm_status($pm_install_dir, 'MIME::Lite', $install_dir, $cpanm_args); cpanm_status($pm_install_dir, 'Email::Valid', $install_dir, $cpanm_args); cpanm_status($pm_install_dir, 'YAML', $install_dir, $cpanm_args); cpanm_status($pm_install_dir, 'Data::Uniqid', $install_dir, $cpanm_args); if (!ask_user("\n\nDo you already have BioPerl installed?", 120)) { print "\nOK ... This may take a while ... Grab a snack or a cup of coffee!\n"; $cpanm_args .= ' --force '; cpanm_status($pm_install_dir, 'Bio::SeqIO', $install_dir, $cpanm_args); } #print "\nThank you cpanm!\n"; my $pm_path = "$pm_install_dir/PERLLIBS:$pm_install_dir/PERLLIBS/lib:$pm_install_dir/PERLLIBS/lib/perl5"; print "\n\nInstalling IO::Routine...\n\n"; system("mkdir $pm_install_dir/PERLLIBS") if (!-d "$pm_install_dir/PERLLIBS"); my $custom_pm_log = `cd $install_dir/.build/Perl-for-Bioinformatics-$setup/IO-Routine;perl Makefile.PL PREFIX=$pm_install_dir/PERLLIBS LIB=$pm_install_dir/PERLLIBS/lib && make && make test && make install; 2>&1`; if ($custom_pm_log =~ m/fail|error|cannot/i) { succ_or_fail(0, 'IO::Routine'); } else { succ_or_fail(1, 'IO::Routine'); } require "$pm_install_dir/PERLLIBS/lib/IO/Routine.pm"; my $io = IO::Routine->new($help, $quiet); chomp(my $lncRNApipe_root = dirname(abs_path($0))); if (!-d "$lncRNApipe_root/.lncRNApipe.depbin") { system("mkdir $install_dir/.lncRNApipe.depbin"); } if (!-d "$lncRNApipe_root/lncRNApipe-test") { system("mkdir $install_dir/lncRNApipe-test"); } print "\n\nMoving required tools and scripts to current install path...\n"; system("cp -rf $install_dir/.build/Perl-for-Bioinformatics-$setup/NGS-Utils/.lncRNApipe.depbin/* $install_dir/.lncRNApipe.depbin/."); system("cp -rf $install_dir/.build/Perl-for-Bioinformatics-$setup/NGS-Utils/* $install_dir/."); print "Moving test data to current install path...\n"; system("cp -rf $install_dir/.build/Perl-for-Bioinformatics-$setup/NGS-Utils/lncRNApipe-test/* $install_dir/lncRNApipe-test/."); print "Cleaning up build directory...\n"; system("rm -rf $install_dir/.build"); $io->execute_system_command(0, 'Detecting system architecture...'); my $sys_arch_info = $io->execute_get_sys_cmd_output('uname -a'); $io->error('This is not a UNIX based machine ... Aborting installation!') if (!$sys_arch_info || $sys_arch_info !~ m/linux|darwin/i); $io->error('It is not a 64-bit machine!' . "\nIt is your responsibility to make sure that you have the following tools" . " installed for your system architecture and also must be found in \$PATH:\n" . 
"\ngtfToGenePred, genePredToGtf, bowtie, tophat, trimmomatic-0.36.jar\ncufflinks, cuffcompare, cuffdiff, cmscan, sambamba, formatdb, CPC2, and RNAfold\n\nBailing out!") if ($sys_arch_info !~ m/x86\_64/i); my $sys_arch = ''; if ($sys_arch_info =~ m/darwin/i) { $sys_arch = 'darwin'; $io->execute_system_command(0, 'Skipping version requirement check for system level commands [ tar ], [ cut ] and [ wc ].' . "\nFreeBSD's tools does not provide version numbers (?)"); check_util('tar'); check_util('cut'); check_util('wc'); # Darwin special case... chomp(my $darwin_sed = `sed --version 2>&1`); chomp(my $darwin_grep = `grep --version 2>&1`); if ($darwin_sed !~ m/gnu/i || $darwin_grep !~ m/gnu/i) { print "\n"; $io->warning("We need GNU's grep and sed instead of FreeBSD's."); chomp(my $homebrew = `which brew 2>&1`); print "Looking to see if you have homebrew to install GNU tools [ grep and sed ] ...\n"; if ($homebrew =~ m/.*brew$/i) { succ_or_fail(1, 'homebrew'); } else { $io->error('Please install homebrew. See installation instructions at http://brew.sh/' . "\nThen, do the following:\n\nbrew update\nbrew tap homebrew/dupes\n"); } print "\n\nPlease execute the following commands in order and" . " rerun the setup procedure to successfully install lncRNApipe pipeline.\n"; $io->execute_system_command(0, "brew install coreutils\n" . "ln -s /usr/local/bin/ggrep /usr/local/bin/grep\n" . "ln -s /usr/local/bin/gsed /usr/local/bin/sed\n" . "ln -s /usr/local/bin/gstat /usr/local/bin/stat\n" . 'echo "/usr/local/bin:$PATH" >> ~/.bash_profile' . "\nsource ~/.bash_profile"); print "Skipping version check for GNU grep, GNU sed and GNU stat...\n"; succ_or_fail(2, 'sed'); succ_or_fail(2, 'grep'); succ_or_fail(2, 'stat'); print "\n\nAborting setup...\n\n"; exit; } else { $io->check_sys_level_cmds(['grep', 'sed', 'stat'], ['2.6.3', '4.2.1', '0']); succ_or_fail(1, 'grep'); succ_or_fail(1, 'sed'); succ_or_fail(1, 'stat') } } else { $sys_arch = 'linux'; $io->check_sys_level_cmds(['grep', 'sed', 'tar', 'cut', 'wc', 'stat'], ['2.6.3', '4.2.1', '0', '8', '8', '0']); succ_or_fail(1, 'grep'); succ_or_fail(1, 'sed'); succ_or_fail(1, 'tar'); succ_or_fail(1, 'cut'); succ_or_fail(1, 'wc'); succ_or_fail(1, 'stat') } print "\n\nSetting up PERL5LIB paths...\n"; unlink "$USER_HOME/.lncRNApipe.PERLLIBS" if (-e "$USER_HOME/.lncRNApipe.PERLLIBS"); my $pl_dep_fh = $io->open_file('>', "$USER_HOME/.lncRNApipe.PERLLIBS"); print $pl_dep_fh $pm_path; $io->execute_system_command(0, "\nChecking for lncRNApipe pipeline tool dependencies..." . "\n\nWriting tool dependency chain to $USER_HOME/.lncRNApipe.depconf"); my $dep_fh = $io->open_file('>', "$USER_HOME/.lncRNApipe.depconf"); # We will figure out if user is installing at a different location other than # from the cloned repo. 
$install_dir = $lncRNApipe_root if (-d "$lncRNApipe_root/.lncRNApipe.depbin" && $lncRNApipe_root !~ m/^\./); if (-e "$install_dir/.lncRNApipe.depbin/Rfam.cm.1_1.bz2" && -e "$install_dir/.lncRNApipe.depbin/Rfam.cm.1_1.i1m.bz2") { print "Decompressing Rfam CM files...\n"; system("bunzip2 $install_dir/.lncRNApipe.depbin/Rfam.cm.1_1.bz2"); system("bunzip2 $install_dir/.lncRNApipe.depbin/Rfam.cm.1_1.i1m.bz2"); } print $dep_fh check_bio_util('cuffcompare', $install_dir, $sys_arch), "\n"; #print $dep_fh check_bio_util('blastall', $install_dir, $sys_arch), "\n"; print $dep_fh check_bio_util('RNAfold', $install_dir, $sys_arch), "\n"; print $dep_fh check_bio_util('gtfToGenePred', $install_dir, $sys_arch), "\n"; print $dep_fh check_bio_util('genePredToGtf', $install_dir, $sys_arch), "\n"; print $dep_fh check_bio_util('trimmomatic', $install_dir, $sys_arch), "\n"; print $dep_fh check_bio_util('bowtie', $install_dir, $sys_arch), "\n"; print $dep_fh check_bio_util('tophat', $install_dir, $sys_arch), "\n"; print $dep_fh check_bio_util('cufflinks', $install_dir, $sys_arch), "\n"; print $dep_fh check_bio_util('sambamba', $install_dir, $sys_arch), "\n"; print $dep_fh check_bio_util('cmscan', $install_dir, $sys_arch), "\n"; print $dep_fh check_bio_util('formatdb', $install_dir, $sys_arch), "\n"; print $dep_fh "$install_dir/" . $CAT_SC, "\n" if (check_native("$install_dir/" . $CAT_SC)); print $dep_fh "$install_dir/" . $FETCH_SC, "\n" if (check_native("$install_dir/" . $FETCH_SC)); print $dep_fh "$install_dir/" . $GET_UQ_SC, "\n" if (check_native("$install_dir/" . $GET_UQ_SC)); print $dep_fh "$install_dir/" . $RELPLOT_MOD_SC, "\n" if (check_native("$install_dir/" . $RELPLOT_MOD_SC)); print $dep_fh "$install_dir/" . $CPC_SH_SC, "\n" if (check_native("$install_dir/" . $CPC_SH_SC)); print $dep_fh "$install_dir/" . $ANN_INF_SC, "\n" if (check_native("$install_dir/" . $ANN_INF_SC)); print $dep_fh "$install_dir/" . $SEND_MAIL_SC, "\n" if (check_native("$install_dir/" . $SEND_MAIL_SC)); print $dep_fh "$install_dir/" . $GEN_INFERNAL_HITS_SC, "\n" if (check_native("$install_dir/" . $GEN_INFERNAL_HITS_SC)); print $dep_fh "$install_dir/" . $HGCONF, "\n" if (check_native("$install_dir/" . $HGCONF)); if (-e "$install_dir/.lncRNApipe.depbin/Rfam.cm.1_1" && -s "$install_dir/.lncRNApipe.depbin/Rfam.cm.1_1") { print $dep_fh "$install_dir/.lncRNApipe.depbin/Rfam.cm.1_1\n"; succ_or_fail(1, "Rfam.cm.1_1"); } else { succ_or_fail(0, "Rfam.cm.1_1"); } #if (!exists $ENV{'CPC_HOME'}) { if (ask_user("\n\nDo you want me to attempt to install CPC?", 120)) { setup_cpc($dl_util, $dep_fh, $sys_arch_info); succ_or_fail(1, 'CPC'); } else { succ_or_fail(2, 'CPC'); } #print "\n\nSeems like Coding Potential Calculator (CPC) is not installed. This may not be a problem if you do not intend to use CPC."; #print "\nSee CPC installation instructions (http://cpc.cbi.pku.edu.cn/docs/install_guide.jsp) if you would like to run CPC module with lncRNApipe."; #} close $dep_fh; $io->execute_system_command(0, "\n\nSetup complete. See \"perl lncRNApipe --h\" to run the pipeline with options."); # Finally copy .hg.conf to user home forcefully since we may have the updated version. system("cp -f $install_dir/$HGCONF $ENV{'HOME'}/.hg.conf > /dev/null 2>&1"); system("chmod 600 $ENV{'HOME'}/.hg.conf > /dev/null 2>&1"); # For build check exit; } # Try and install CPC into current install path. 
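# setup_cpc() below proceeds as follows: download the Miniconda3 installer (Linux or
# Mac OS X, x86_64), install it into ./.cpc, create an isolated "cpc2" conda environment
# with python 2.7, biopython and numpy, fetch the CPC2 fork from github.com/biocoder/CPC2,
# build its bundled libsvm, and record the CPC home as a "cpcHome:" line in .lncRNApipe.depconf.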
sub setup_cpc { my $dl_util = shift; my $fh = shift; my $sys_arch = shift; #return if (exists $ENV{'CPC_HOME'}); my $depconf_fh = read_conf('/.lncRNApipe.depconf'); my $curr_dir = cwd . '/.cpc'; system("rm -rf $curr_dir") if (-d $curr_dir); print $fh "cpcHome:$curr_dir\n"; $ENV{'CPC_HOME'} = $curr_dir; get_deps($depconf_fh); my $miniconda_inst = 'https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh'; if ($sys_arch =~ m/darwin/i && $sys_arch =~ m/x86_64/i) { $miniconda_inst = 'https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh'; } my $miniconda_inst_name = basename($miniconda_inst); my $cpc_saved = 'master.zip'; $cpc_saved = $setup . '.zip' if (defined $setup && $setup ne ''); print "\n\nAttempting to setup CPC 2.0.\nThis may take a while as we will isolate the Python environment. Stay put...\n\n"; # Install isolated python, biopython and numpy system("$dl_util $miniconda_inst > /dev/null 2>&1"); system("chmod 755 $miniconda_inst_name"); system("./$miniconda_inst_name -b -u -p $curr_dir > /dev/null 2>&1"); system("$curr_dir/bin/conda update -y -n base conda > /dev/null 2>&1"); system("$curr_dir/bin/conda config --set changeps1 False > /dev/null 2>&1"); system("$curr_dir/bin/conda create -y -n cpc2 python=2.7 biopython numpy -c bioconda"); # Install CPC2 system("$dl_util https://github.com/biocoder/CPC2/archive/master.zip > /dev/null 2>&1"); system("unzip -d $curr_dir $cpc_saved > /dev/null 2>&1"); system("rm $cpc_saved"); system("mkdir -p $curr_dir/cpc2"); system("mv $curr_dir/CPC2-master/* $curr_dir/cpc2/."); system("chmod 755 $curr_dir/bin/activate"); system("chmod 755 $curr_dir/bin/deactivate"); #system("ln -s $curr_dir/cpc2/bin/CPC2.py $curr_dir/."); chdir "$curr_dir/cpc2/libs/libsvm/libsvm-3.18"; system("make clean && make > /dev/null 2>&1"); system("rmdir $curr_dir/CPC2-master"); chdir "$curr_dir/../"; system("rm $miniconda_inst_name"); # chdir "$curr_dir/cpc-master/libs/estate"; # system("make clean && make > /dev/null 2>&1"); # chdir "$curr_dir/data"; # my $format_db = 'uniref90.fasta'; # if (!ask_user("\nIf you already have uniref90.fasta file somewhere, then there is no point in wasting time to download it again.\n" . # "Do you already have uniref90.fasta somewhere on this system?", 300)) { # print "\nAttempting protein database download...\n\n"; # $dl_util =~ s/$cpc_saved/uniref90\.fasta\.gz/; # if (!-e "$curr_dir/data/uniref90.fa") { # system("$dl_util ftp://ftp.ebi.ac.uk/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz"); # $format_db = "uniref90.fasta.gz"; # } # else { # print "\nFASTA file [ uniref90.fasta ] already exists in $curr_dir/data. Maybe download was incomplete ?"; # print "\nBailing out to make sure we are not formatting incomplete FASTA !" . # "\n\nMake sure you have properly downloaded uniref90.fasta file from:" . # "\n\tftp://ftp.ebi.ac.uk/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz" . # "\nand run the following command:\n" . # "\n\t$deps->{'formatdb'} -i $curr_dir/data/$format_db -p T -n prot_db\n\n"; # exit 0; # } # } # else { # my $uniref90_fasta_err = ''; # until (-r $format_db && -e $format_db && -s $format_db) { # $format_db = ask_user("$uniref90_fasta_err\nEnter full path to uniref90.fasta:", 300, 'fasta|fa'); # $uniref90_fasta_err = "\nCannot find $format_db or the file is empty or not readable...\n\nCPC [ ERROR: $! 
] !!\n"; # } # } # if ($format_db =~ m/\.gz$/) { # print "\nDecompressing uniref90.fasta.gz...\n"; # system("gunzip uniref90.fasta.gz"); # $format_db =~ s/\.gz$//; # } # if (!-e "$curr_dir/data/formatdb.log" || # !-s "$curr_dir/data/formatdb.log" || # !-e "$curr_dir/data/prot_db.pal" || # !-s "$curr_dir/data/prot_db.pal") { # print "\nFormatting database...\n"; # system($deps->{'formatdb'} . " -i $format_db -p T -n prot_db"); # } return; } # Check for pipeline scripts. sub check_native { my $nat_sc = shift; # Special case, match perl script, shell script and hg.conf.sh (my $sc) = ($nat_sc =~ m/.*\/(.+?\.pl|.+?\.sh)$/); if (-e $nat_sc) { succ_or_fail(1, $sc); return 1; } else { succ_or_fail(0, $sc); } return 0; } # Check for dependencies sub check_bio_util { my $cmd = shift; my $install_dir = shift; my $sys_arch = shift; chomp (my $biocmd4arch = `which $cmd 2>&1`); my $sandboxed4arch = $sys_arch . '_' . $cmd; # blastall special case. CPC looks for blastall not linux_ or darwin_ if ($sandboxed4arch =~ m/blastall/i) { system("ln -s -f $install_dir/.lncRNApipe.depbin/$sandboxed4arch $install_dir/.lncRNApipe.depbin/blastall"); $sandboxed4arch = 'blastall'; } if ($biocmd4arch !~ m/.*$cmd$/i && -d "$install_dir/.lncRNApipe.depbin" && -e "$install_dir/.lncRNApipe.depbin/$sandboxed4arch") { succ_or_fail(1, $cmd); $biocmd4arch = "$install_dir/.lncRNApipe.depbin/$sandboxed4arch"; } elsif ($biocmd4arch =~ m/.*$cmd$/i) { succ_or_fail(1, $cmd); } else { succ_or_fail(0, $cmd); } return $biocmd4arch; } # Check for system utilities sub check_util { my $cmd = shift; chomp (my $util = `which $cmd 2>&1`); if ($util !~ m/.+?$cmd/i) { succ_or_fail(0, $cmd); print "\nAborting...\n\n"; exit; } else { succ_or_fail(1, $cmd); } return $util; } # Print success or failure of lncRNApipe pipeline modules sub pipeline_status { my $self = shift; my $status = shift; my $msg = shift; if ($status =~ m/ERROR|STDERR/i) { if ($msg) { print "\nERROR!\n------\n$msg\n"; } else { print "\n$status\n"; } $self->c_time("\x{2632}\x{2634} lncRNApipe Pipeline aborted(?)\n\n"); exit; } elsif ($status !~ m/STDERR/i) { print $status; } return 1; } # Print success or failure with character length adjustments sub succ_or_fail { my $code = shift; my $msg = shift; my $char_white_space = 70 - length($msg); $char_white_space = 5 if ($char_white_space < 0); print "\n"; printf("%s%*s%s", $msg, $char_white_space, '... ', BOLD GREEN . 'OK' . RESET) if ($code == 1); if ($code == 0) { printf("%s%*s%s", $msg, $char_white_space, '... ', BOLD RED . 'FAIL' . RESET); print "\n\n"; exit; } printf("%s%*s%s", $msg, $char_white_space, '... ', BOLD YELLOW . 'SKIP' . RESET) if ($code == 2); printf("%s%*s%s", $msg, $char_white_space, '... ', BOLD YELLOW . 'WARN' . RESET) if ($code == 3); return; } # Format module header to easily spot the module start in log output sub module_header { my $c_io = shift; my $heading = shift; my $char_hash_len = 73 - length($heading); $char_hash_len = 0 if ($char_hash_len < 0); my $char_hash_fixed = '#' x 27; my $char_hash_var = '#' x $char_hash_len; my $heading_string = sprintf("%s%s%s", $char_hash_fixed, ' ' . $heading . ' ', $char_hash_var); $c_io->c_time($heading_string); return; } # Source the environment sub source_env { chomp(my $pl_dep_paths_fh = shift); chomp(my $pl_paths = do { local $/; <$pl_dep_paths_fh> }); my $perl_inc_string = ' '; foreach my $pl_path (split(/\:/, $pl_paths)) { $perl_inc_string .= '-I' . $pl_path . 
' ';
        push @INC, $pl_path;
    }
    return $perl_inc_string;
}

# Report cpanm status
sub cpanm_status {
    my $install_dir = shift;
    my $pm = shift;
    my $cpanm_bin = shift;
    my $cpanm_args = shift;
    my $cpanm_log = `$cpanm_bin/.build/cpanm $cpanm_args -l $install_dir/PERLLIBS $pm 2>&1`;
    if ($cpanm_log !~ m/.*?success|up to date.*/i && $pm =~ m/xml/i) {
        print STDOUT "\n\nYou need to install XML parser C libraries.\n\n\t\* On Ubuntu / Debian based Linux distributions, as root user, do:\n\n\t\tapt-get install libexpat1 libexpat1-dev\n\n\t\* On RedHat / Fedora / CentOS based Linux distributions, as root user do:\n\n\t\tyum install expat expat-devel\n\n";
        succ_or_fail(0, $pm);
    }
    if ($cpanm_log !~ m/.*?success|up to date.*/i) {
        if (-e "$ENV{'HOME'}/.cpanm/build.log") {
            print "\n\n";
            system("cat $ENV{'HOME'}/.cpanm/build.log");
            print "\n";
        }
        succ_or_fail(0, $pm);
    }
    else {
        succ_or_fail(1, $pm);
    }
    return;
}

# Report transcript files and size
sub get_num_tr_files {
    my $cmd_args = shift;
    my @args = split/\s+/, $cmd_args;
    my $last_opt_elem = my $tr_size = 0;
    my $tr_files = '';
    for (0 .. $#args) {
        $last_opt_elem = $_ + 2 if ($args[$_] =~ m/^\-/);
    }
    $tr_size = $#args - $last_opt_elem;
    $tr_files = join(' ', @args[$last_opt_elem .. $#args]);
    return($tr_files, $tr_size);
}

# Get number of transcripts
sub get_num_trs {
    my $self = shift;
    my $file = shift;
    $self->exist_sys_cmd(['wc', 'sort', 'uniq']);
    return $self->execute_get_sys_cmd_output('grep -oP \'transcript_id \\".+?\\"\' ' . $file . ' | awk \'{print $2}\' | sed -e \'s%\"%%g\' | sort -n | uniq | wc -l');
}

# Get cuffcompare's annotation file
sub get_annot {
    my $cmd_args = shift;
    my $annot_file = '';
    ($annot_file) = ($cmd_args =~ m/.*?-r\s+(.+?)\s+.*/i);
    return $annot_file;
}

# Ask for user input
sub ask_user {
    my $msg = shift;
    my $timeout = shift;
    my $general = shift;
    my $ans;
    $timeout = 60 if (!$timeout);
    while (1) {
        print $msg, ' [ y/n ]: ';
        eval {
            local $SIG{ALRM} = sub { die "DidNotAnswer" };
            alarm $timeout;
            # Read the user's answer from STDIN before the alarm fires
            chomp($ans = <STDIN>);
        };
        succ_or_fail(0, "\n\nAborted! [ No answer for $timeout seconds. 
] ??") if ($@ =~ m/DidNotAnswer/i); if ($general && $ans !~ m/^y|n|$general$/i) { print "\nInvalid answer!\n"; next; } elsif (!$general && $ans !~ m/^y|n$/i) { print "\nInvalid answer!\n"; next; } else { alarm 0; last; } } return 1 if ($ans =~ m/^y$/i); return 0 if ($ans =~ m/^n$/i); return $ans if ($ans =~ m/fasta|fa$/); } # Get and verify dependencies sub get_deps { my $dep_tools_fh = shift; my $dep_tools = do {local $/; <$dep_tools_fh>}; ($deps->{'cuffcompare'}) = ($dep_tools =~ m/(.+?cuffcompare)/i); #($deps->{'blastall'}) = ($dep_tools =~ m/(.+?blastall)/i); ($deps->{'RNAfold'}) = ($dep_tools =~ m/(.+?rnafold)/i); ($deps->{'bin-gtfToGenePred'}) = ($dep_tools =~ m/(.+?gtfToGenePred)/i); ($deps->{'bin-genePredToGtf'}) = ($dep_tools =~ m/(.+?genePredToGtf)/i); ($deps->{'trimmomatic'}) = ($dep_tools =~ m/(.+?trimmomatic)/i); ($deps->{'bowtie'}) = ($dep_tools =~ m/(.+?bowtie)/i); ($deps->{'tophat'}) = ($dep_tools =~ m/(.+?tophat)/i); ($deps->{'cufflinks'}) = ($dep_tools =~ m/(.+?cufflinks)/i); ($deps->{'sambamba'}) = ($dep_tools =~ m/(.+?sambamba)/i); ($deps->{'hgconf'}) = ($dep_tools =~ m/(.+?$HGCONF)/i); ($deps->{'cmscan'}) = ($dep_tools =~ m/(.+?cmscan)/i); ($deps->{'formatdb'}) = ($dep_tools =~ m/(.+?formatdb)/i); ($deps->{'rfam_cm'}) = ($dep_tools =~ m/(.+?Rfam\.cm\.1\_1)/i); ($deps->{'cat'}) = ($dep_tools =~ m/(.+?$CAT_SC)/i); ($deps->{'fetch'}) = ($dep_tools =~ m/(.+?$FETCH_SC)/i); ($deps->{'get'}) = ($dep_tools =~ m/(.+?$GET_UQ_SC)/i); ($deps->{'cpc'}) = ($dep_tools =~ m/(.+?$CPC_SH_SC)/i); ($deps->{'ann-inf'}) = ($dep_tools =~ m/(.+?$ANN_INF_SC)/i); ($deps->{'relplot'}) = ($dep_tools =~ m/(.+?$RELPLOT_MOD_SC)/i); ($deps->{'email'}) = ($dep_tools =~ m/(.+?$SEND_MAIL_SC)/i); ($deps->{'gen-inf'}) = ($dep_tools =~ m/(.+?$GEN_INFERNAL_HITS_SC)/i); ($deps->{'cpc_home'}) = ($dep_tools =~ m/cpcHome\:(.+)/i) if (!exists $ENV{'CPC_HOME'}); foreach my $dep (keys %$deps) { die "\n\nCannot find required dependency [ " . $dep . " ]. May be reinstalling lncRNApipe will resolve the issue?\n\n" if (!$deps->{$dep} || !-e $deps->{$dep}); } return; } # Read lncRNApipe configuration sub read_conf { my $conf = shift; open(my $conf_fh, '<', $USER_HOME . $conf) || die "\nCannot open $USER_HOME$conf" . ".\nMay be setup was unsuccessful (?): $!\!\n" . "\nRun \"perl lncRNApipe -setup\" to try to setup lncRNApipe pipeline.\n\n"; return $conf_fh; } # Escape tr ids sub esc_tr_id { my $tr_id = shift; $tr_id =~ s/\|/\\\|/; $tr_id =~ s/\:/\\\:/; return $tr_id; } # Load parallel fork module sub do_parallel { my $num_cpu = shift; require Parallel::ForkManager; Parallel::ForkManager->import(); my $cpu = Parallel::ForkManager->new($num_cpu); $cpu->set_max_procs($num_cpu); return $cpu; } # Display ncRNA category counts sub disp_ncRNA_counts { my $c_io = shift; my $file = shift; my $total_desc = shift; chomp (my $num_lincs = $c_io->execute_get_sys_cmd_output("grep -iP '\ttranscript\t.+?lncRNA_type.+?lincrna.*?(Infernal)?' $file | wc -l")); $num_lincs = 0 if ($num_lincs =~ m/^could not capture/i); chomp (my $num_concs = $c_io->execute_get_sys_cmd_output("grep -iP '\ttranscript\t.+?lncRNA_type.+?conc.*?.*?(Infernal)?' $file | wc -l")); $num_concs = 0 if ($num_concs =~ m/^could not capture/i); chomp (my $num_poncs = $c_io->execute_get_sys_cmd_output("grep -iP '\ttranscript\t.+?lncRNA_type.+?ponc.*?.*?(Infernal)?' $file | wc -l")); $num_poncs = 0 if ($num_poncs =~ m/^could not capture/i); chomp (my $num_incs = $c_io->execute_get_sys_cmd_output("grep -iP '\ttranscript\t.+?lncRNA_type.+?\"inc.+?intronic.*?.*?(Infernal)?' 
$file | wc -l")); $num_incs = 0 if ($num_incs =~ m/^could not capture/i); chomp (my $num_ex_ov = $c_io->execute_get_sys_cmd_output("grep -iP '\ttranscript\t.+?lncRNA_type.+?exonic\\s+overlap.*?.*?(Infernal)?' $file | wc -l")); $num_ex_ov = 0 if ($num_ex_ov =~ m/^could not capture/i); chomp (my $num_inf_ann = $c_io->execute_get_sys_cmd_output("grep -iP '\ttranscript\t.+?BitScore' $file | grep -iP '\ttranscript\t' | wc -l")); $num_inf_ann = 0 if ($num_inf_ann =~ m/^could not capture/i); print("\n\nlncRNA Summary [ " . $c_io->file_basename($file, 'suffix') . " ] :\n" . "---------------------------------------------------------------------------------------\n" . "LincRNAs: $num_lincs\n" . "Intronic overlaps - Concs: $num_concs\n" . "Intronic overlaps - Poncs: $num_poncs\n" . "Intronic overlaps - Incs: $num_incs\n" . "Exonic overlaps: $num_ex_ov\n" . $total_desc . ': ' . ($num_lincs + $num_concs + $num_incs + $num_ex_ov + $num_poncs) . "\nInfernal matches: $num_inf_ann\n" . "\n\n" ); return; } # Check for new release on github sub check_for_updates { return if (defined $no_update_check); my $io = run_lncRNApipe('version'); $io->warning('Checking for updates on github.com/biocoder ...', 'INFO!!'); my $newest_release = $io->execute_get_sys_cmd_output("curl -Lks https://api.github.com/repos/biocoder/Perl-for-Bioinformatics/releases | grep 'tag_name' | head -n 1"); my $newest_release_date = $io->execute_get_sys_cmd_output("curl -Lks https://api.github.com/repos/biocoder/Perl-for-Bioinformatics/releases | grep 'created_at' | head -n 1"); if ($newest_release =~ m/^could not/i) { $io->warning('Unable to contact github.com for updates ... May be we have reached the X-rate-limit! :-(' . "\nUse --d option if you wish to disable update check."); return; } $newest_release =~ s/\"|\,|\v//g; $newest_release_date =~ s/\"|\,|\v|Z//g; $newest_release_date =~ s/T/ /; ($newest_release) = ($newest_release =~ m/.+?name.+?(\d.*)/); ($newest_release_date) = ($newest_release_date =~ m/\:(.+)/); $newest_release_date = $io->strip_leading_and_trailing_spaces($newest_release_date); my $is_upToDate = $io->check_sys_level_cmds(['lncRNApipe'], ["$newest_release"], 'warn'); $io->warning('You are using the latest version of the software which is v' . $newest_release . " [ Release date: $newest_release_date ]." . "\nUse --d option if you wish to disable update check.", 'OK!') if (!$is_upToDate); chomp(my $last_commit_msg = $io->execute_get_sys_cmd_output("curl -Lks https://api.github.com/repos/biocoder/Perl-for-Bioinformatics/commits?per_page=1 | grep 'message'")); chomp(my $last_commit_date = $io->execute_get_sys_cmd_output("curl -Lks https://api.github.com/repos/biocoder/Perl-for-Bioinformatics/commits?per_page=1 | grep 'date' | head -n 1")); if ($last_commit_msg =~ m/^could not/i) { $io->warning('Unable to contact github.com for latest commit message ... May be we have reached the X-rate-limit! :-(' . "\nUse --d option if you wish to disable update check."); return; } $last_commit_msg =~ s/^\s+|\"|\,$//g; $last_commit_msg =~ s/message:\s*/Last Commit: /; $last_commit_msg .= '.' if ($last_commit_msg !~ m/\.$/); $last_commit_date =~ s/\"|\,|\v|Z//g; $last_commit_date =~ s/T/ /g; ($last_commit_date) = ($last_commit_date =~ m/\:(.+)/); $last_commit_date = $io->strip_leading_and_trailing_spaces($last_commit_date); if ($VERSION !~ m/$newest_release/) { $io->warning($last_commit_msg . 
"\nVersion: v$newest_release\nCommit date: $last_commit_date", 'NEW!'); } else { $io->warning($last_commit_msg, 'COMMIT'); } return; } # Get lncRNApipe pipeline status code after child exits sub get_pipeline_status_code { my $cpu = shift; return $cpu->run_on_finish( sub { my ($pid, $exit_code, $p_identifier, $exit_sig, $core_dump, $stat) = @_; return $stat->{'lncRNApipe_succ'}; }); } # Show users ENSEMBL species list. sub list { my $io = shift; my $release_only = shift; $io->error('Currently, only ENSEMBL listing is supported.' . "\nGo to https://genome.ucsc.edu/FAQ/FAQreleases.html#TOP to view UCSC version names.") if (defined $sp_list && $sp_list !~ m/^ENSEMBL$/i); $io->c_time('Finding out latest release version at ENSEMBL...'); chomp(my $latest_release = $io->execute_get_sys_cmd_output("curl -Lks ftp://ftp.ensembl.org/pub/ -l | grep release | sort -nr -k9 | head -n 1")); return $latest_release if ($release_only); $io->c_time('Latest ENSEMBL release is: ' . $latest_release . "\n\nSpecies List for $latest_release:\n=================================="); $io->execute_system_command("curl -Lks ftp://ftp.ensembl.org/pub/$latest_release/gtf/ -l"); print "\n"; return; } # Run the assembly stage of the pipeline. sub run_lncRNApipe_from_conf { my $io = shift; my $params = shift; my $clock = shift; my $beverb = "\n\nPlease check your config file [ " . $io->file_basename($pipe_config, 'suffix') . ' ] and provide correct value(s).'; my $scheduler_job_depends = my $job_deps_str_lncRNApipe_post_assembly = ''; my $cufflinks_after_assembly_check = my $check_these_log_files = my $cuffdiff_bam_files = ''; my $autogen_job_scripts_timestamp = localtime; $autogen_job_scripts_timestamp =~ s/(\s+|\:)/\_/g; my $job_scripts_dir_name = 'job_scripts.' . $autogen_job_scripts_timestamp; # Verify minimum required params. verify_config_file($io, $beverb, $params->{'overwriteOutputDir'}, 'overwriteOutputDir', 'yes|no'); verify_config_file($io, $beverb, $params->{'performTranscriptAssembly'}, 'performTranscriptAssembly', 'yes|no'); verify_config_file($io, $beverb, $params->{'sourceDB'}, 'sourceDB', 'ensembl|ucsc|custom'); verify_config_file($io, $beverb, $params->{'scheduler'}, 'scheduler', 'pbs|sge|lsf|slurm|none'); verify_config_file($io, $beverb, $params->{'runRNAfold'}, 'runRNAfold', 'yes|no'); verify_config_file($io, $beverb, $params->{'runcmscan'}, 'runcmscan', 'yes|no'); verify_config_file($io, $beverb, $params->{'batchSubCmd'}, 'batchSubCmd', 'qsub|bsub|sbatch|bash') if (defined $params->{'scheduler'}); verify_config_file($io, $beverb, $params->{'runCuffdiffForAllTranscripts'}, 'runCuffdiffForAllTranscripts', 'yes|no'); if ($params->{'scheduler'} =~ m/^SGE$/i) { # -hold_jid jobid1,jobid2,jobid3 $scheduler_job_depends = '${JOB_ID}'; $params->{'killCmd'} = 'qdel'; } elsif ($params->{'scheduler'} =~ m/^PBS$/i) { # -W depend=afterok:jobid1:jobid2:jobid3 $scheduler_job_depends = '${PBS_JOBID}'; $params->{'killCmd'} = 'qdel'; } elsif ($params->{'scheduler'} =~ m/^SLURM$/i) { # -W depend=afterok:jobid1:jobid2:jobid3 $scheduler_job_depends = '${SLURM_JOBID}'; $params->{'killCmd'} = 'scancel'; } elsif ($params->{'scheduler'} =~ m/^LSF$/i) { # -w "done(jobid1) && done(jobid2) && done(jobid3)" $scheduler_job_depends = '${LSB_JOBID}'; $params->{'killCmd'} = 'qdel'; } else { $params->{'batchSubCmd'} = 'bash'; $params->{'killCmd'} = 'kill -9'; } # Kill jobs and exit if requested. 
if (defined $kill_jobs_thru_conf) { kill_jobs_and_exit($kill_jobs_thru_conf, $io, $params); } if (defined $params->{'overwriteOutputDir'}) { $io->c_time('OK. Removing output directory [ '. $io->file_basename($params->{'outputDir'}). ' ] ...'), my $rm_output_dir = $io->execute_get_sys_cmd_output("rm -rf $params->{'outputDir'}") if (defined $params->{'outputDir'} && -d $params->{'outputDir'} && $params->{'overwriteOutputDir'} =~ /^YES$/i); $io->error("Error occurred while force deleting directory:\n" . $rm_output_dir) if ($rm_output_dir && $rm_output_dir !~ m/could not capture/i); mkdirs($io, [$params->{'outputDir'}]); } #elsif (defined $params->{'outputDir'} || -d $params->{'outputDir'}) { #$io->error("Output directory [ $params->{'outputDir'} ] exists.\n" . # "Use [overwriteOutputDir: YES] to overwrite output directory." . $beverb); #} $io->error('No information regarding reference annotation provided. [sourceDB: ?], [species: ?], [useThisAnnotation: ?]' . $beverb) if ( (!defined $params->{'sourceDB'} || !defined $params->{'species'}) && !defined $params->{'useThisAnnotation'}); $io->error('Annotation file [ ' . $params->{'useThisAnnotation'} . ' ] does not exist or is empty.' . $beverb) if (defined $params->{'useThisAnnotation'} && ( !-e $params->{'useThisAnnotation'} || -z $params->{'useThisAnnotation'}) ); $io->error("[genomeFasta: ] not mentioned OR\n" . "cannot find genome FASTA file OR\nthe genome FASTA file is empty." . $beverb) if (!defined $params->{'genomeFasta'} || !-e $params->{'genomeFasta'} || !-s $params->{'genomeFasta'}); $io->error("[categorize_ncRNAs: ] options not mentioned.\nIt is required in either case of [performTranscriptAssembly: YES]" . " OR [performTranscriptAssembly: NO]." . $beverb) if (!defined $params->{'categorize_ncRNAs'}); $io->error('-sample-names option is required for [categorize_ncRNAs: ].' . $beverb) if (!is_option_exists_in_anon($params->{'categorize_ncRNAs'}, 'sample')); $io->error('-annot option is not required for [categorize_ncRNAs: ].' . $beverb) if (is_option_exists_in_anon($params->{'categorize_ncRNAs'}, '-annot')); $io->error('-cuff option is not required for [categorize_ncRNAs: ].' . $beverb) if (is_option_exists_in_anon($params->{'categorize_ncRNAs'}, '-cuff')); $io->error('-extract-pat option is not required for [categorize_ncRNAs: ].' . $beverb) if (is_option_exists_in_anon($params->{'categorize_ncRNAs'}, '-extract')); $io->error('-L option is not required for [cufflinks: ].' . $beverb) if (is_option_exists_in_anon($params->{'cufflinks'}, '-L')); $io->error('-sf and -sff options are required in [get_unique_features: ] when [sourceDB: ] is "CUSTOM".') if (is_option_exists_in_anon([$params->{'sourceDB'}], 'CUSTOM')) && ( !is_option_exists_in_anon($params->{'get_unique_features'}, '-sff\s+') || !is_option_exists_in_anon($params->{'get_unique_features'}, '-sf\s+') ); $io->error('In [get_unique_features: ] options, "-sff" is required when you are specifying a list of known ncRNAs.' . $beverb) if (defined $params->{'get_unique_features'} && !is_option_exists_in_anon($params->{'get_unique_features'}, '-sff\s+') && is_option_exists_in_anon($params->{'get_unique_features'}, '-sf\s+')); #$io->error('In [cuffdiff: ] options, "-b" is not required. We got that from [genomeFasta: ].' . $beverb) #if (defined $params->{'cuffdiff'} && is_option_exists_in_anon($params->{'cuffdiff'}, '-b\s+')); #$io->error('In [cufflinks: ] options, "-b" is not required. We got that from [genomeFasta: ].' . 
$beverb) #if (defined $params->{'cufflinks'} && is_option_exists_in_anon($params->{'cufflinks'}, '-b\s+')); $params->{'species'} = 'species' if (!defined $params->{'species'}); # Necessary evil my $tophat_o_dir = $params->{'outputDir'} . '/tophat'; my $cufflinks_o_dir = $params->{'outputDir'} . '/cufflinks'; my $trim_o_dir = $params->{'outputDir'} . '/trimmomatic'; my $job_scripts_dir = $params->{'outputDir'} . "/$job_scripts_dir_name"; my $ref_annot_dir = $params->{'outputDir'} . '/ref_annotation'; my $cuffdiff_known = $params->{'outputDir'} . '/cuffdiff_known_ncRNAs'; my $cuffdiff_novel = $params->{'outputDir'} . '/cuffdiff_novel_ncRNAs'; my $cuffmerge_known = $params->{'outputDir'} . '/cuffmerge_known_ncRNAs'; my $cuffmerge_novel = $params->{'outputDir'} . '/cuffmerge_novel_ncRNAs'; $io->c_time('Preparing output directories...'); mkdirs($io, [$job_scripts_dir, $cuffdiff_novel, $cuffdiff_known, $cuffmerge_known, $cuffmerge_novel, $ref_annot_dir]); # Create runID if ( !-e "$params->{'outputDir'}/.lncRNApipe.runID" || !is_job_done("$params->{'outputDir'}/.lncRNApipe.runID", $io, 'get_runID', 0) ) { # Generate run id require Data::Uniqid; Data::Uniqid->import( qw( uniqid ) ); $conf_run_id = Data::Uniqid->uniqid; $io->execute_system_command("echo \"Run=$conf_run_id\" > $params->{'outputDir'}/.lncRNApipe.runID"); } unlink "$job_scripts_dir/job_IDs_tophat.$conf_run_id" if (-e "$job_scripts_dir/job_IDs_tophat.$conf_run_id"); unlink "$job_scripts_dir/job_IDs_cufflinks.$conf_run_id" if (-e "$job_scripts_dir/job_IDs_cufflinks.$conf_run_id"); $io->execute_system_command("touch $job_scripts_dir/job_IDs_tophat.$conf_run_id"); $io->execute_system_command("touch $job_scripts_dir/job_IDs_cufflinks.$conf_run_id"); my $dl_status = my $post_dl_status = 'could not capture'; # Assume the download went fine. It will fail otherwise. my $nc_annot_gtf = "$ref_annot_dir/$params->{'species'}_$params->{'sourceDB'}_known_ncRNAs.gtf"; my $annot_gtf = "$ref_annot_dir/$params->{'species'}_$params->{'sourceDB'}.gtf"; if (!defined $params->{'useThisAnnotation'}) { $io->c_time('Checking for either wget or curl...'); $io->execute_system_command("rm $annot_gtf") if (-e $annot_gtf); if ($params->{'sourceDB'} =~ m/ensembl/i) { my $release = list($io, 'release'); my $dl_util = setup_lncRNApipe('get_dl_UTIL'); print "\n"; $io->c_time('Downloading annotation from ' . $params->{'sourceDB'} . '...'); if ($dl_util =~ m/curl/i) { chomp(my $file = $io->execute_get_sys_cmd_output("curl -s ftp://ftp.ensembl.org/pub/$release/gtf/$params->{'species'}/ | grep -e '^-' | awk '{ print \$9 }' | grep -i gtf")); $dl_util =~ s/O/ -o $ref_annot_dir\/$params->{'species'}_$params->{'sourceDB'}\.gtf\.gz/; $dl_status = $io->execute_get_sys_cmd_output("$dl_util ftp://ftp.ensembl.org/pub/$release/gtf/$params->{'species'}/$file > /dev/null 2>&1"); } else { $dl_util .= "-O $ref_annot_dir/$params->{'species'}_$params->{'sourceDB'}.gtf.gz" if ($dl_util =~ m/wget/i); $dl_status = $io->execute_get_sys_cmd_output("$dl_util ftp://ftp.ensembl.org/pub/$release/gtf/$params->{'species'}/*gtf.gz > /dev/null 2>&1"); } $dl_status = $io->execute_get_sys_cmd_output("gunzip $ref_annot_dir/$params->{'species'}_$params->{'sourceDB'}.gtf.gz"); $post_dl_status = $io->execute_get_sys_cmd_output("rm $ref_annot_dir/$params->{'species'}_$params->{'sourceDB'}.gtf.gz") if (-e "$ref_annot_dir/$params->{'species'}_$params->{'sourceDB'}.gtf.gz"); } elsif ($params->{'sourceDB'} =~ m/ucsc/i) { $io->c_time('Downloading annotation from ' . $params->{'sourceDB'} . 
'...'); $dl_status = $io->execute_get_sys_cmd_output("$deps->{'bin-genePredToGtf'} $params->{'species'} refGene $ref_annot_dir/$params->{'species'}_$params->{'sourceDB'}.gtf"); } } elsif(defined $params->{'useThisAnnotation'}) { $annot_gtf = $params->{'useThisAnnotation'}; } # Create known ncRNAs. if ($params->{'sourceDB'} =~ m/ensembl/i) { $io->c_time('Creating a GTF file with gene_biotype which is not "protein_coding"...'); $post_dl_status = $io->execute_get_sys_cmd_output("grep -vi 'protein_coding' $annot_gtf > $nc_annot_gtf"); } elsif ($params->{'sourceDB'} =~ m/ucsc/i) { $io->c_time('Creating a GTF file with transcript_id other than "NM_"...'); $post_dl_status = $io->execute_get_sys_cmd_output("grep -vi 'NM_' $annot_gtf > $nc_annot_gtf"); } # Figure out rest of the lncRNApipe options and finalize command calls. # get_unique_features.pl options my $thru_conf_get_uq_feat_opts_n = "'-sf $annot_gtf -sff gtf'"; $thru_conf_get_uq_feat_opts_n = "'" . join(' ', @{$params->{'get_unique_features'}}) . "'" if (defined $params->{'get_unique_features'} && is_option_exists_in_anon($params->{'get_unique_features'}, '-sf\s+|-source\s+')); $thru_conf_get_uq_feat_opts_n = "'-sf $annot_gtf -sff gtf " . join(' ', @{$params->{'get_unique_features'}}) . "'" if (defined $params->{'get_unique_features'} && !is_option_exists_in_anon($params->{'get_unique_features'}, '-sf\s+|-source\s+')); my $thru_conf_get_uq_feat_opts_k = "'-sf $nc_annot_gtf -sff gtf --known -ov 10 -extract'"; if (defined $params->{'get_unique_features'} && is_option_exists_in_anon($params->{'get_unique_features'}, '-sf\s+|-source\s+')) { $thru_conf_get_uq_feat_opts_k = "'--known -extract " . join(' ', @{$params->{'get_unique_features'}}) . "'"; $nc_annot_gtf = $io->strip_leading_and_trailing_spaces(is_option_exists_in_anon($params->{'get_unique_features'}, '(?:-sf\s+|-source\s+)(.+)', 'EXTRACT')); } $thru_conf_get_uq_feat_opts_k = "'-sf $nc_annot_gtf -sff gtf --known -extract " . join(' ', @{$params->{'get_unique_features'}}) . "'" if (defined $params->{'get_unique_features'} && !is_option_exists_in_anon($params->{'get_unique_features'}, '-sf\s+|-source\s+')); # RNAfold options. my $thru_conf_rna_fold_opts = '--rna'; $io->error('In [RNAfold: ] options, "-p" or "--noPS" are not required.' . $beverb) if ($params->{'runRNAfold'} =~ m/^yes$/i && is_option_exists_in_anon($params->{'RNAfold'}, 'p|noPS')); $thru_conf_rna_fold_opts = "--rna '" . join(' ', @{$params->{'RNAfold'}}) . "'" if ($params->{'runRNAfold'} =~ m/^yes$/i && defined $params->{'RNAfold'}); # cmscan options my $thru_conf_inf_opts = '--inf'; $io->error('In [cmscan: ] options, "--cpu" or "--tblout" or "-o" are not required.' . $beverb) if ($params->{'runcmscan'} =~ m/^yes$/i && is_option_exists_in_anon($params->{'cmscan'}, 'o|cpu|tblout')); $thru_conf_inf_opts = $thru_conf_inf_opts . " '" . join(' ', @{$params->{'cmscan'}}) . "'" if ($params->{'runcmscan'} =~ m/^yes$/i && defined $params->{'cmscan'}); # Check for annotation download errors. This code can be usefuly if I want to check any other downloads # in the future. print "\n", $io->error("Download quit with the following error:\n\n$dl_status" . $beverb) if ($dl_status !~ m/could not capture/i); print "\n", $io->error("Error while trying to create requested annotation files:\n\n$post_dl_status" . 
$beverb) if ($post_dl_status !~ m/could not capture/i); my $job_submission_rudimentary = 0; $job_submission_rudimentary = 'jobID_is_rudimentary' if (defined $donot_wait4jobIDs); if ($params->{'performTranscriptAssembly'} =~ m/^YES$/i) { # If users asks to perform transcript assembly, check min required options. # We already know there are replicates by presense of a , # verify_config_file($io, $beverb, $params->{'replicates'}, 'replicates', 'yes|no'); verify_config_file($io, $beverb, $params->{'libType'}, 'libType', 'se|pe|mixed'); verify_config_file($io, $beverb, $params->{'readType'}, 'readType', 'FASTA|FASTQ'); $io->warning('Will not trim reads. [readType: FASTA]') if ($params->{readType} =~ m/FASTA/i && defined $params->{'trimmomatic'}); $io->error('Invalid / No value provided for [bowtieGenomeIndex: ?]' . $beverb) if (!defined $params->{'bowtieGenomeIndex'}); $io->error("[performTranscriptAssembly: YES] requested, but cannot find any reads' options." . $beverb) if (!defined $params->{'reads'}->{'r1'} && !defined $params->{'reads'}->{'r2'}); $io->warning('[performTranscriptAssembly: YES] requested, but found [cuffcompare: ] options.'. "\n[cuffcompare: ] options are not required when you are performing transcript assembly with lncRNApipe.") if (defined $params->{'cuffcompare'}); $io->error('-bam option is not required for [cuffdiff: ] when [performTranscriptAssembly: YES].' . $beverb) if (defined $params->{'cuffdiff'} && is_option_exists_in_anon($params->{'cuffdiff'}, '-bam\s+')); mkdirs($io, [$tophat_o_dir, $cufflinks_o_dir, $trim_o_dir]); $io->c_time('Preparing job scripts...') if (defined $params->{'scheduler'}); my @r1_samples = my @r2_samples = (); if (!defined $params->{'reads'}->{'r2'}) { $io->c_time('[reads => r1: ] only mentioned. Treating it as SE reads...'); @r1_samples = split/\|/, $params->{'reads'}->{'r1'}; $params->{'reads'}->{'r2'} .= '0|' x scalar @r1_samples; # R1 is SE @r2_samples = split/\|/, $params->{'reads'}->{'r2'}; } elsif (!defined $params->{'reads'}->{'r1'}) { $io->c_time('[reads => r2: ] only mentioned. Treating it as SE reads...'); @r2_samples = split/\|/, $params->{'reads'}->{'r2'}; $params->{'reads'}->{'r1'} .= '0|' x scalar @r2_samples; # R1 is SE @r1_samples = split/\|/, $params->{'reads'}->{'r1'}; } else { $io->c_time('Detected PE / MP reads...'); @r1_samples = split/\|/, $params->{'reads'}->{'r1'}; @r2_samples = split/\|/, $params->{'reads'}->{'r2'}; } my $grand_sample_size = sample_size(\@r1_samples, \@r2_samples); $io->error('Number of samples mentioned in [categorize_ncRNAs: ] ( ' . scalar (split/\,/, is_option_exists_in_anon($params->{'categorize_ncRNAs'}, 'sample')) . ' ) is not equal to number of samples mentioned in [reads: ] ( ' . ($grand_sample_size+1) . ' ).' . $beverb) if ( ($grand_sample_size+1) != scalar (split/\,/, is_option_exists_in_anon($params->{'categorize_ncRNAs'}, 'sample')) ); # Quickly check if sample name is duplicated. 
my $categorize_ncRNAs_sample_names = {}; foreach my $categorize_ncRNAs_sample ((split/\,/, is_option_exists_in_anon($params->{'categorize_ncRNAs'}, 'sample'))) { $categorize_ncRNAs_sample =~ s/\"//g; $categorize_ncRNAs_sample = (split/\s+/, $categorize_ncRNAs_sample)[1] if ($categorize_ncRNAs_sample =~ m/\s+/); if (!exists $categorize_ncRNAs_sample_names->{$categorize_ncRNAs_sample}) { $categorize_ncRNAs_sample_names->{$categorize_ncRNAs_sample} = 1; } elsif (exists $categorize_ncRNAs_sample_names->{$categorize_ncRNAs_sample}) { $io->error("Sample name [ $categorize_ncRNAs_sample ] is duplicated in [categorize_ncRNAs: ]." . $beverb); } } $io->error("Seems like you intend to use limited samples.\nIn that case, indicate empty sample by separating it with appropriate delimiter [ | ].") if ( defined $params->{'reads'}->{'r1'} && defined $params->{'reads'}->{'r2'} && ( $#r1_samples > 0 || $#r2_samples > 0 ) && ( $#r1_samples != $#r2_samples ) && ( $params->{'reads'}->{'r1'} !~ m/^\||\|\||\|$/ || $params->{'reads'}->{'r2'} !~ m/^\||\|\||\|$/ ) && $params->{'libType'} =~ m/^mixed$/i ); my $sample_names2num = sample_nums($params->{'categorize_ncRNAs'}, $io); my $cufflinks_assembled_trs = my $all_bam_files = ''; for(my $i=0; $i<=$grand_sample_size; $i++) { my $bam_files_per_sample = ''; my $sample_name_grand = "run_$conf_run_id.sample_" . ($i+1); $sample_name_grand = "run_$conf_run_id." . $sample_names2num->{$i} if (exists $sample_names2num->{$i}); my $reps_1 = handle_read_libs($r1_samples[$i], $params->{'reads'}->{'readsDir'}, $io, $params) if ($r1_samples[$i]); my $reps_2 = handle_read_libs($r2_samples[$i], $params->{'reads'}->{'readsDir'}, $io, $params) if ($r2_samples[$i]); $io->error("Seems like you intend to use limited replicates.\nIn that case, indicate empty replicate by separating it with appropriate delimiter [ , ].") if (($#$reps_1 > 0 && $#$reps_2 > 0 ) && ($#$reps_1 != $#$reps_2) && ( $r1_samples[$i] !~ m/^,|,,|,$/ || $r2_samples[$i] !~ m/^,|,,|,$/ ) && $params->{'libType'} =~ m/^mixed$/i ); my $grand_rep_size = sample_size($reps_1, $reps_2); my $cufflinks_samp_o_dir = "$cufflinks_o_dir/$sample_name_grand"; my $tophat_samp_o_dir = "$tophat_o_dir/$sample_name_grand"; mkdirs($io, [$cufflinks_samp_o_dir, $tophat_samp_o_dir]); for (my $j=0; $j<=$grand_rep_size; $j++) { my $all_reps_1 = my $all_reps_2 = ''; my $rep_num = $j+1; my $rep_num2name = "_rep_$rep_num"; $all_reps_1 = $reps_1->[$j] if ($reps_1->[$j]); $all_reps_2 = $reps_2->[$j] if ($reps_2->[$j]); my $all_reps_1_base = $io->file_basename($all_reps_1); my $all_reps_2_base = $io->file_basename($all_reps_2); my $sample_name = "run_$conf_run_id.sample_" . ($i+1) . $rep_num2name; $sample_name = "run_$conf_run_id." . $sample_names2num->{$i} . $rep_num2name if (exists $sample_names2num->{$i}); my $tophat_rep_o_dir = "$tophat_o_dir/$sample_name"; my $trim_rep_o_dir = "$trim_o_dir/$sample_name"; mkdirs($io, [$tophat_rep_o_dir]); my $trim_cmd = ''; if ($params->{'readType'} =~ m/^FASTQ$/i && defined $params->{'trimmomatic'} && scalar @{$params->{'trimmomatic'}} > 1) { mkdirs($io, [$trim_rep_o_dir]); $trim_cmd = "\n\n#\n# Trimmomatic Run\n#\n#\n"; my $trimmomatic_dir = dirname(abs_path($deps->{'trimmomatic'})); $trimmomatic_dir .= '/adapters/'; my $trim_params = join(" ", @{$params->{'trimmomatic'}}); $trim_params =~ s/\:(.+?\.fa.*?\:)/\:$trimmomatic_dir$1/ if ($trim_params !~ m/\/.+?\.fa/); if ($all_reps_2 && $all_reps_1) { $trim_cmd .= "java -jar $deps->{'trimmomatic'} PE -threads $params->{'CPUs'} $all_reps_1 $all_reps_2 ". 
"$trim_rep_o_dir/$all_reps_1_base.filtered.fq " . "$trim_rep_o_dir/$all_reps_1_base.unpaired.fq " . "$trim_rep_o_dir/$all_reps_2_base.filtered.fq " . "$trim_rep_o_dir/$all_reps_2_base.unpaired.fq " . $trim_params . " &> $trim_rep_o_dir/run.log"; $all_reps_1 = "$trim_rep_o_dir/$all_reps_1_base.filtered.fq"; $all_reps_2 = "$trim_rep_o_dir/$all_reps_2_base.filtered.fq"; } elsif ($all_reps_1) { $trim_cmd .= "java -jar $deps->{'trimmomatic'} SE -threads $params->{'CPUs'} $all_reps_1 ". "$trim_rep_o_dir/$all_reps_1_base.filtered.fq " . $trim_params . " &> $trim_rep_o_dir/run.log"; $all_reps_1 = "$trim_rep_o_dir/$all_reps_1_base.filtered.fq"; } elsif ($all_reps_2) { $trim_cmd .= "java -jar $deps->{'trimmomatic'} SE -threads $params->{'CPUs'} $all_reps_2 ". "$trim_rep_o_dir/$all_reps_2_base.filtered.fq " . $trim_params . " &> $trim_rep_o_dir/run.log"; $all_reps_1 = "$trim_rep_o_dir/$all_reps_2_base.filtered.fq"; } $trim_cmd .= add_exit_check("$trim_rep_o_dir/run.log", "$trim_o_dir/status.log", 'completed successfully', 'isTrimRunComplete', 'Trimmomatic', '', '>>', $sample_name); #my $is_trim_job_done = is_job_done("$trim_o_dir/status.log", $io, 0, $sample_name); $trim_cmd = "\n\n# Trimmomatic run has passed. Will resume from tophat ...\n" if (is_job_done("$trim_o_dir/status.log", $io, 0, $sample_name)); #print "\n$trim_cmd\n\n"; $check_these_log_files .= "$trim_rep_o_dir/run.log\n"; } my $tophat_cmd = "\n\n#\n# Tophat Run\n#\n#\ntophat " . join(' ', @{$params->{'tophat'}}) . " -o $tophat_rep_o_dir" . " -p $params->{'CPUs'}" . " $params->{'bowtieGenomeIndex'} $all_reps_1 $all_reps_2" . "&> $tophat_rep_o_dir/run.log" . add_exit_check("$tophat_rep_o_dir/run.log", "$tophat_o_dir/status.log", 'run complete', 'isTophatRunComplete', 'Tophat', '', '>>', $sample_name); $bam_files_per_sample .= "$tophat_rep_o_dir/accepted_hits.bam "; $all_bam_files .= "$tophat_rep_o_dir/accepted_hits.bam "; $check_these_log_files .= "$tophat_rep_o_dir/run.log\n"; $scheduler_job_depends = "'$sample_name'" if ($params->{'scheduler'} =~ m/^none$/i); #my $is_tophat_job_done = is_job_done("$tophat_o_dir/status.log", $io, 0, $sample_name); $tophat_cmd = "\n\n# Tophat run has passed. Will resume from cufflinks ...\n" if (is_job_done("$tophat_o_dir/status.log", $io, 0, $sample_name)); #print "\n$tophat_cmd\n"; write_and_submit_job("lncRNApipe_alignment_stage_$sample_name" . ".sh", 'NO_DEPENDENCY_CHAIN', "job_IDs_tophat.$conf_run_id", $scheduler_job_depends, $job_scripts_dir, $params, $io, [$trim_cmd, $tophat_cmd], $job_submission_rudimentary); } #chop $all_bam_files if ($all_bam_files =~ m/\s$/); # Remove last space; #chop $bam_files_per_sample if ($bam_files_per_sample =~ m/\s$/); # Remove last space; #chomp(my $all_bam_files = $io->execute_get_sys_cmd_output("find $tophat_o_dir -type f -name 'accepted_hits.bam' | sed ':a;N;\$!ba;s/\\n/ /g'")); #$io->error('Cannot find BAM file(s). Did Tophat run fail?' . #"\nPlease check test log files:\n\n$check_these_log_files") #if ($bam_files_per_sample =~ m/could not capture/i); my $sambamba_cmd = my $cufflinks_input_bam_str = my $cufflinks_cmd = ''; my $bam_num = scalar (split /\s+/, $all_bam_files); my $bam_num_sample = scalar (split /\s+/, $bam_files_per_sample); #print "Bam num: $bam_num\t$all_bam_files\n"; my $job_deps_str = get_job_dependency_chain("$job_scripts_dir/job_IDs_tophat.$conf_run_id", $io, $params, "$bam_num:" . ($bam_num_sample-1)); $sambamba_cmd = "\n\n# No scheduler involved. 
Wait on PIDs.\n$job_deps_str" if ($params->{'scheduler'} =~ m/^none$/i); if ($bam_num_sample > 1) { #if ( $bam_num != $READ_NUM ) { # $io->warning("Inequal number of BAM files ($bam_num) and number of read files (" . # $READ_NUM . ") mentioned in [reads: ] $beverb", 'ERROR!'); # $params->{'qdel'} = 1; #} $sambamba_cmd .= "\n\n#\n# Sambamba Run\n#\n#\n$deps->{'sambamba'} merge" . " -t $params->{'CPUs'} $tophat_samp_o_dir/accepted_hits.merged.bam $bam_files_per_sample" . " &> $cufflinks_samp_o_dir/run.log"; $cufflinks_input_bam_str = "$tophat_samp_o_dir/accepted_hits.merged.bam"; } else { $cufflinks_input_bam_str = $bam_files_per_sample; } $cufflinks_cmd .= "\n\n#\n# Cufflinks Run\n#\n#\ncufflinks ". join(' ', @{$params->{'cufflinks'}}) . " -o $cufflinks_samp_o_dir" . " -L $sample_name_grand" . " -p $params->{'CPUs'}" . " -g $annot_gtf $cufflinks_input_bam_str >> $cufflinks_samp_o_dir/run.log 2>&1" . add_exit_check("$cufflinks_samp_o_dir/run.log", "$cufflinks_o_dir/status.log", 'processed.+?loci', 'isCufflinksRunComplete', 'Cufflinks', '', '>>', $sample_name_grand); $cufflinks_assembled_trs .= "$cufflinks_samp_o_dir/transcripts.gtf "; $check_these_log_files .= "$cufflinks_samp_o_dir/run.log\n"; $scheduler_job_depends = "'$sample_name_grand'" if ($params->{'scheduler'} =~ m/^none$/i); $cuffdiff_bam_files .= join(',', (split/\s+/, $bam_files_per_sample)) . ' '; #my $is_cufflinks_job_done = is_job_done("$cufflinks_o_dir/status.log", $io, 0, $sample_name_grand); if (is_job_done("$cufflinks_o_dir/status.log", $io, 0, $sample_name_grand)) { $sambamba_cmd = "\n\n# Cufflinks run has passed. No need to merge replicates. Will resume from cuffcompare ..."; $cufflinks_cmd = "\n# Cufflinks run has passed. No need to redo transcript assembly. Will resume from cuffcompare ..."; } write_and_submit_job("lncRNApipe_assembly_stage_$sample_name_grand" . ".sh", $job_deps_str, "job_IDs_cufflinks.$conf_run_id", $scheduler_job_depends, $job_scripts_dir, $params, $io, [$sambamba_cmd, $cufflinks_cmd], 'jobID_is_rudimentary'); #print "$sambamba_cmd\n$cufflinks_cmd\n"; } my $cufflinks_tr_num = scalar (split /\s+/, $cufflinks_assembled_trs); $job_deps_str_lncRNApipe_post_assembly = get_job_dependency_chain("$job_scripts_dir/job_IDs_cufflinks.$conf_run_id", $io, $params, $cufflinks_tr_num, "$cufflinks_o_dir/status.log") if ($cufflinks_tr_num); $cufflinks_after_assembly_check = add_exit_check("$cufflinks_o_dir/status.log", "$cufflinks_o_dir/lncRNApipe_afterCufflinks.status.log", 'passed', 'isCufflinksRunComplete', 'Cufflinks assembly'); $params->{'cuffcompare'} = [$cufflinks_assembled_trs]; } elsif ($params->{'performTranscriptAssembly'} =~ m/^NO$/i) { $io->c_time('OK. Skipping transcript assembly...'); # Check cuffcompare options. $io->error('[cuffcompare: ] options not provided.' . "\nWhen [performTranscriptAssembly: NO], we need to know about your assembled transcripts in GTF format." . $beverb) if (!defined $params->{'cuffcompare'}); $io->error('In [cuffcompare: ] options, please provide your assembled transcripts as "-i full_unix_path_to_your_assembly_list.txt".' . $beverb) if (!is_option_exists_in_anon($params->{'cuffcompare'}, '-i\s+')); $io->error('Only "-i full_unix_path_to_your_assembly_list.txt" is allowed for [cuffcompare: ] when [performTranscriptAssembly: NO].'. $beverb) if ( defined $params->{'cuffcompare'} && is_option_exists_in_anon($params->{'cuffcompare'}, '-i\s+', 'NOT')); $io->error('In [cuffdiff: ] options, "-L" is not required. We got that from [categorize_ncRNAs: ].' . 
$beverb) if (defined $params->{'cuffdiff'} && is_option_exists_in_anon($params->{'cuffdiff'}, '-L\s+')); $io->error('In [cuffdiff: ] options, "-bam" is required when [[performTranscriptAssembly: NO ].' . $beverb) if (defined $params->{'cuffdiff'} && !is_option_exists_in_anon($params->{'cuffdiff'}, '-bam\s+')); $cuffdiff_bam_files = is_option_exists_in_anon($params->{'cuffdiff'}, '-bam\s+'); $cuffdiff_bam_files =~ s/\-bam//; if (defined $params->{'cuffdiff'}) { for (0 .. $#{$params->{'cuffdiff'}}) { delete @{$params->{'cuffdiff'}}[$_] if (@{$params->{'cuffdiff'}}[$_] =~ m/-bam\s+/); } } } #chop $cufflinks_assembled_trs if ($cufflinks_assembled_trs =~ m/\s$/); # Remove last space; my $lncRNApipe_opts = "\n\n# No scheduler involved. Wait on PIDs.\n$job_deps_str_lncRNApipe_post_assembly" if ($params->{'scheduler'} =~ m/^none$/i && $job_deps_str_lncRNApipe_post_assembly); if ($cufflinks_after_assembly_check && $job_deps_str_lncRNApipe_post_assembly) { $lncRNApipe_opts = $lncRNApipe_opts . $cufflinks_after_assembly_check; } else { $job_deps_str_lncRNApipe_post_assembly = 'NO_DEPENDENCY_CHAIN'; } $lncRNApipe_opts .= qq { # # lncRNApipe post assembly run. This is where we identify any possible novel ncRNAs / lncRNAs. # # perl $USER_HOME/lncRNApipe --run $params->{'outputDir'} --cpu $params->{'CPUs'} --cuffcompare '-r $annot_gtf -s $params->{'genomeFasta'} } . join(' ', @{$params->{'cuffcompare'}}) . "' --cat-ncRNAs '" . join(' ', @{$params->{'categorize_ncRNAs'}}) . "' --get $thru_conf_get_uq_feat_opts_n --fetch --cpc $thru_conf_inf_opts $thru_conf_rna_fold_opts > $params->{'outputDir'}/lncRNApipe.postAssembly.run_$conf_run_id.log 2>&1"; my $cat_opts_for_get_known_ncRNAs = "--cuffcompare '-r $nc_annot_gtf -s " . $params->{'genomeFasta'} . ' ' . join(' ', @{$params->{'cuffcompare'}}) . "' --cat-ncRNAs '" . join(' ', @{$params->{'categorize_ncRNAs'}}) . " --known-ncRNAs -rescue --extract-pat \"=\"'"; $lncRNApipe_opts .= qq { # # lncRNApipe post assembly run. This is where we identify any possible known ncRNAs / lncRNAs. # # perl $USER_HOME/lncRNApipe --run $cuffdiff_known --cpu $params->{'CPUs'} $cat_opts_for_get_known_ncRNAs --fetch --skip-cpc $thru_conf_rna_fold_opts $thru_conf_inf_opts >> $params->{'outputDir'}/lncRNApipe.postAssembly.run_$conf_run_id.log 2>&1}; $lncRNApipe_opts .= add_exit_check("$params->{'outputDir'}/lncRNApipe.postAssembly.run_$conf_run_id.log", "$params->{'outputDir'}/lncRNApipe.postAssembly.status.log", 'aborted|bailing', 'islncRNApipePostAssemblyRunComplete', 'Post assembly pipeline', '! -z'); $check_these_log_files .= "$params->{'outputDir'}/lncRNApipe.postAssembly.run_$conf_run_id.log\n"; #print "\n\n$lncRNApipe_post_assembly_cmd\n\n"; $io->error('Only "--min-isoform-fraction" is allowed for [cuffmerge: ]') if ( defined $params->{'cuffmerge'} && is_option_exists_in_anon($params->{'cuffmerge'}, '--min-isoform-fraction\s+', 'NOT') ); my $cuffmerge_cmd = my $cuffdiff_cmd = ''; if (defined $params->{'cuffmerge'} || defined $params->{'cuffdiff'}) { my $cuffmerge_opts = join(' ', @{$params->{'cuffmerge'}}); $cuffmerge_cmd .= qq { # # Merge all known and novel ncRNAs. # # # # Known ncRNAs. # find $cuffdiff_known/lncRNApipe.final -type f -name '*.final.gtf' > $cuffmerge_known/lncRNApipe_known_ncRNAs_tr_list.txt cuffmerge $cuffmerge_opts -p $params->{'CPUs'} -o $cuffmerge_known -g $nc_annot_gtf -s $params->{'genomeFasta'} $cuffmerge_known/lncRNApipe_known_ncRNAs_tr_list.txt > $cuffmerge_known/cuffmerge_known_ncRNAs.run_$conf_run_id.log 2>&1 # # Novel ncRNAs. 
# find $params->{'outputDir'}/lncRNApipe.final -type f -name '*.final.gtf' > $cuffmerge_novel/lncRNApipe_novel_ncRNAs_tr_list.txt cuffmerge $cuffmerge_opts -p $params->{'CPUs'} -o $cuffmerge_novel $cuffmerge_novel/lncRNApipe_novel_ncRNAs_tr_list.txt >> $cuffmerge_novel/cuffmerge_novel_ncRNAs.run_$conf_run_id.log 2>&1}; #$cuffmerge_cmd .= add_exit_check("$params->{'outputDir'}/cuffmerge.combined.run.log", # "$params->{'outputDir'}/cuffmerge.combined.status.log", # 'aborted|error', # 'isCuffmergeRunComplete', # 'Cuffmerge'); $check_these_log_files .= "$cuffmerge_known/cuffmerge_known_ncRNAs.run_$conf_run_id.log\n" . "$cuffmerge_novel/cuffmerge_novel_ncRNAs.run_$conf_run_id.log\n"; } if (defined $params->{'cuffdiff'}) { my $cuffdiff_opts = join(' ', @{$params->{'cuffdiff'}}); my $sample_names = is_option_exists_in_anon($params->{'categorize_ncRNAs'}, 'sample'); ($sample_names) = ($sample_names =~ m/.+\s+(.+)/); $cuffdiff_cmd .= qq { # # Find differentially expressed known and novel ncRNAs. # # # # Differentially expressed known ncRNAs. # cuffdiff -p $params->{'CPUs'} -o $cuffdiff_known -L $sample_names $cuffdiff_opts $cuffmerge_known/merged.gtf $cuffdiff_bam_files> $cuffdiff_known/cuffdiff_on_known_ncRNAs.run_$conf_run_id.log 2>&1 # # Differentially expressed novel ncRNAs. # cuffdiff -p $params->{'CPUs'} -o $cuffdiff_novel -L $sample_names $cuffdiff_opts $cuffmerge_novel/merged.gtf $cuffdiff_bam_files>> $cuffdiff_novel/cuffdiff_on_novel_ncRNAs.run_$conf_run_id.log 2>&1}; $cuffdiff_cmd .= add_exit_check("$cuffdiff_known/cuffdiff_on_known_ncRNAs.run_$conf_run_id.log", "$cuffdiff_known/status.log", 'writing run info', 'isCuffdiffKnownRunComplete', 'Cuffdiff on known ncRNAs'); $cuffdiff_cmd .= add_exit_check("$cuffdiff_novel/cuffdiff_on_novel_ncRNAs.run_$conf_run_id.log", "$cuffdiff_novel/status.log", 'writing run info', 'isCuffdiffNovelRunComplete', 'Cuffdiff on novel ncRNAs'); $check_these_log_files .= "$cuffdiff_known/cuffdiff_on_known.run_$conf_run_id.log\n" . "$cuffdiff_novel/cuffdiff_on_novel.run_$conf_run_id.log\n"; } $lncRNApipe_opts = "\n\n# We have already identified known and novel ncRNAs. Will resume from cuffmerge and cuffdiff ...\n" if (is_job_done("$params->{'outputDir'}/lncRNApipe.postAssembly.status.log", $io)); $cuffmerge_cmd = $cuffdiff_cmd = "\n\n# Cuffmerge and Cuffdiff runs on known and novel ncRNAs have passed.\n" if (is_job_done("$cuffdiff_known/status.log", $io) && is_job_done("$cuffdiff_novel/status.log", $io)); write_and_submit_job("lncRNApipe_postAssembly.run_$conf_run_id.sh", $job_deps_str_lncRNApipe_post_assembly, "job_IDs_lncRNApipe_postAssembly.$conf_run_id", $scheduler_job_depends, $job_scripts_dir, $params, $io, [$lncRNApipe_opts, $cuffmerge_cmd, $cuffdiff_cmd], 'jobID_is_rudimentary'); # We need to explicitely get job ID and print it so that we can kill it later. 
my $cuffdiff4all_trs = ''; if (defined $params->{'runCuffdiffForAllTranscripts'} && $params->{'runCuffdiffForAllTranscripts'} =~ m/^yes$/i) { my $cuffdiff_on_tr_dir = "$params->{'outputDir'}/cuffdiff_on_transcripts"; mkdirs($io, [$cuffdiff_on_tr_dir, "$cuffdiff_on_tr_dir/cuffmerge"]); my $sample_names = is_option_exists_in_anon($params->{'categorize_ncRNAs'}, 'sample'); ($sample_names) = ($sample_names =~ m/.+\s+(.+)/); my $cuffdiff_opts = my $cuffmerge_opts = ''; $cuffdiff_opts = join(' ', @{$params->{'cuffdiff'}}) if (scalar @{$params->{'cuffdiff'}} >= 1); $cuffmerge_opts = join(' ', @{$params->{'cuffmerge'}}) if (scalar @{$params->{'cuffmerge'}} >= 1); $cuffdiff4all_trs .= "\n\n# No scheduler involved. Wait on PIDs.\n$job_deps_str_lncRNApipe_post_assembly" if ($params->{'scheduler'} =~ m/^none$/i && $job_deps_str_lncRNApipe_post_assembly); $cuffdiff4all_trs .= qq { # # Merge all transcripts from cufflinks. # # find $cufflinks_o_dir -type f -name 'transcripts.gtf' > $cuffdiff_on_tr_dir/cufflinks_tr_list.txt cuffmerge $cuffmerge_opts -p $params->{'CPUs'} -o $cuffdiff_on_tr_dir/cuffmerge -g $annot_gtf -s $params->{'genomeFasta'} $cuffdiff_on_tr_dir/cufflinks_tr_list.txt > $cuffdiff_on_tr_dir/cuffmerge_on_cufflinks_transcripts.run_$conf_run_id.log 2>&1 # # Find differentially expressed transcripts. # # cuffdiff -o $cuffdiff_on_tr_dir -L $sample_names $cuffdiff_opts $cuffdiff_on_tr_dir/cuffmerge/merged.gtf $cuffdiff_bam_files> $cuffdiff_on_tr_dir/cuffdiff_on_all_transcripts.run_$conf_run_id.log 2>&1}; $cuffdiff4all_trs .= add_exit_check("$cuffdiff_on_tr_dir/cuffdiff_on_all_transcripts.run_$conf_run_id.log", "$cuffdiff_on_tr_dir/status.log", 'writing run info', 'isCuffdiffAllTrsRunComplete', 'Cuffdiff'); $check_these_log_files .= "$cuffdiff_on_tr_dir/cuffmerge_on_cufflinks_transcripts.run_$conf_run_id.log\n" . "$cuffdiff_on_tr_dir/cuffdiff_on_all_transcripts.run_$conf_run_id.log\n"; $cuffdiff4all_trs = "\n\n# Cuffdiff run on all assembled Cufflinks' transcripts has passed. Skipping...\n" if (is_job_done("$cuffdiff_on_tr_dir/status.log", $io, 0)); $io->c_time("cuffdiff run on all cufflinks' transcripts requested..."); $io->c_time("Generating separate job script using the same job dependency chain..."); write_and_submit_job("lncRNApipe_postAssembly_diffExp.run_$conf_run_id.sh", $job_deps_str_lncRNApipe_post_assembly, "job_IDs_lncRNApipe_postAssembly.$conf_run_id", $scheduler_job_depends, $job_scripts_dir, $params, $io, [$cuffdiff4all_trs], 'jobID_is_rudimentary'); # We need to explicitely get job ID and print it so that we can kill it later. } $io->c_time('All job scripts have been submitted. Please check these log files for any results / errors.' . "\n\n$check_these_log_files"); exit 0; } # Make directories and exit out if error. sub mkdirs { my $io = shift; my $dirs = shift; foreach my $dir (@$dirs) { if (!-d $dir) { my $mk_dir_output = $io->execute_get_sys_cmd_output("mkdir $dir"); $io->error('Cannot create output directories. Exited with the following error(s):' . "\n$mk_dir_output") if ($mk_dir_output !~ m/could not capture/i); } } return; } # Check status file and resume pipeline. 
sub is_job_done { my $file = shift; my $io = shift; my $get_run_id = shift; my $autogen_sample_name = shift; if ($autogen_sample_name) { $autogen_sample_name .= '.+?passed'; } else { $autogen_sample_name = 'passed'; } chomp(my $line_count = $io->execute_get_sys_cmd_output("wc -l $file 2>&1 | awk '{print \$1}'")); chomp(my $passed = $io->execute_get_sys_cmd_output("grep -oP '$autogen_sample_name' $file 2>&1 | head -n 1")); chomp(my $run_id = $io->execute_get_sys_cmd_output("grep -oiP '" . 'Run=\w+' . "' $file 2>&1 | head -n 1")); if ($run_id =~ m/Run=/i && $get_run_id) { $run_id =~ s/Run=//i; $conf_run_id = $run_id; return 1; } elsif ($run_id =~ m/Run=/i && $passed =~ m/passed/i) { $run_id =~ s/Run=//i; $conf_run_id = $run_id; } #elsif ($passed =~ m/passed/i) { #$io->error("Unable get previous Run ID. Only way to go from here is to restart the run from the beginning.\nTry using [overwriteOutputDir: YES]."); #} return $line_count if ($passed =~ m/passed/i && $line_count =~ m/^\d+/ && $line_count > 0); return 0; } # Check and return anon array options' sub is_option_exists_in_anon { my $arr = shift; my $regex = shift; my $negate = shift; foreach my $val (@$arr) { if ($val =~ m/$regex/ && $negate && $negate eq 'EXTRACT') { my $sample_name_exists = $1; return $sample_name_exists; } elsif ($val =~ m/$regex/ && !$negate) { my $sample_name_exists = $val; return $sample_name_exists; } elsif ($val !~ m/$regex/ && $negate && $negate ne 'EXTRACT') { my $sample_name_exists = $val; return $sample_name_exists; } } return 0; } # Kill jobs and exit, if user has requested it. sub kill_jobs_and_exit { my $job_id_files = shift; my $io = shift; my $params = shift; foreach my $file (@$job_id_files) { my $job_ids_fh = $io->open_file('<', $file); my $job_ids = do {local $/; <$job_ids_fh>}; close $job_ids_fh; my $job_deps_str = join(" ", (split(/\n|\r/, $job_ids))); if ($params->{'scheduler'} =~ m/^bash$/i || $params->{'killCmd'} =~ m/kill/i) { $job_deps_str = ''; $io->c_time("Getting PIDs..."); foreach my $job_id ( (split(/\n|\r/, $job_ids)) ) { chomp($job_id); #print "ps aux | grep '$job_id' | grep -v grep | awk '{print \$2}'\n"; my $pid = $io->execute_get_sys_cmd_output("ps aux | grep '$job_id' | grep -v grep | awk '{print \$2}'"); $pid =~ s/(\n|\r)+/ /g; $job_deps_str .= "$pid " if ($pid !~ m/could not capture/i); } } if (!$job_deps_str) { $io->warning('Did not find any running jobs mentioned in job file [ ' . $io->file_basename($file, 'suffix') . ' ] !', 'EXITING!'); } else { $io->c_time("[scheduler: $params->{'scheduler'}]. Attempting to kill by job ID(s) as stored in [ " . $io->file_basename($file, 'suffix') . ' ]...'); $io->execute_system_command("$params->{'killCmd'} $job_deps_str", "Command call:\n" . "-------------\n" . "$params->{'killCmd'} $job_deps_str"); } $io->warning("Kill command: [ $params->{'killCmd'} ]. Scheduler: [ $params->{'scheduler'} ].", 'EXITING!!'); exit 0; } } # Return job dependency chain based on scheduler sub get_job_dependency_chain { my $file = shift; my $io = shift; my $params = shift; my $wait_for_jobs = shift; my $check_file_presence = shift; my $only_these_jobs = ''; $io->error('Do not know how many jobs to wait for') if (!$wait_for_jobs); if ($wait_for_jobs =~ m/\d+\:\d+/) { ($wait_for_jobs, $only_these_jobs) = split/\:/, $wait_for_jobs } elsif ($wait_for_jobs =~ m/^\d+$/) { $only_these_jobs = $wait_for_jobs - 1; } else { $io->error('$WAIT_FOR_JOBS is not numeric!!'); } my $job_ids_fh = my $job_ids = my $job_deps_str = ''; if ($check_file_presence) { # Just read one time. 
$job_ids_fh = $io->open_file('<', $file); $job_ids = do {local $/; <$job_ids_fh>}; my $num_jobs = scalar (split(/\n|\r/, $job_ids)); close $job_ids_fh; $job_deps_str = join(", ", (split(/\n|\r/, $job_ids))); if (!defined $donot_wait4jobIDs && $params->{'scheduler'} !~ m/^none$/i && ask_user("\nDear $ENV{'USER'}:\n=====================\nSince the assembly job(s) waits in queue until all alignments are finished,\n" . "and since we cannot get job ID(s) programmatically, we tried to get job ID(s)\n" . "from STDOUT / STDERR of previous batch submission [ $params->{'batchSubCmd'} ] for cufflinks." . "\n\nThese are the job ID(s) we got that must be finished before " . "[ lncRNApipe_postAssembly.run_$conf_run_id.sh ] starts running.\n\n" . '*' x 70 . "\n\n$job_deps_str\n\n" . '*' x 70 . "\n\nAre the above job ID(s) we got from STDERR / STDOUT (sandwiched -\nbetween " . "#################################### lines) correct?\n\n" . "Type 'y' and press ENTER to submit the job script", 900)) { $io->c_time("YAY! We got it right!!"); } elsif(!defined $donot_wait4jobIDs && $params->{'scheduler'} !~ m/^none$/i) { $io->warning('Sorry, we got it wrong. We can still move forward depending on your correct input.' . "\nOK. Here's what you can do. You know the job ID(s) from the STDOUT / STDERR output above.\n" . "\nThey should look somewhat like:\n\n\t" . "Your job 3214168 (\"lncRNApipe_assembly_stage.sh\") has been submitted ... \n\n\t\tOR\n\n" . "\tJob <3214168> is submitted to queue ... \n\n\t\tOR\n\n" . "\t3214168.pbs.tamu.edu ... \n\n", 'OOPS!'); print STDOUT "Dear $ENV{'USER'}:\n=====================\nOpen a new command line session and replace the word JOBID " . "in the following command\nwith the actual job ID and repeat the command for each job ID you see above.\n\n" . "echo \"JOBID\" >> $file\n\n"; $io->c_time("Waiting until $ENV{'USER'} enters correct number of job ID(s) in [ $file ]..."); $io->execute_system_command("rm -rf $file"); $io->execute_system_command("touch $file"); } } while (1) { $job_ids_fh = $io->open_file('<', $file); $job_ids = do {local $/; <$job_ids_fh>}; my $num_jobs = scalar (split(/\n|\r/, $job_ids)); close $job_ids_fh; if ($wait_for_jobs < $num_jobs) { $io->warning("Number of job IDs [ $num_jobs ] is more than number of job IDs expected [ $wait_for_jobs ]." . "\nWe are not sure which Job IDs to use to create dependency chain.\n\nJob IDs file: $file", 'ERROR!'); kill_jobs_and_exit([$file], $io, $params); } elsif ($wait_for_jobs != $num_jobs) { $io->c_time('Waiting for ' . ($wait_for_jobs-$num_jobs) . ' more job ID(s) ' . 'in [ ' . $io->file_basename($file, 'suffix') . ' ]...'); sleep 10; next; } else { last; } } if ($params->{'scheduler'} =~ m/^none$/i) { $job_deps_str = join("|", (split(/\n|\r/, $job_ids))); if ($file =~ m/job_IDs_tophat/i) { $job_deps_str = "while [ true ]; do if [ ! -z \"\$(ps aux | grep -P 'tophat.+?($job_deps_str)' | grep -v grep)\" ]; then " . "sleep 10; else break; fi; done;"; } elsif ($file =~ m/job_IDs_cufflinks/i) { $job_deps_str = "while [ true ]; do if [ -e \"$check_file_presence\" ] && [ -s \"$check_file_presence\" ] && [ ! -z \"\$(ps aux | grep -P 'cufflinks.+?($job_deps_str)' | grep -v grep)\" ]; then " . "break; else sleep 10; fi; done;"; } } elsif ($params->{'scheduler'} =~ m/^sge$/i) { $job_deps_str = '-hold_jid ' . join(",", (reverse (split(/\n|\r|\s+/, $job_ids)))[0..$only_these_jobs]); } elsif ($params->{'scheduler'} =~ m/^pbs$/i) { $job_deps_str = '-W depend=afterok:' . 
join(":", (reverse (split(/\n|\r|\s+/, $job_ids)))[0..$only_these_jobs]); } elsif ($params->{'scheduler'} =~ m/^slurm$/i) { $job_deps_str = '--dependency=afterok:' . join(":", (reverse (split(/\n|\r|\s+/, $job_ids)))[0..$only_these_jobs]); } elsif ($params->{'scheduler'} =~ m/^lsf$/i) { $job_deps_str = '-w "done(' . join(") && done(", (reverse (split(/\n|\r|\s+/, $job_ids)))[0..$only_these_jobs]); $job_deps_str .= ')" <'; } return $job_deps_str; } # Write exit code for job script sub add_exit_check { my $log_file = shift; my $status_log_file = shift; my $pattern = shift; my $var_name = shift; my $cmd_in_msg = shift; my $expr = shift; my $write_mode = shift; my $sample_name = shift; my $auto_generated_on = localtime; $auto_generated_on =~ s/\s+/\_/g; $expr = '-z' if (!$expr); $write_mode = '>' if (!$write_mode); if (!$sample_name) { $sample_name = $cmd_in_msg; } else { $sample_name = "AutoGenSampleName=$sample_name " . $cmd_in_msg; } my $here = qq { # # Exit check. Checking log file of the completed run. # # $var_name=`grep -ioP '$pattern' $log_file` if [ $expr "\$$var_name" ]; then echo "lncRNApipe: AutoGenJobTime=$auto_generated_on Run=$conf_run_id $sample_name run has failed." $write_mode $status_log_file exit -1 else echo "lncRNApipe: AutoGenJobTime=$auto_generated_on Run=$conf_run_id $sample_name run has passed." $write_mode $status_log_file fi}; return $here; } # Write job script with commands sub write_and_submit_job { my $job_script_name = shift; my $deps_str = shift; my $job_ids_file = shift; my $jobid_env = shift; my $job_scripts_dir = shift; my $params = shift; my $io = shift; my $cmds = shift; my $jobID_rudimentary = shift; my $cluster_opts = '# No cluster options specified.'; my $scheduler_opts_str = '# No scheduler options specified.'; my $sandbox_bin_path = '# Using binaries [ bowtie, tophat, cufflinks ] found on the system.'; my $yaml_was = $io->file_basename($pipe_config, 'suffix'); $job_script_name = "$job_scripts_dir/$job_script_name"; $jobID_rudimentary = 0 if ($params->{'batchSubCmd'} =~ m/^bash$/); my $auto_generated_on = localtime; $scheduler_opts_str = join("\n#", @{$params->{'schedulerOpts'}}) if (defined $params->{'schedulerOpts'}); my $job_script_fh = $io->open_file('>', $job_script_name); unshift(@$cmds, "\n\n# This run's job ID or PID.\necho " . $jobid_env . " >> $job_scripts_dir/$job_ids_file") if ($jobid_env && $jobid_env !~ m/none/i && $job_ids_file !~ m/NO_DEPENDENCY_CHAIN/i && !$jobID_rudimentary); $cluster_opts = join("\n", @{$params->{'clusterOpts'}}) if (defined $params->{'clusterOpts'}); $sandbox_bin_path = '# Attempting to go for zero failures regarding: "-bash: : command not found".' . "\n# Just appending path to our sandboxed binaries.\n#" . "\nexport PATH=\$PATH:$deps->{'bowtie'}:$deps->{'tophat'}:$deps->{'cufflinks'}" if ($deps->{'bowtie'} =~ m/lncRNApipe\.depbin/ || $deps->{'tophat'} =~ m/lncRNApipe\.depbin/ || $deps->{'cufflinks'} =~ m/lncRNApipe\.depbin/); my $job_script = qq {#!/bin/bash # # lncRNApipe v$VERSION # Job script generated on $auto_generated_on. # Scheduler is $params->{'scheduler'}. # # Job parameters as mentioned in configuration file [ $yaml_was ]: # #$scheduler_opts_str # # Other custom configurations specific to your cluster as mentioned in # configuration file [ $yaml_was ]: # $cluster_opts # $sandbox_bin_path@$cmds}; print $job_script_fh $job_script; close $job_script_fh; $io->c_time('Job script [ ' . $io->file_basename($job_script_name, 'suffix') . 
' ] written...'); my $grid_job_id = ''; if ($params->{'scheduler'} !~ m/^none$/i) { $io->c_time('Submitting job script [ '. $io->file_basename($job_script_name, 'suffix') . ' ]...'); if ($deps_str !~ m/NO_DEPENDENCY_CHAIN/i) { if ($jobID_rudimentary) { print "\n", '#' x 70, "\n"; my $grid_job_id = $io->execute_get_sys_cmd_output("$params->{'batchSubCmd'} $deps_str $job_script_name", #0, "Command call:\n" . "-------------\n" . "$params->{'batchSubCmd'} $deps_str $job_script_name"); print STDOUT $grid_job_id; print "\n", '#' x 70, "\n"; $grid_job_id = get_grid_jobID_rudimentary($grid_job_id); if ($grid_job_id !~ m/could not capture/i && $grid_job_id) { $io->execute_system_command("echo $grid_job_id >> $job_scripts_dir/$job_ids_file"); } else { $io->warning("Cannot get job ID from STDOUT / STDERR.\n" . 'Please report this error with output from the command to lncRNApipe@gmail.com so that we can fix it', 'ERROR!'); $io->c_time("Attempting to kill already submitted jobs..."); kill_jobs_and_exit(["$job_scripts_dir/$job_ids_file"], $io, $params); } } else { print "\n+", '-' x 68, "+\n"; $io->execute_system_command("$params->{'batchSubCmd'} $deps_str $job_script_name", #0, "Command call:\n" . "-------------\n" . "$params->{'batchSubCmd'} $deps_str $job_script_name"); print "\n+", '-' x 68, "+\n"; } } else { # bsub special case $job_script_name = "< $job_script_name" if ($params->{'batchSubCmd'} =~ m/^bsub$/i); if ($jobID_rudimentary) { print "\n+", '-' x 68, "+\n"; my $grid_job_id = $io->execute_get_sys_cmd_output("$params->{'batchSubCmd'} $job_script_name", #0, "Command call:\n" . "-------------\n" . "$params->{'batchSubCmd'} $job_script_name"); print STDOUT $grid_job_id; print "\n+", '-' x 68, "+\n"; $grid_job_id = get_grid_jobID_rudimentary($grid_job_id); if ($grid_job_id !~ m/could not capture/i && $grid_job_id) { $io->execute_system_command("echo $grid_job_id >> $job_scripts_dir/$job_ids_file"); } else { $io->warning("Cannot get job ID from STDOUT / STDERR.\n" . 'Please report this error with the output from the command to lncRNApipe@gmail.com so that we can fix it', 'ERROR!'); $io->c_time("Attempting to kill already submitted jobs..."); kill_jobs_and_exit(["$job_scripts_dir/$job_ids_file"], $io, $params); } } else { print "\n+", '-' x 68, "+\n"; $io->execute_system_command("$params->{'batchSubCmd'} $job_script_name", #0, "Command call:\n" . "-------------\n" . "$params->{'batchSubCmd'} $job_script_name"); print "\n+", '-' x 68, "+\n"; } } } else { $io->c_time('No scheduler specified. Running it in background with bash...'); print "\n@ ", '-' x 68, " @\n"; $io->execute_system_command("bash $job_script_name &", #0, "Command call:\n" . "-------------\n" . "bash $job_script_name &"); print "\n@ ", '-' x 68, " @\n"; } return; } # Return grid job id. This is rudimentary since more work is needed # to create job dependency chains. The Schedule::DRMAAc # installation needs little configuration, which # defeats the purpose of this whole pipeline, which is to be as simple # as possible to install and run. # # IN MOST CASES THE JOB ID IS NUMERIC. IF NOT, WE DEPEND ON THE USER # INPUT. # # This does the heavy lifting but we still ask user to verify job ids # parsed. 
sub get_grid_jobID_rudimentary { my $job_id_string = shift; if ($job_id_string =~ m/\s+/) { (undef, undef, my $job_id, undef) = ($job_id_string =~ m/(submitted)?.*?job\s+(id)?\W*(\d+)\W*.*(submitted)?/i); #(my $job_id) = ($job_id_string =~ m/submitted{0,1}job.+?\W*(\d+)\W*.*?submitted{0,1}/i); return $job_id; } else { return $job_id_string if ($job_id_string); } return 0; } # Calculate sample size and return largest. sub sample_size { my $arr1 = shift; my $arr2 = shift; my $samples = shift; return scalar(@$arr1) + scalar(@$arr2) if ($samples && $samples =~ m/both/i); if ($#$arr1 > $#$arr2) { return $#$arr1; } else { return $#$arr2; } } # Create sample nums for sample names sub sample_nums { my $sample_str = shift; my $io = shift; my $sample_names2num = {}; foreach my $cat_option (@$sample_str) { if ($cat_option =~ m/sample.+?\s+(.+)/i) { my $samples = $1; $cat_option = $io->strip_leading_and_trailing_spaces($cat_option); $io->error('Sample names in [categorize_ncRNAs: ] options should be enclosed within double quotes.' . "\nEx: \"sample_1,sample_2,sample_3,.,.,sample_n\"") if ($samples !~ /\".+?\"/); $samples =~ s/\"//g; my @sample_names = split/\,/, $samples; for (my $i=0; $i<=$#sample_names; $i++) { $sample_names2num->{$i} = $sample_names[$i]; } last; } } return $sample_names2num; } # Handle and separate read lib files sub handle_read_libs { my $samples = shift; my $reads_dir = shift; my $io = shift; my $params = shift; my $full_path2readfiles = []; my @replicates = split/\,/, $samples; foreach my $replicate (@replicates) { $replicate = $io->strip_leading_and_trailing_spaces($replicate); $reads_dir = $io->validate_create_path($reads_dir, 'do not create', 'Reads directory'); $replicate = $reads_dir . $replicate; if (!$replicate || !-e $replicate || !-s $replicate || length($replicate) == 0) { $io->error("[libType: $params->{'libType'}]\n\n" . "Read file [ file: $replicate ] does not exist or is empty.\n"); #"Use [libType: MIXED] if you want to mix PE and SE libraries or if you have limited replicates."); } push @{$full_path2readfiles}, $replicate; } return $full_path2readfiles; } # Throw error on incorrect config params sub verify_config_file { my $io = shift; my $msg = shift; my $check_this = shift; my $check_this_name = shift; my $only_these_vals = shift; my $regex = my $regex_msg = 'yes|no'; $regex = $regex_msg = $only_these_vals if ($only_these_vals); $regex_msg = uc($regex_msg); $regex_msg =~ s/\|/ or /g; $io->error("No value(s) provided for [$check_this_name: ]. Please use only $regex_msg." . $msg) if (!$check_this); $io->error("[$check_this_name: $check_this] is not a valid value. Please use only $regex_msg." . $msg) if ($check_this !~ m/^$regex$/i); return; } __END__ =head1 NAME lncRNApipe =head1 SYNOPSIS lncRNApipe is a reference annotation based automated pipeline to identify non-coding RNAs, both known and novel, from RNA-Seq reads. It takes FASTQ / FASTA reads as input, trims reads, assembles transcripts, identifies any possible known and novel ncRNAs, categorizes them into 5 categories: Long intergenic lncRNAs (B>), Intronic contained lncRNAs (B>), Partially overlapping lncRNAs (B>), Completely overlapping lncRNAs (B>) and Exonic overlaps (B>), and annotates the novel ncRNAs, where possible. 

See complete description:

    perldoc lncRNApipe

Examples:

View this documentation:

    perl lncRNApipe --h

Run the complete pipeline from the configuration file C<< params.yaml >>:

    perl lncRNApipe --conf params.yaml

Run the complete pipeline on your local cluster's head node using the configuration file
C<< params.yaml >> and do not wait for user confirmation of job IDs:

    perl lncRNApipe --conf params.yaml --dont-wait

Run the B<identification stage> of the pipeline through command-line options without mentioning
the configuration file C<< params.yaml >>:

    perl lncRNApipe --run /data/lncRNApipe/ \
         --cuffcmp '-r annotation.gtf -s genome.fa transcripts1.gtf transcripts2.gtf' \
         --cat-ncRNAs '-sample-names "M1,M2" -ov 80 -fpkm 2 -len 200 -max-len 10000 -min-exons 1 -antisense' \
         --get-uq-feat '-sf known_lncRNAs.bed' \
         --fetch-seq '-db mm10' \
         --cpc \
         --rna \
         --inf &> lncRNApipe.run.log

=head1 CITATION

Karen Triff; Mathew W McLean; Kranti Konganti; Jiahui Pang; Evelyn Callaway; Beiyan Zhou;
Ivan Ivanov; Robert S. Chapkin

=head1 INSTALLATION

lncRNApipe was developed in a Linux environment and can be installed on UNIX based machines.
If it is a 64-bit machine, all required prerequisites are sandboxed with the software and taken
care of, so the setup should work in most instances. lncRNApipe will attempt to find any
available prerequisites on your machine and will default to them before using the sandboxed
binaries. All you need to do is run the following commands to install the pipeline and all of
its required dependencies:

    cd /change/into/my/preferred/path/of/installation
    curl -Ok https://raw.githubusercontent.com/biocoder/Perl-for-Bioinformatics/master/NGS-Utils/lncRNApipe
    perl lncRNApipe --setup

If you are on a 32-bit machine, run the same command as shown above, and lncRNApipe will let
you know which tools must be installed and be available in your C<< $PATH >>. In any case, the
setup process will prompt you to specify whether or not you want to install the required
prerequisites.

If the setup process fails due to compiler issues, or if any Perl module installation fails,
try setting a different compiler suited to your machine with the following command.

Example (using C<< g++ >> as the compiler):

    perl lncRNApipe --setup --setup-compiler CC=g++

=head1 TEST SOFTWARE INSTALLATION

Once you have finished installing, you can do a test run with the included data set to see if
the software was installed and is working properly. The test data and annotation were subsetted
to 25,000 reads. The known ncRNA information in GTF format was downloaded using the Table
Browser from the UCSC database. The reference genome and indices are limited to chromosome 12.

To prepare the C<< params.yaml >> specific to your UNIX path, change into the directory where
you installed B<lncRNApipe> and do:

    a=`pwd`"/"; sed "s@\/data.*\/lncRNApipe-test@$a\/lncRNApipe-test@g" params.yaml > lncRNApipe-test/params.test.yaml

The above command will replace all the UNIX paths in the file C<< params.yaml >> with the UNIX
paths specific to your machine. Then, edit the file C<< params.test.yaml >> with appropriate
values for C<< CPUs: >>, C<< scheduler: >>, C<< batchSubCmd: >>, C<< schedulerOpts: >> and / or
C<< clusterOpts: >>.

Finally, run the pipeline on the test data. If you are running the job on a cluster, do:

    perl lncRNApipe --conf lncRNApipe-test/params.test.yaml --dont-wait

If you are running the pipeline on your local workstation, do:

    perl lncRNApipe --conf lncRNApipe-test/params.test.yaml

After the pipeline finishes, you should see C<< passed >> in all status code files.
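
For instance, one quick way to spot a stage that has not passed (assuming GNU C<< grep >> and
the test run directory layout shown below) is to list any status log that does not contain the
word C<< passed >>:

    grep -riL "passed" --include="*status*.log" lncRNApipe-test/run*/

Any file printed by the command above is a status log without a C<< passed >> entry.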

C<< lncRNApipe-test/run/trimmomatic/status.log >>

    lncRNApipe: AutoGenJobTime=Tue_Jan_26_10:20:14_2016 Run=1uzk7yVoHMcph AutoGenSampleName=run_1uzk7yVoHMcph.6h_rep_1 Trimmomatic run has passed.
    lncRNApipe: AutoGenJobTime=Tue_Jan_26_10:20:13_2016 Run=1uzk7yVoHMcph AutoGenSampleName=run_1uzk7yVoHMcph.2cells_rep_1 Trimmomatic run has passed.

C<< lncRNApipe-test/run/tophat/status.log >>

    lncRNApipe: AutoGenJobTime=Tue_Jan_26_10:20:14_2016 Run=1uzk7yVoHMcph AutoGenSampleName=run_1uzk7yVoHMcph.6h_rep_1 Tophat run has passed.
    lncRNApipe: AutoGenJobTime=Tue_Jan_26_10:20:13_2016 Run=1uzk7yVoHMcph AutoGenSampleName=run_1uzk7yVoHMcph.2cells_rep_1 Tophat run has passed.

C<< lncRNApipe-test/run/cufflinks/status.log >>

    lncRNApipe: AutoGenJobTime=Tue_Jan_26_10:20:14_2016 Run=1uzk7yVoHMcph AutoGenSampleName=run_1uzk7yVoHMcph.6h Cufflinks run has passed.
    lncRNApipe: AutoGenJobTime=Tue_Jan_26_10:20:13_2016 Run=1uzk7yVoHMcph AutoGenSampleName=run_1uzk7yVoHMcph.2cells Cufflinks run has passed.

C<< lncRNApipe-test/run/cufflinks/lncRNApipe_afterCufflinks.status.log >>

    lncRNApipe: AutoGenJobTime=Tue_Jan_26_10:20:14_2016 Run=1uzk7yVoHMcph Cufflinks assembly run has passed.

C<< lncRNApipe-test/run/cuffdiff_on_transcripts/status.log >>

    lncRNApipe: AutoGenJobTime=Tue_Jan_26_10:20:14_2016 Run=1uzk7yVoHMcph Cuffdiff run has passed.

C<< lncRNApipe-test/run4/lncRNApipe.postAssembly.status.log >>

    lncRNApipe: AutoGenJobTime=Tue_Jan_26_10:20:14_2016 Run=1uzk7yVoHMcph Post assembly pipeline run has passed.

=head1 DESCRIPTION

lncRNApipe is a complete automated pipeline to:

=over 4

=item * Trim raw reads using Trimmomatic.

=item * Map reads using the spliced read mapper, Tophat.

=item * Assemble transcripts using Cufflinks.

=item * Identify known non-coding RNAs and any putative novel non-coding RNAs based on reference annotation.

=item * Use CPC (Coding Potential Calculator) to estimate the coding potential of identified novel ncRNAs.

=item * Annotate novel non-coding RNAs using C<< cmscan >> from Infernal.

=item * Use B<RNAfold> to calculate the minimum free energy structure of the identified novel non-coding RNAs.

=item * Run Cuffdiff to identify differentially expressed known ncRNAs and novel ncRNAs.

=item * Optionally, run cuffdiff on all transcripts to identify differentially expressed transcripts as you would normally do with the Tuxedo pipeline.

=back

The pipeline is very flexible and can be run in two ways, from the B<assembly stage> or from
the B<identification stage>, and it maintains checkpoints at each stage so that you can resume
a failed run, if possible.

=head2 ASSEMBLY STAGE

The simplest way to run the complete pipeline is from the B<assembly stage>. This is where you
supply the raw FASTQ / FASTA reads you got from your sequencing center, configure the
parameters in a file called C<< params.yaml >> and run the jobs either on a compute cluster or
on your local workstation. The assembly stage can be further broken down into the following
steps:

=head3 Filtering

lncRNApipe uses Trimmomatic to trim reads. Any options you generally specify to Trimmomatic can
be entered in the C<< params.yaml >> file.

=head3 Mapping

Next, we run Tophat to perform spliced mapping of the RNA-Seq reads. You can specify Tophat
options using the C<< tophat: >> parameter in C<< params.yaml >>. Replicate data (if any) is
merged before running Cufflinks.
=head3 Transcript Assembly

Cufflinks is used to perform transcript assembly, and you can specify its options using the
C<< cufflinks: >> parameter in C<< params.yaml >>.

=head2 IDENTIFICATION STAGE

In the identification stage, the pipeline will bind the functionality of the following tools /
scripts: C<< cuffcompare >>, C<< categorize_ncRNAs.pl >>, C<< get_unique_features.pl >>,
C<< fetch_seq_from_ucsc.pl >>, C<< RNAfold >>, C<< Infernal >> and the Coding Potential
Calculator (C<< CPC >>).

Transcriptome construction tools such as Cufflinks produce a set of assembled transcripts in
GTF format. lncRNApipe uses this data in addition to known gene annotation to extract putative
lncRNAs constructed by the ab initio assemblers. The pipeline relies on the proper
transcript-exon features generated by the assembler and validates them against the known
reference gene and non-coding RNA information to identify putative novel lncRNAs. In brief, the
pipeline steps in the B<identification stage> are as follows:

=over 5

=item 1. Cuffcompare (lncRNApipe Option: --cuff or --cuffcompare)

The transcript assemblies can be compared to a known annotation of choice to classify the
transcripts into different classes using Cuffcompare. The assembled transcripts should be in
GTF format. Cufflinks generates its output files in GTF format. If you are using other software
such as Scripture, you can convert the output file in BED format into GTF using Scripture as:

    java -jar /path/to/scripture.jar -task toGFF -cufflinks -in scripture.out.bed -source SCRIPTURE -out scripture.out.gtf

To get help documentation about this module, run:

    perl lncRNApipe --h cuff

=item 2. categorize_ncRNAs.pl (lncRNApipe Option: --cat or --cat-ncRNAs)

lncRNApipe uses the tracking file (C<< *.tracking >>) produced by Cuffcompare, annotation data
in gene prediction format and a list of supplied transcripts in GTF format to produce and
categorize lncRNAs into different categories. In brief, the lncRNAs are classified into 5
categories: Long intergenic lncRNAs, Intronic contained lncRNAs, Partially overlapping lncRNAs,
Completely overlapping lncRNAs and Exonic overlaps. To get help documentation about this
module, run:

    perl lncRNApipe --h cat

=item 3. get_unique_features.pl (lncRNApipe Option: --get or --get-uq-feat)

It then compares the putative list with the supplied known lncRNAs in BED or GTF format to get
features that do not overlap any known lncRNAs. To extract known lncRNAs from your assembled
transcripts for downstream analysis, include C<< --known --extract >> with the C<< --get >>
option of lncRNApipe. To get help documentation about this module, run:

    perl lncRNApipe --h get

=item 4. fetch_seq_from_ucsc.pl (lncRNApipe Option: --fetch or --fetch-seq)

This list is then used to fetch the DNA sequences of those transcripts to determine their
coding potential using CPC. To get help documentation about this module, run:

    perl lncRNApipe --h fetch

=item 5. CPC.sh (lncRNApipe Option: --cpc)

In this step, the fetched FASTA sequences are used to determine the coding potential of each
transcript, and those that are flagged as B<noncoding> are used to create the final list of
high confidence lncRNAs. This step may take a while to finish. To get help documentation about
this module, run:

    perl lncRNApipe --h cpc

=item 6. RNAfold (lncRNApipe Option: --rna or --rnafold)

Here, the lncRNApipe pipeline invokes the C<< RNAfold >> program to calculate the minimum free
energy structure of the predicted non-coding RNAs. This step may take a while to finish. To get
help documentation about this module, run:

    perl lncRNApipe --h rna

=item 7.
Infernal (lncRNApipe Option: --inf or --infernal)

In the final step, the lncRNApipe pipeline invokes C<< cmscan >> from the B<Infernal> package
to search the Rfam databases for structure and sequence similarities to annotate the putative
lncRNAs. To get help documentation about this module, run:

    perl lncRNApipe --h inf

=back

=head1 RUNNING lncRNApipe

lncRNApipe can be run either completely via the configuration file, a.k.a C<< params.yaml >>,
or the B<identification stage>, i.e. the pipeline from C<< cuffcompare >> onwards, can be run
independently without specifying the configuration file (C<< params.yaml >>), where you specify
everything needed through the command line options. We recommend that you use the configuration
file for the sake of clarity and simplicity.

The file C<< params.yaml >> contains all options required to run the pipeline from start to
finish. Once you have entered the options as per your requirements, run the pipeline using the
following command:

    perl lncRNApipe --conf params.yaml

=head1 params.yaml

C<< params.yaml >> is the configuration file, where you can specify the command line options
for each of the modules of lncRNApipe to run the complete pipeline from start to finish. If you
do not wish to run any of the modules, simply disable the module and all the lines following it
(if any) by prefixing them with C<< # >>. The comment lines in C<< params.yaml >> generally
start with C<< # >>.

The options in the file C<< params.yaml >> follow a specific syntax. The parameters where you
specify C<< YES >> or C<< NO >> generally end with a colon (C<< : >>). Example:

    performTranscriptAssembly: YES

The parameters where you specify a tool's options consist of the tool's name followed by the
tool's options, each on its own line with a dash prefix (C<< - >>). Example:

    tophat:
        - --b2-sensitive

=head2 params.yaml's Options:

=over 4

=item outputDir: /data/konganti/FASTQ_RNAseq/new-lncRNApipe2

Specify the output directory. In the example above, a new output directory called
C<< new-lncRNApipe2 >> will be created at the mentioned path C<< /data/konganti/FASTQ_RNAseq >>.

=item overwriteOutputDir: NO

Indicate if we should overwrite the output directory if it already exists.

=item performTranscriptAssembly: YES

Let lncRNApipe know if you intend to perform transcript assembly using tophat / cufflinks or
just identify ncRNAs from already assembled transcripts.

=item CPUs: 10

Specify the number of threads / CPUs to use where possible. If running on a grid, please also
use the job parameters specific to your cluster with C<< schedulerOpts: >> below.

=item scheduler: SGE

Specify the scheduler type. Valid options are C<< PBS >>, C<< SGE >>, C<< LSF >>, C<< SLURM >>
or C<< NONE >> to disable grid computing.

=item batchSubCmd: qsub

Mention the batch submission command. For example, for C<< LSF >>, it is C<< bsub >>, for
C<< PBS >> or C<< SGE >>, it is C<< qsub >>, and for C<< SLURM >>, it is C<< sbatch >>.

=item schedulerOpts:

Specify scheduler options, one per line, specific to your job running environment.

********************* B<NOTE> *****************************

Different clusters use different job parameter names. For example, our cluster uses
C<< num_threads >> as the parameter name to request the number of CPUs in SGE. Some clusters
may use C<< num_cpu >>. Since it is difficult to guess, please provide the number of CPUs you
want to use based on your grid environment below, which should be equal to the C<< CPUs: >>
option above. Yes, we know that this is kind of redundant, but it is a necessary evil. Whatever
job parameters you provide after the C<< - >>, they will appear exactly as given in the job
script.

Ex for PBS (job parameters start with a C<< PBS >> directive):

    PBS -V
    PBS -l nodes=1:ppn=16,walltime=24:00:00

Ex for C<< SGE >> (job parameters start with a C<< $ >> sign):

    $ -N lncRNApipe
    $ -l num_threads=4

So, if my scheduler is C<< SGE >>, the parameters on my cluster would look like:

    schedulerOpts:
        - $ -l mem_free=10G
        - $ -l num_threads=10

=item clusterOpts:

Mention any other lines that should be included in the job script. Ex:

    clusterOpts:
        - module load java1.7.0 perl5.14.2

=item readType: FASTQ

Specify if your reads are in C<< FASTA >> or C<< FASTQ >> format. In the example above, reads
are in C<< FASTQ >> format.

=item libType: SE

Specify if your reads are C<< SE >> (single-end only), C<< PE >> (paired-end only) or
C<< MIXED >>.

=item sourceDB: UCSC

Specify if you want to use C<< ENSEMBL >> or C<< UCSC >> as the source for annotation and do
not want to use C<< useThisAnnotation: >>. Make sure that you are using the same
C<< sourceDB >> for the tophat alignments, i.e. bowtie indexes created from the same
C<< sourceDB >> for the same assembly version. As downloading genome indices on-the-fly is time
consuming, and since many users may have genome indices installed for various NGS analyses
anyway, we leave it to you to provide the appropriate genome index below in the tophat and
cufflinks options' configuration. Valid values for C<< sourceDB: >> are C<< ENSEMBL >>,
C<< UCSC >> or C<< CUSTOM >>.

=item species: rn4

Choose the assembly version if you do not want to use C<< useThisAnnotation: >> so that we can
download up-to-date annotation on the fly. You can choose to supply your own annotation file
for consistency below. C<< ENSEMBL >> and C<< UCSC >> represent species names differently in
their URLs. To view species names for C<< ENSEMBL >>, do:

    perl lncRNApipe --list ENSEMBL

Go to https://genome.ucsc.edu/FAQ/FAQreleases.html#TOP to view UCSC version names. For example,
at C<< UCSC >>, for mouse, it is C<< mm9 >> or C<< mm10 >>, for rat, it is C<< rn5 >> or
C<< rn6 >> etc... In case of C<< CUSTOM >>, you can use a species name of your choice.
Ex: C<< species: cotton >>.

=item useThisAnnotation: /data/ref_annotation/rn4_UCSC/genes.gtf

If you want to use your own annotation file, provide the full path to the annotation file of
your choice. This option overrides the C<< species: >> option above. Again, make sure you are
providing a genome index created from the same source as the annotation file. In the example
below, C<< bowtieGenomeIndex >> is the C<< rn4 >> genome index created from C<< FASTA >> files
from C<< UCSC >>.

=item reads:

Provide the Unix path to the directory where the read files are located and also provide the
read file names present in that directory. If your data is just C<< SE >>, then disable
C<< r2: >> below by prefixing it with a C<< # >>. Separate replicates of a sample with a comma.
Separate different samples with a C<< | >>. In the example below, I have 8 samples and each
sample has about 5-6 replicates. The read library is C<< SE >>.

    reads:
        readsDir: /data/konganti/FASTQ_RNAseq/reads
        r1: cca-2111.fastq,cca-2108.fastq,cca-2107.fastq,cca-1103.fastq,cca-2106.fastq|fpa-2410.fastq,fpa-2409.fastq,fpa-2408.fastq,fpa-1405.fastq,fpa-1403.fastq,fpa-1402.fastq|cpa-2210.fastq,cpa-2209.fastq,cpa-2207.fastq,cpa-1205.fastq,cpa-1204.fastq,cpa-1202.fastq|fca-2311.fastq,fca-2306.fastq,fca-1305.fastq,fca-1303.fastq,fca-1302.fastq,fca-1301.fastq|ccs-2506.fastq,ccs-2504.fastq,ccs-2505.fastq,ccs-1503.fastq,ccs-1502.fastq,ccs-1501.fastq|cps-2605.fastq,cps-2606.fastq,cps-2604.fastq,cps-1603.fastq,cps-1601.fastq,cps-1602.fastq|fcs-2706.fastq,fcs-2705.fastq,fcs-1703.fastq,fcs-2704.fastq,fcs-1702.fastq,fcs-1701.fastq|fps-2806.fastq,fps-2805.fastq,fps-2804.fastq,fps-1803.fastq,fps-1802.fastq,fps-1801.fastq
        # r2:

=item trimmomatic:

If you want to use Trimmomatic to trim reads, then provide the TRIMMOMATIC options. TRIMMOMATIC
provides several Illumina adapter files, such as C<< TruSeq3-SE.fa >> (used in the example
below), C<< TruSeq3-PE.fa >>, C<< TruSeq2-SE.fa >>, C<< TruSeq2-PE.fa >> and
C<< NexteraPE-PE.fa >>. These adapter files are COPYRIGHTED by ILLUMINA and are public. No need
to provide C<< -threads >>, as it will be handled by lncRNApipe. Ex:

    trimmomatic:
        - ILLUMINACLIP:TruSeq3-SE.fa:2:30:10
        - LEADING:20
        - TRAILING:20
        - SLIDINGWINDOW:5:20
        - MINLEN:25

=item genomeFasta: /data/ref_fasta/rn4_UCSC/genome.fa

Provide the full path to the genome C<< FASTA >> file. In the example above, it is a
multi-C<< FASTA >> file for C<< rn4 >> downloaded from C<< UCSC >>.

=item bowtieGenomeIndex: /data/ref_indexes/bowtie2/rn4_UCSC/genome

Provide the bowtie genome index. Make sure that you are using the same genome indices for the
C<< sourceDB: >> mentioned above. Do not use C<< UCSC >> genome indices if you requested
C<< ENSEMBL >> above in C<< sourceDB: >>, or vice versa. In the example above, I am using
genome indices for C<< rn4 >> created from C<< /data/ref_fasta/rn4_UCSC/genome.fa >> with the
C<< bowtie2-build >> command.

=item tophat:

In the example below, it is assumed that you have already created a transcriptome index. If you
have not created one, go through the tophat manual on how to create one. If you use
C<< --transcriptome-index >> below and the index does not exist, the running jobs will keep
overwriting it. No need to provide C<< FASTQ >> files, as they will be handled by lncRNApipe.
No need to provide C<< -o >>, as it will be handled by lncRNApipe. No need to provide
C<< -p >>, as it will be handled by lncRNApipe. Ex:

    tophat:
        - --b2-sensitive
        - --transcriptome-index=/data/konganti/FASTQ_RNAseq/transcriptome_index/rn4
        - --no-coverage-search

=item cufflinks:

Provide cufflinks options and also make sure you provide the same reference FASTA as the
C<< sourceDB >> you want to use in case of bias correction. No need to provide C<< -g >>, as it
will be handled by lncRNApipe. No need to provide C<< -o >>, as it will be handled by
lncRNApipe. No need to provide C<< -p >>, as it will be handled by lncRNApipe. No need to
provide input files, as they will be handled by lncRNApipe. Ex:

    cufflinks:
        - -u
        - -b /data/ref_fasta/rn4_UCSC/genome.fa

=item cuffcompare:

Provide cuffcompare options.

********************* B<NOTE> *****************************

If you only want to run lncRNApipe without the transcript B<assembly stage>, then provide the
assembled transcripts in GTF format, otherwise, no options are needed. No need to provide
C<< -r >>, as it will be handled by lncRNApipe. Provide C<< -i transcript_assembly_list.txt >>
if you did not run the B<assembly stage>, where C<< transcript_assembly_list.txt >> contains
the full paths to the assembled transcript files.
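
For illustration, a hypothetical C<< transcript_assembly_list.txt >> simply lists the full path
to one assembled GTF file per line, for example:

    /data/projects/mm10/macrophages/rna-seq/m0/cufflinks/transcripts.gtf
    /data/projects/mm10/macrophages/rna-seq/m1/cufflinks/transcripts.gtf
    /data/projects/mm10/macrophages/rna-seq/m2/cufflinks/transcripts.gtf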

Ex:

    cuffcompare:
        - -i /data/cufflinks/assembly_list.txt

=item categorize_ncRNAs:

Provide C<< categorize_ncRNAs.pl >>'s options. See C<< perl lncRNApipe --h cat >> for a
description of the options. No need to provide the C<< -annot >> option, as it is handled by
lncRNApipe. Ex:

    categorize_ncRNAs:
        - -sample-names "cca,fpa,cpa,fca,ccs,cps,fcs,fps"
        - -len 200
        - -min-exons 1
        - -ov 80
        - -inc

=item get_unique_features:

Provide C<< get_unique_features.pl >>'s options. Generally only C<< -ov >> is required in
either case of C<< [performTranscriptAssembly: YES] >> or C<< [performTranscriptAssembly: NO] >>.
When you supply your own known ncRNAs file to compare against with C<< -sf >>, then you must
also specify its file format (Ex: gtf or bed) with the C<< -sff >> option. See
C<< perl lncRNApipe --h get >> for a description of the options. Ex:

    get_unique_features:
        - -ov 10
        - -sf /data/G_hirsutum/G_hirsutum_annotation.gff
        - -sff gtf

=item runRNAfold:

By default, C<< RNAfold >> is not run since it is very slow. It is generally recommended to
generate C<< RNAfold >> plots for the transcripts of interest after you have investigated the
results, but you can still enable it in the pipeline by uncommenting C<< runRNAfold: YES >> to
run C<< RNAfold >> with default options. To pass command line options to RNAfold, define them
after the line C<< RNAfold: >>. Mention any options other than C<< -p >> and C<< --noPS >>, as
those are automatically handled by lncRNApipe. See C<< perl lncRNApipe --h rna >> for a
description of the options. Ex:

    runRNAfold: YES
    RNAfold:
        - --circ

=item runcmscan:

Provide options to C<< cmscan >>. Running C<< cmscan >> with default options provides good
matches in most cases, but in case you want to add extra options, do it here. To run
C<< cmscan >> with default options, use C<< runcmscan: YES >>. Add any additional options you
want to pass to C<< cmscan >> after the line C<< cmscan: >>. Provide any options other than
C<< -o >>, C<< --tblout >> and C<< --cpu >>, as those are automatically handled by lncRNApipe.
See C<< perl lncRNApipe --h inf >> for a description of the options. Ex:

    runcmscan: YES
    cmscan:
        # - -E 9.0

=item cuffmerge:

Provide cuffmerge options. If you have replicates, the final predicted lncRNAs will be merged.
No need to provide C<< -s >>, as it will be handled by lncRNApipe. No need to provide
C<< -g >>, as it will be handled by lncRNApipe. Ex:

    cuffmerge:
        - --min-isoform-fraction 0.05

=item cuffdiff:

Provide C<< cuffdiff >> options if you want to run differential expression tests between known
ncRNAs and between novel ncRNAs in your samples. Normally, C<< cuffdiff >> is only run on the
identified known and novel ncRNAs. If you want to run C<< cuffdiff >> on all transcripts, i.e.
to identify differentially expressed transcripts (the end of the typical tuxedo pipeline), then
change C<< runCuffdiffForAllTranscripts >> to C<< YES >>. If you have mentioned
C<< -sample-names >> above in C<< categorize_ncRNAs: >>, there is no need to mention C<< -L >>,
else provide the C<< -L >> option here. No need to provide C<< -o >>, as it will be handled by
lncRNApipe. No need to provide C<< -p >>, as it will be handled by lncRNApipe. No need to
provide input files if you have run tophat and cufflinks above. If
C<< performTranscriptAssembly >> is C<< NO >>, then provide your own BAM files here prefixed by
the C<< -bam >> option. Separate replicate BAM files with a comma as you generally do with the
C<< cuffdiff >> command.

Ex:

    runCuffdiffForAllTranscripts: YES
    cuffdiff:
        - -u
        - -b /data/ref_fasta/rn4_UCSC/genome.fa
        # - -bam /data/aligned_rn4_UCSC/sample1/rep1.bam,/data/aligned_rn4_UCSC/sample1/rep2.bam /data/aligned_rn4_UCSC/sample2/rep1.bam,/data/aligned_rn4_UCSC/sample2/rep2.bam

=back

=head1 CAVEATS

The pipeline script uses a lot of inherent UNIX core utilities and has only been tested in the
BASH shell. Please use absolute full path names.

=over 5

=item *

Instead of using:

    lncRNApipe --run ./lncRNApipe_output ...

Use:

    lncRNApipe --run /data/lncRNApipe_output ...

=item *

If the pipeline setup fails due to the C<< XML::Parser >> module, you need to install the XML
parser C libraries. On Ubuntu / Debian based Linux distributions, as the C<< root >> user, do:

    apt-get install libexpat1 libexpat1-dev

On RedHat / Fedora / CentOS based Linux distributions, as the C<< root >> user, do:

    yum install expat expat-devel

=item *

Please make sure your C<< $SHELL >> is C<< /bin/bash >>. Do:

    echo $SHELL

If it is not C<< /bin/bash >>, use the command C<< chsh -s /bin/bash >> to change your login
shell. You need to log out and log back in for the default login shell change to take effect.

=item *

On newer Ubuntu distributions of Linux, even though C<< $SHELL >> shows C<< /bin/bash >>, the
shell in fact points to C<< /bin/dash >>. To change that, follow the appropriate instructions
for your distribution to point it back to C<< /bin/bash >>.

=back

=head1 OUTPUT

The final output is a putative list of annotated lncRNAs in GTF format that can be directly
uploaded as custom tracks to Genome Browsers, such as UCSC, along with secondary structures
predicted by RNAfold.

=over 5

=item *

Output from C<< trimmomatic.jar >> is stored in a directory called B<trimmomatic>. Replicate
data (if any) is maintained separately inside this directory. A C<< status.log >> file inside
this directory contains status information for each run.

=item *

Output from C<< tophat >> is stored in a directory called B<tophat>. Replicate data (if any) is
maintained separately inside this directory. A C<< status.log >> file inside this directory
contains status information for each run.

=item *

Output from C<< cufflinks >> is stored in a directory called B<cufflinks>. Replicate data (if
any) is maintained separately inside this directory. A C<< status.log >> file inside this
directory contains status information for each run.

=item *

Output from C<< cuffcompare >> is stored in the directory called B<cuffcompare>. lncRNApipe
uses the C<< lncRNApipe_cuffcmp.tracking >> file from this folder for the next module.

=item *

Output from C<< categorize_ncRNAs.pl >> is stored in its own directory. lncRNApipe uses files
ending with the suffix C<< .putative.class.lncRNAs.gtf >> for the next module.

=item *

Output from C<< get_unique_features.pl >> is stored in its own directory. lncRNApipe uses files
ending with the suffix C<< .putative.class.lncRNAs.unique.gtf >> or
C<< .putative.class.lncRNAs.known.gtf >> for the next module.

=item *

Output from C<< fetch_seq_from_ucsc.pl >> is stored in its own directory. lncRNApipe uses these
FASTA sequence files to run the CPC, RNAfold and Infernal modules. The file ending in the
suffix C<< .noncoding.FASTA >> is used for the next modules.

=item *

C<< CPC >> predictions are stored in their own directory.

=item *

The final result files are stored in a directory called B<lncRNApipe.final> and have the suffix
B<.final.gtf>.

=item *

The C<< RNAfold >> plots are stored in directories ending in the suffix C<< .final.RNAfold >>
inside the final results directory.

=item *

The result files from C<< cmscan >> (B<Infernal>) are stored in directories ending with the
suffix C<< .infernal >> inside the final results directory.

The file ending in the suffix C<< .allInfernalHits.gtf >> contains all RNA hmm and cm model
matches from C<< cmscan >> (B<Infernal>), and the best hit is applied to the file ending in the
suffix C<< .final.gtf >> based on the sequence coverage (C<< --coverage-infernal >>) filter
requested.

=item *

A dedicated output directory contains files from the merged novel ncRNAs. The C<< merged.gtf >>
file from this directory is used to get information about differentially expressed novel
ncRNAs.

=item *

A dedicated output directory contains files from the merged known ncRNAs. The C<< merged.gtf >>
file from this directory is used to get information about differentially expressed known
ncRNAs.

=item *

A dedicated output directory contains files from the C<< cuffdiff >> run on novel ncRNAs.

=item *

A dedicated output directory contains files from the C<< cuffdiff >> run on known ncRNAs.

=item *

The job scripts directory (C<< job_scripts.* >>) contains all the job scripts automatically
generated by lncRNApipe.

=back

The pipeline has the ability to re-run each of these individual modules if any of them fail
during the initial run.

=head2 Start the pipeline without using the configuration file:

=over 5

=item *

Run the complete pipeline without using the configuration file through command line options:

    perl lncRNApipe --run /data/lncRNApipe/ --cuff '-r /data/projects/mm10/macrophages/rna-seq/cuffcompare.m0.m1.m2_vs_ensembl_ref_known/refSeq_UCSCKnownGenes_Ensemble.gtf -s /data/ref_fasta/mm10_UCSC/whole_genome.fa /data/projects/mm10/macrophages/rna-seq/m0/cufflinks/transcripts.gtf /data/projects/mm10/macrophages/rna-seq/m1/cufflinks/transcripts.gtf /data/projects/mm10/macrophages/rna-seq/m2/cufflinks/transcripts.gtf' --cat '-clean -overlap 80 -len 200 -max-len 10000 -min-exons 1 -fpkm 2 -antisense -sample-names "M0,M1,M2"' --get '-sf /data/projects/mm10/macrophages/ncrna/known_ref_lncRNAs/mm10_ncRNA.bed' --fetch --cpc --inf

=item *

Run the B<categorize_ncRNAs.pl> module of the pipeline:

    perl lncRNApipe --run /data/lncRNApipe/ --cat '-clean -overlap 80 -len 200 -max-len 10000 -min-exons 1 -fpkm 2 -annotation /data/projects/mm10/macrophages/ncrna/2lncRNA/allGenes.txt -antisense -sample-names "M0,M1,M2" /data/projects/mm10/macrophages/rna-seq/m0/cufflinks/transcripts.gtf /data/projects/mm10/macrophages/rna-seq/m1/cufflinks/transcripts.gtf /data/projects/mm10/macrophages/rna-seq/m2/cufflinks/transcripts.gtf'

=item *

Run the B<get_unique_features.pl> module of the pipeline to extract novel lncRNAs:

    perl lncRNApipe --run /data/lncRNApipe --get '-sf /data/projects/mm10/macrophages/ncrna/known_ref_lncRNAs/mm10_ncRNA.bed'

=item *

Run the B<get_unique_features.pl> module of the pipeline to extract known lncRNAs:

    perl lncRNApipe --run /data/lncRNApipe --get '--known --extract -sf /data/projects/mm10/macrophages/ncrna/known_ref_lncRNAs/mm10_ncRNA.bed'

=item *

Run the B<fetch_seq_from_ucsc.pl> module of the pipeline:

    perl lncRNApipe --run /data/lncRNApipe --fetch '-local /data/ref_fasta/mm10_UCSC/genome.fa'

=item *

Run the B<CPC> module of the pipeline:

    perl lncRNApipe --run /data/lncRNApipe --cpc

=item *

Run the B<RNAfold> module of the pipeline:

    perl lncRNApipe --run /data/lncRNApipe --rna

=item *

Run the B<Infernal> module of the pipeline:

    perl lncRNApipe --run /data/lncRNApipe --inf

=back

=head1 OPTIONS

lncRNApipe takes the following arguments:

=over 4

=item --v or --version

(Optional) Displays version information and quits.

=item --d or --no-update-chk

(Optional) Disable software update check.

=item --h or --help

(Optional) Displays this helpful message.

=item --setup

Try to set up the pipeline with all of its dependencies.

=item --setup-compiler

If any of the Perl module installations fail during the setup process, use this option to
change the compiler to one that works with your machine.

Ex:

    perl lncRNApipe --setup --setup-compiler CC=icpc

=item --run

Run the lncRNApipe pipeline with the output directory supplied as an option. Ex:

    perl lncRNApipe --run /home/konganti/lncRNApipe_output ...

=item --cuff or --cuffcompare

Run the C<< cuffcompare >> module with the supplied options.

=item --cat or --cat-ncRNAs

Run the C<< categorize_ncRNAs.pl >> module with the supplied options.

=item --get or --get-uq-feat

Run the C<< get_unique_features.pl >> module with the supplied options.

=item --fetch or --fetch-seq

Run the C<< fetch_seq_from_ucsc.pl >> module with the supplied options. Using C<< --fetch >>
without any options will try to use the local reference FASTA mentioned with the
C<< --cuffcompare >> option.

=item --skip-get (--fetch-seq option required)

Run the C<< fetch_seq_from_ucsc.pl >> module without running the C<< get_unique_features.pl >>
module. In some cases, you may not have a file of known non-coding RNAs. In such cases, you can
skip the C<< get_unique_features.pl >> module and directly fetch FASTA sequences for features
from the C<< --cat-ncRNAs >> module.

=item --cpc

Run the C<< CPC >> module. No options are required.

=item --cpc-strand-reverse

(Optional) This option will direct CPC 2.0 to search the query strand in both forward and
reverse directions. Default: Only the forward strand is searched.

=item --skip-cpc

Skip running C<< CPC >> altogether. Use this option if you think CPC is flagging a lot of your
transcripts as coding and you instead want to rely only on the Infernal search results.

=item --skip-cpc-core

Skip running the core C<< CPC >> process once you know you have output from C<< CPC >>. This
option can be used when lncRNApipe fails for some reason after C<< CPC >> has finished running
but is unable to continue forward.

=item --rna or --rnafold

Run the C<< RNAfold >> module of the pipeline. No options are required.

=item --skip-rnafold-core

Skip running the core C<< RNAfold >> process once you know you have output from C<< RNAfold >>.
This option can be used when you want lncRNApipe to generate plots based on output from an
C<< RNAfold >> run that has already completed successfully.

=item --inf or --infernal

Run the Infernal module of the pipeline. No options are required.

=item --skip-cmscan-core

Skip running the C<< cmscan >> process once you know you have output from a successful
C<< cmscan >> run. This option can be used when you want lncRNApipe to extract annotation from
Infernal and attempt to annotate the putative novel lncRNAs.

=item --cov-inf or --coverage-infernal

Only annotate final putative lncRNAs that have a C<< cmscan >> match covering at least this
percentage of the sequence. Default: 10

=item --cpu

lncRNApipe will attempt to run multiple processes on this many CPU cores. Mentioning a number
of CPUs equal to or greater than the number of assembled transcript files is strongly
encouraged.

=item --send-run-report

lncRNApipe will send us your log file. For this to work, you need to save the output of the
pipeline to a log file. Ex:

    perl lncRNApipe --run /data/lncRNApipe/ \
         --cuffcmp '-r annotation.gtf -s genome.fa transcripts1.gtf transcripts2.gtf' \
         --cat-ncRNAs '-sample-names "M1,M2" -ov 80 -fpkm 2 -len 200 -max-len 10000 -min-exons 1 -antisense' \
         --fetch-seq '-db mm10' \
         --cpc \
         --rna \
         --inf &> lncRNApipe.run.log

and then, if the run fails or if you wish to discuss other issues:

Use:

    perl lncRNApipe --send-run-report '-email your_email@gmail.com -log lncRNApipe.run.log -m "I cannot run lncRNApipe because lorem ipsum ...."'

=item --dont-wait

Initially, when designing the pipeline, using C<< Schedule::DRMAAc >> was considered to submit
jobs via the DRMAA API in a grid computing environment. I later found that C<< drmaa.h >> is
not present on all cluster installations, and although cluster admins can install the DRMAA API
on request, that defeats the purpose of simply installing the pipeline and just running the
jobs. I then used a regex to parse the output of the batch submission command and create a
dependency chain from it. It works on most cluster installations where the scheduler is either
SGE, PBS, SLURM or LSF. Using the C<< --dont-wait >> option will not wait for user confirmation
to verify that the program has parsed the job IDs correctly. When the C<< --dont-wait >> option
is not used, the program will prompt the user to verify that the job IDs gathered are in fact
correct. Yes, this is rudimentary, but as I explained above, it gets the job done.

=item --kill

Kill jobs. Whether or not you used grid submission, all the job IDs or PIDs will be present in
3 files inside the job scripts directory: C<< job_IDs_tophat.RUNID >>,
C<< job_IDs_cufflinks.RUNID >> and C<< job_IDs_lncRNApipe_postAssembly.RUNID >>. If you want to
kill any jobs for any reason, do:

    perl lncRNApipe --conf params.yaml --kill job_scripts.Wed_Jun_24_11_30_44_2015/job_IDs_tophat.2CphrqPLqFdCs

If you want to kill jobs from multiple job IDs' files, do:

    perl lncRNApipe --conf params.yaml --kill job_scripts.Wed_Jun_24_11_30_44_2015/job_IDs_tophat.2CphrqPLqFdCs --kill job_scripts.Wed_Jun_24_11_30_44_2015/job_IDs_cufflinks.2CphrqPLqFdCs --kill job_scripts.Wed_Jun_24_11_30_44_2015/job_IDs_lncRNApipe_postAssembly.2CphrqPLqFdCs

=item --show-counts

Shows lncRNA counts for the requested files. If, for any reason, you want to get just the
counts of identified lncRNAs from lncRNApipe, use this option. Ex:

    perl lncRNApipe --show-counts lncRNApipe.final/transcripts.1.gtf --show-counts lncRNApipe.final/transcripts.2.gtf

=back

=head1 AUTHOR

Kranti Konganti, E<lt>konganti@tamu.eduE<gt>.

=head1 COPYRIGHT

This program is distributed under the Artistic License 2.0.

=head1 DATE

Oct-04-2019

=cut