#!/usr/bin/env ruby # Michael Matschiner, 2018-05-21 # # This script prepares XML format input files for the programs SNAPP (http://beast2.org/snapp/) # and SNAPPER (https://github.com/rbouckaert/snapper) given a SNP matrix in Phylip or VCF format, # a table linking species IDs and specimen IDs, and a file specifying age constraints. # Optionally, a starting tree can be provided (recommended) and the number of MCMC iterations # used by SNAPP or SNAPPER can be specified. Files prepared by this script use a particular # combination of priors and operators optimized for analyses that aim to estimate species # divergence times, and all population sizes are linked to enable SNAPP analysis with up to # around 20 or 30 populations. # Detailed annotation explaining the choice of priors and operators will be included in the # XML file (unless turned off with option '-n'). # # A manuscript on divergence-time estimation with SNAPP is in preparation. # # To learn about the available options of this script, run with '-h' (or without any options): # ruby snapp_prep.rb -h # # For questions and bug fixes, email to michaelmatschiner@mac.com # Load required libraries. require 'optparse' # Feedback. puts "" puts "snapp_prep.rb" puts "" puts "----------------------------------------------------------------------------------------" puts "" # Define default options. options = {} options[:analysis] = "SNAPP" options[:phylip] = nil options[:vcf] = nil options[:table] = "example.spc.txt" options[:constraints] = "example.con.txt" options[:tree] = nil options[:length] = 500000 options[:weight] = 1.0 options[:max_snps] = nil options[:min_dist] = nil options[:transversions] = false options[:transitions] = false options[:no_annotation] = false options[:xml] = nil options[:out] = nil # Get the command line options. ARGV << '-h' if ARGV.empty? opt_parser = OptionParser.new do |opt| opt.banner = "Usage: ruby #{$0} [OPTIONS]" opt.separator "" opt.separator "Example:" opt.separator "ruby #{$0} -p example.phy -t #{options[:table]} -c #{options[:constraints]}" opt.separator "" opt.separator "Options:" opt.on("-a","--analysis ANALYSIS","Analysis type, either 'SNAPP' or 'SNAPPER' (default: SNAPP).") {|a| options[:analysis] = a} opt.on("-p","--phylip FILENAME","File with SNP data in phylip format (default: none).") {|p| options[:phylip] = p} opt.on("-v","--vcf FILENAME","File with SNP data in vcf format (default: none).") {|v| options[:vcf] = v} opt.on("-t","--table FILENAME","File with table linking species and individuals (default: #{options[:table]}).") {|t| options[:table] = t} opt.on("-c","--constraints FILENAME","File with age constraint information (default: #{options[:constraints]}).") {|c| options[:constraints] = c} opt.on("-s","--starting-tree FILENAME","File with starting tree in Nexus or Newick format (default: none).") {|s| options[:tree] = s} opt.on("-l","--length LENGTH",Integer,"Number of MCMC generations (default: #{options[:length]}).") {|l| options[:length] = l} opt.on("-w","--weight WEIGHT",Float,"Relative weight of topology operator (default: #{options[:weight]}).") {|w| options[:weight] = w} opt.on("-m","--max-snps NUMBER",Integer,"Maximum number of SNPs to be used (default: no maximum).") {|m| options[:max_snps] = m} opt.on("-q","--min-dist NUMBER",Integer,"Minimum distance between SNPs to be used (default: no minimum).") {|q| options[:min_dist] = q} opt.on("-r","--transversions","Use transversions only (default: #{options[:transversions]}).") {options[:transversions] = true} opt.on("-i","--transitions","Use transitions only (default: #{options[:transitions]}).") {options[:transitions] = true} opt.on("-x","--xml FILENAME","Output file in XML format (default: ANALYSIS.xml).") {|x| options[:xml] = x} opt.on("-o","--out PREFIX","Prefix for .log and .trees output files (default: ANALYSIS).") {|i| options[:out] = i} opt.on("-n","--no-annotation","Do not add explanatory annotation to XML file (default: #{options[:no_annotation]}).") {options[:no_annotation] = true} opt.on("-h","--help","Print this help text.") { puts opt_parser exit(0) } opt.separator "" end opt_parser.parse! # Get the analysis type. if options[:analysis].downcase == "snapp" analysis_type = "snapp" elsif options[:analysis].downcase == "snapper" analysis_type = "snapper" else puts "ERROR: Unknown analysis type '#{analysis_type}'!" exit(1) end options[:xml] = "#{analysis_type}.xml" if options[:xml] == nil options[:out] = "#{analysis_type}" if options[:out] == nil # Make sure that input is provided in either phylip or vcf format. if options[:phylip] == nil and options[:vcf] == nil puts "ERROR: An input file must be provided, either in phylip format with option '-p' or in vcf format with option '-v'!" exit(1) elsif options[:phylip] != nil and options[:vcf] != nil puts "ERROR: Only one of the two options '-p' and '-v' can be used!" exit(1) end # Make sure that the -r and -i options are not used jointly. if options[:transversions] and options[:transitions] puts "ERROR: Only one of the two options '-r' and '-i' can be used!" exit(1) end # Make sure that the specified weight is not negative. if options[:weight] < 0 puts "ERROR: The specified relative weight of the topology operator must not be negative!" exit(1) end # Initiate a warn string and counts for excluded sites. warn_string = "" number_of_excluded_sites_missing = 0 number_of_excluded_sites_monomorphic = 0 number_of_excluded_sites_triallelic = 0 number_of_excluded_sites_tetraallelic = 0 number_of_excluded_sites_indel = 0 number_of_excluded_sites_transition = 0 number_of_excluded_sites_transversion = 0 number_of_sites_with_half_call = 0 # Initiate arrays for specimen ids and sequences. specimen_ids = [] seqs = [] # Define various characters. binary_chars = ["0","1","2"] nucleotide_chars = ["A","C","G","T","R","Y","S","W","K","M"] ambiguous_chars = ["R","Y","S","W","K","M"] missing_chars = ["-","?","N"] # Read the phylip file. if options[:phylip] != nil phylip_file = File.open(options[:phylip]) phylip_lines = phylip_file.readlines phylip_lines[1..-1].each do |l| unless l.strip == "" specimen_ids << l.split[0] seqs << l.split[1].upcase end end # Recognize the sequence format (nucleotides or binary). unique_seq_chars = [] seqs.each do |s| s.size.times do |pos| unless missing_chars.include?(s[pos]) unique_seq_chars << s[pos] unless unique_seq_chars.include?(s[pos]) end end end sequence_format_is_nucleotide = true sequence_format_is_binary = true unique_seq_chars.each do |c| sequence_format_is_binary = false unless binary_chars.include?(c) sequence_format_is_nucleotide = false unless nucleotide_chars.include?(c) end if sequence_format_is_binary == false and sequence_format_is_nucleotide == false puts "ERROR: Sequence format could not be recognized as either 'nucleotide' or 'binary'!" exit(1) end end # Alternatively, read the vcf file. tmp_line_count_all = 0 if options[:vcf] != nil vcf_file = File.open(options[:vcf]) vcf_lines = vcf_file.readlines vcf_header_line = "" lgs = [] positions_on_lgs = [] vcf_lines.each do |l| if l[0..1] != "##" if l[0] == "#" and vcf_header_line == "" vcf_header_line = l vcf_header_line_ary = vcf_header_line.split specimen_ids = vcf_header_line_ary[9..-1] specimen_ids.size.times {seqs << ""} elsif vcf_header_line != "" tmp_line_count_all += 1 line_ary = l.split lg = line_ary[0] position_on_lg = line_ary[1].to_i ref = line_ary[3] alt = line_ary[4] if ref.size == 1 and alt.size == 1 format_str = line_ary[8] gt_index = format_str.split(":").index("GT") if gt_index == nil puts "ERROR: Expected 'GT' in FORMAT field but could not find it!" exit(1) end specimen_index = 0 found_half_called_gt = false line_ary[9..-1].each do |rec| gt = rec.split(":")[gt_index] gt = "./." if gt == "." if gt.include?("/") gt1 = gt.split("/")[0] gt2 = gt.split("/")[1] elsif gt.include?("|") gt1 = gt.split("|")[0] gt2 = gt.split("|")[1] else puts "ERROR: Expected alleles to be separated by '/' or '|' but did not found such separators!" puts l # TODO XXX remove exit(1) end if ["0","1","."].include?(gt1) and ["0","1","."].include?(gt2) if gt1 == "0" base1 = ref elsif gt1 == "1" base1 = alt else base1 = "N" end if gt2 == "0" base2 = ref elsif gt2 == "1" base2 = alt else base2 = "N" end if base1 == "N" and base2 == "N" seqs[specimen_index] << "N" elsif base1 == "N" or base2 == "N" seqs[specimen_index] << "N" found_half_called_gt = true elsif [base1,base2].sort == ["A","A"] seqs[specimen_index] << "A" elsif [base1,base2].sort == ["A","C"] seqs[specimen_index] << "M" elsif [base1,base2].sort == ["A","G"] seqs[specimen_index] << "R" elsif [base1,base2].sort == ["A","T"] seqs[specimen_index] << "W" elsif [base1,base2].sort == ["C","C"] seqs[specimen_index] << "C" elsif [base1,base2].sort == ["C","G"] seqs[specimen_index] << "S" elsif [base1,base2].sort == ["C","T"] seqs[specimen_index] << "Y" elsif [base1,base2].sort == ["G","G"] seqs[specimen_index] << "G" elsif [base1,base2].sort == ["G","T"] seqs[specimen_index] << "K" elsif [base1,base2].sort == ["T","T"] seqs[specimen_index] << "T" else puts "ERROR: Unexpected genotype: #{base1} and #{base2}!" exit(1) end else puts "ERROR: Expected genotypes to be bi-allelic and contain only 0s and/or 1s or missing data marked with '.', but found #{gt1} and #{gt2}!" exit(1) end specimen_index += 1 end lgs << lg positions_on_lgs << position_on_lg number_of_sites_with_half_call += 1 if found_half_called_gt elsif ref.size > 1 and ref.include?(",") == false number_of_excluded_sites_indel += 1 elsif alt.size > 1 and alt.include?(",") == false number_of_excluded_sites_indel += 1 elsif ref.include?(",") or alt.include?(",") number_of_ref_and_alt_alleles = ref.count(",") + alt.count(",") + 2 if number_of_ref_and_alt_alleles == 3 number_of_excluded_sites_triallelic += 1 elsif number_of_ref_and_alt_alleles == 4 number_of_excluded_sites_tetraallelic += 1 else puts "ERROR: Unexpected combination of REF and ALT alleles (REF: #{ref}; ALT: #{alt})!" exit(1) end end else puts "ERROR: Expected a vcf header line beginning with '#CHROM' but could not find it!" exit(1) end end end # Make sure that all sequences have a positive and equal length. seqs[1..-1].each do |s| if s.size != seqs[0].size puts "ERROR: Sequences have different lengths!" exit(1) end end # Make sure that information on linkage groups and positions on these have the same length. if lgs.size != seqs[0].size or positions_on_lgs.size != seqs[0].size puts "ERROR: Information on linkage groups or SNP positions is inconsistent with number of genotypes!" puts lgs.size # Todo Remove puts seqs[0].size puts positions_on_lgs.size exit(1) end # Specify that the sequence format is nucleotide. sequence_format_is_binary = false end # If necessary, translate the sequences into SNAPP's "0", "1", "2" code, where "1" is heterozygous. binary_seqs = [] seqs.size.times{binary_seqs << ""} if options[:vcf] != nil binary_lgs = [] binary_positions_on_lgs = [] end if sequence_format_is_binary seqs[0].size.times do |pos| alleles_at_this_pos = [] seqs.each do |s| alleles_at_this_pos << s[pos] if binary_chars.include?(s[pos]) end uniq_alleles_at_this_pos = alleles_at_this_pos.uniq if uniq_alleles_at_this_pos.size == 2 or uniq_alleles_at_this_pos.size == 3 seqs.size.times do |x| binary_seqs[x] << seqs[x][pos] end if options[:vcf] != nil binary_lgs << lgs[pos] binary_positions_on_lgs << positions_on_lgs[pos] end elsif uniq_alleles_at_this_pos.size == 0 number_of_excluded_sites_missing += 1 elsif uniq_alleles_at_this_pos.size == 1 number_of_excluded_sites_monomorphic += 1 end end # Check if the total number of '0' and '2' in the data set are similar. total_number_of_0s = 0 total_number_of_2s = 0 binary_seqs.each do |b| total_number_of_0s += b.count("0") total_number_of_2s += b.count("2") end goal = 0.5 tolerance = 0.01 if (total_number_of_0s/(total_number_of_0s+total_number_of_2s).to_f - goal).abs > tolerance warn_string << "WARNING: The number of '0' and '2' in the data set is expected to be similar, however,\n" warn_string << " they differ by more than #{tolerance*100.round} percent.\n" warn_string << "\n" end else seqs[0].size.times do |pos| # Collect all bases at this position. bases_at_this_pos = [] seqs.each do |s| if s[pos] == "A" bases_at_this_pos << "A" bases_at_this_pos << "A" elsif s[pos] == "C" bases_at_this_pos << "C" bases_at_this_pos << "C" elsif s[pos] == "G" bases_at_this_pos << "G" bases_at_this_pos << "G" elsif s[pos] == "T" bases_at_this_pos << "T" bases_at_this_pos << "T" elsif s[pos] == "R" bases_at_this_pos << "A" bases_at_this_pos << "G" elsif s[pos] == "Y" bases_at_this_pos << "C" bases_at_this_pos << "T" elsif s[pos] == "S" bases_at_this_pos << "G" bases_at_this_pos << "C" elsif s[pos] == "W" bases_at_this_pos << "A" bases_at_this_pos << "T" elsif s[pos] == "K" bases_at_this_pos << "G" bases_at_this_pos << "T" elsif s[pos] == "M" bases_at_this_pos << "A" bases_at_this_pos << "C" else unless ["N","?","-"].include?(s[pos]) puts "ERROR: Found unexpected base at position #{pos+1}: #{s[pos]}!" exit(1) end end end uniq_bases_at_this_pos = bases_at_this_pos.uniq # Issue a warning if non-bi-allelic sites are excluded. if uniq_bases_at_this_pos.size == 2 # Determine if this is a transition or transversion site. transversion_site = false if uniq_bases_at_this_pos.sort == ["A","C"] transversion_site = true elsif uniq_bases_at_this_pos.sort == ["A","G"] transversion_site = false elsif uniq_bases_at_this_pos.sort == ["A","T"] transversion_site = true elsif uniq_bases_at_this_pos.sort == ["C","G"] transversion_site = true elsif uniq_bases_at_this_pos.sort == ["C","T"] transversion_site = false elsif uniq_bases_at_this_pos.sort == ["G","T"] transversion_site = true else puts "ERROR: Unexpected combination of unique bases at position #{pos+1}: #{uniq_bases_at_this_pos[0]}, #{uniq_bases_at_this_pos[1]}" exit(1) end transition_site = true transition_site = false if transversion_site == true if options[:transversions] == true and transversion_site == false # If the site is a transition and only transversions are allowed. number_of_excluded_sites_transition += 1 elsif options[:transitions] == true and transition_site == false # If the site is a transversion and only transitions are allowed. number_of_excluded_sites_transversion += 1 else # Randomly define what's "0" and "2". uniq_bases_at_this_pos.shuffle! seqs.size.times do |x| if seqs[x][pos] == uniq_bases_at_this_pos[0] binary_seqs[x] << "0" elsif seqs[x][pos] == uniq_bases_at_this_pos[1] binary_seqs[x] << "2" elsif missing_chars.include?(seqs[x][pos]) binary_seqs[x] << "-" elsif ambiguous_chars.include?(seqs[x][pos]) binary_seqs[x] << "1" else puts "ERROR: Found unexpected base at position #{pos+1}: #{seqs[x][pos]}!" exit(1) end end if options[:vcf] != nil binary_lgs << lgs[pos] binary_positions_on_lgs << positions_on_lgs[pos] end end elsif uniq_bases_at_this_pos.size == 0 number_of_excluded_sites_missing += 1 elsif uniq_bases_at_this_pos.size == 1 number_of_excluded_sites_monomorphic += 1 elsif uniq_bases_at_this_pos.size == 3 number_of_excluded_sites_triallelic += 1 elsif uniq_bases_at_this_pos.size == 4 number_of_excluded_sites_tetraallelic += 1 else puts "ERROR: Found unexpected number of alleles at position #{pos+1}!" exit(1) end end end # Make sure that information on linkage groups and positions on these have the same length. if options[:vcf] != nil if binary_lgs.size != binary_seqs[0].size or binary_positions_on_lgs.size != binary_seqs[0].size puts "ERROR: Information on linkage groups or SNP positions is inconsistent with number of genotypes!" exit(1) end end # Read the file with a table linking species and individuals. table_file = File.open(options[:table]) table_lines = table_file.readlines table_species = [] table_specimens = [] table_lines.each do |l| next if l.strip == "" line_ary = l.split header_line = false header_line = true if line_ary[0].downcase == "species" and line_ary[1].downcase == "specimen" header_line = true if line_ary[0].downcase == "species" and line_ary[1].downcase == "specimens" header_line = true if line_ary[0].downcase == "species" and line_ary[1].downcase == "sample" header_line = true if line_ary[0].downcase == "species" and line_ary[1].downcase == "samples" header_line = true if line_ary[0].downcase == "species" and line_ary[1].downcase == "individual" header_line = true if line_ary[0].downcase == "species" and line_ary[1].downcase == "individuals" unless header_line table_species << line_ary[0] table_specimens << line_ary[1] end end # Make sure that the arrays table_specimens and specimen_ids are identical when sorted. unless table_specimens.sort == specimen_ids.sort if options[:vcf] != nil input_file_name = options[:vcf] else input_file_name = options[:phylip] end puts "ERROR: The individuals listed in file #{options[:table]} and those included in the input file #{input_file_name} are not identical!" error_string = " File #{options[:table]} includes individuals " table_specimens.sort[0..-2].each do |s| error_string << "#{s}, " end error_string << "and #{table_specimens.sort[-1]}." puts error_string error_string = " The input file #{input_file_name} includes individuals " specimen_ids.sort[0..-2].each do |s| error_string << "#{s}, " end error_string << "and #{specimen_ids.sort[-1]}." puts error_string exit(1) end # Remove sites at which one or more species have only missing data; these could not be used by SNAPP anyway. binary_seqs_filtered = [] binary_seqs.size.times {binary_seqs_filtered << ""} if options[:vcf] != nil binary_lgs_filtered = [] binary_positions_on_lgs_filtered = [] end binary_seqs[0].size.times do |pos| one_or_more_species_have_only_missing_data_at_this_pos = false table_species.uniq.each do |spc| specimens_for_this_species = [] table_specimens.size.times do |x| specimens_for_this_species << table_specimens[x] if table_species[x] == spc end alleles_for_this_species_at_this_pos = [] specimen_ids.size.times do |x| if specimens_for_this_species.include?(specimen_ids[x]) alleles_for_this_species_at_this_pos << binary_seqs[x][pos] end end if alleles_for_this_species_at_this_pos.uniq == ["-"] one_or_more_species_have_only_missing_data_at_this_pos = true end end # Set all alleles at this position to nil if one species had only missing data. if one_or_more_species_have_only_missing_data_at_this_pos number_of_excluded_sites_missing += 1 else binary_seqs.size.times do |x| binary_seqs_filtered[x] << binary_seqs[x][pos] end if options[:vcf] != nil binary_lgs_filtered << binary_lgs[pos] binary_positions_on_lgs_filtered << binary_positions_on_lgs[pos] end end end binary_seqs = binary_seqs_filtered if options[:vcf] != nil binary_lgs = binary_lgs_filtered binary_positions_on_lgs = binary_positions_on_lgs_filtered end # Make sure that information on linkage groups and positions on these have the same length. if options[:vcf] != nil if binary_lgs.size != binary_seqs[0].size or binary_positions_on_lgs.size != binary_seqs[0].size puts "ERROR: Information on linkage groups or SNP positions is inconsistent with number of genotypes!" exit(1) end end # If a minimum distance between SNPs has been set, thin the data set according to this number. if options[:min_dist] != nil and options[:min_dist] > 1 number_of_excluded_sites_due_to_min_dist = 0 binary_seqs_filtered = [] binary_seqs.size.times {binary_seqs_filtered << ""} if options[:vcf] != nil last_used_lg = nil last_used_pos_on_lg = nil binary_seqs[0].size.times do |pos| if last_used_lg != binary_lgs[pos] or last_used_pos_on_lg <= binary_positions_on_lgs[pos] - options[:min_dist] last_used_lg = binary_lgs[pos] last_used_pos_on_lg = binary_positions_on_lgs[pos] binary_seqs.size.times do |x| binary_seqs_filtered[x] << binary_seqs[x][pos] end else number_of_excluded_sites_due_to_min_dist += 1 end end else last_used_pos = nil binary_seqs[0].size.times do |pos| if last_used_pos == nil or last_used_pos <= pos - options[:min_dist] last_used_pos = pos binary_seqs.size.times do |x| binary_seqs_filtered[x] << binary_seqs[x][pos] end else number_of_excluded_sites_due_to_min_dist += 1 end end end binary_seqs = binary_seqs_filtered end # If a maximum number of SNPs has been set, reduce the data set to this number. number_of_sites_before_excluding_due_to_max = binary_seqs[0].size number_of_excluded_sites_due_to_max = 0 if options[:max_snps] != nil if options[:max_snps] < 1 puts "ERROR: The specified maximum number of SNPs must be positive!" exit(1) elsif options[:max_snps] < binary_seqs[0].size seq_indices = [] binary_seqs[0].size.times {|x| seq_indices << x} selected_seq_indices = seq_indices.sample(options[:max_snps]).sort binary_seqs_red = [] binary_seqs.each do |s| binary_seq_red = "" selected_seq_indices.each do |i| binary_seq_red << s[i] end binary_seqs_red << binary_seq_red end binary_seqs = binary_seqs_red number_of_excluded_sites_due_to_max = number_of_sites_before_excluding_due_to_max - options[:max_snps] else warn_string << "WARNING: The maximum number of SNPs has been set to #{options[:max_snps]}, which is greater\n" warn_string << " than the number of bi-allelic SNPs with sufficient information (#{binary_seqs[0].size}) for #{analysis_type.upcase}.\n" end end # Compose the warn string if necessary. if number_of_sites_with_half_call > 0 warn_string << "WARNING: Found #{number_of_sites_with_half_call} site" warn_string << "s" if number_of_sites_with_half_call > 1 warn_string << " with genotypes that were half missing. These genotypes were ignored.\n" end if number_of_excluded_sites_missing > 0 warn_string << "WARNING: Excluded #{number_of_excluded_sites_missing} site" warn_string << "s" if number_of_excluded_sites_missing > 1 warn_string << " with only missing data in one or more species.\n" end if number_of_excluded_sites_monomorphic > 0 warn_string << "WARNING: Excluded #{number_of_excluded_sites_monomorphic} monomorphic site" warn_string << "s" if number_of_excluded_sites_monomorphic > 1 warn_string << ".\n" end if number_of_excluded_sites_transition > 0 warn_string << "WARNING: Excluded #{number_of_excluded_sites_transition} transition site" warn_string << "s" if number_of_excluded_sites_transition > 1 warn_string << ".\n" end if number_of_excluded_sites_transversion > 0 warn_string << "WARNING: Excluded #{number_of_excluded_sites_transversion} transversion site" warn_string << "s" if number_of_excluded_sites_transversion > 1 warn_string << ".\n" end if number_of_excluded_sites_triallelic > 0 warn_string << "WARNING: Excluded #{number_of_excluded_sites_triallelic} tri-allelic site" warn_string << "s" if number_of_excluded_sites_triallelic > 1 warn_string << ".\n" end if number_of_excluded_sites_tetraallelic > 0 warn_string << "WARNING: Excluded #{number_of_excluded_sites_tetraallelic} tetra-allelic site" warn_string << "s" if number_of_excluded_sites_tetraallelic > 1 warn_string << ".\n" end # If there were any warning, print them. unless warn_string == "" warn_string << "\n" puts warn_string end # Compose the info string if necessary. info_string = "" if options[:vcf] != nil and options[:min_dist] != nil and options[:min_dist] > 1 info_string << "INFO: Removed #{number_of_excluded_sites_due_to_min_dist} bi-allelic sites due to specified minimum distance between sites of #{options[:min_dist]} bp.\n" elsif options[:min_dist] != nil and options[:min_dist] > 1 info_string << "INFO: Removed #{number_of_excluded_sites_due_to_min_dist} bi-allelic sites due to specified minimum distance between sites of #{options[:min_dist]} bp.\n" end if options[:max_snps] != nil info_string << "INFO: Removed #{number_of_excluded_sites_due_to_max} bi-allelic sites due to specified maximum number of #{options[:max_snps]} sites.\n" end if options[:transversions] info_string = "INFO: Retained #{binary_seqs[0].size} bi-allelic transversion sites.\n" elsif options[:transitions] info_string = "INFO: Retained #{binary_seqs[0].size} bi-allelic transition sites.\n" else info_string = "INFO: Retained #{binary_seqs[0].size} bi-allelic sites.\n" end # If there were any infos, print them. unless info_string == "" info_string << "\n" puts info_string end # Read the file with age constraint information. constraint_file = File.open(options[:constraints]) constraint_lines = constraint_file.readlines constraint_strings = [] cladeage_constraints_used = false constraint_lines.each do |l| if ["normal","lognor","unifor","expone","cladea","monoph"].include?(l[0..5].downcase) cladeage_constraints_used = true if l[0..7].downcase == "cladeage" constraint_strings << l end end if constraint_strings.size == 0 puts "WARNING: No age constraints could be found in file #{options[:constraints]}. You will have to manually modify the XML file to include age constraints." end if cladeage_constraints_used info_string = "INFO: CladeAge constraints are specified in file #{options[:constraints]}.\n" info_string << " To use these in #{analysis_type.upcase}, make sure that the CladeAge package for BEAST2 is installed.\n" info_string << " Installation instructions can be found at http://evoinformatics.eu/cladeage.pdf.\n" info_string << "\n" puts info_string end # Read the starting tree input file if specified. if options[:tree] tree_file = File.open(options[:tree]) tree_string = tree_file.readlines[0].strip.chomp(";").gsub(/\[.+?\]/,"") else warn_string = "WARNING: As no starting tree has been specified, a random starting tree will be used by\n" warn_string << " #{analysis_type.upcase}. If the random starting tree is in conflict with specified constraints, #{analysis_type.upcase}\n" warn_string << " may not be able to find a suitable state to initiate the MCMC chain. This will lead\n" warn_string << " to an error message such as 'Could not find a proper state to initialise'. If such\n" warn_string << " a problem is encountered, it can be solved by providing a starting tree in which\n" warn_string << " the ages of constrained clades agree with the constraints placed on these clades.\n" warn_string << "\n" puts warn_string end # Set run parameters. store_frequency = options[:length]/2000 screen_frequency = 50 snapp_log_file_name = "#{options[:out]}.log" snapp_trees_file_name = "#{options[:out]}.trees" # Prepare XML input string. xml_string = "" xml_string << "\n" xml_string << "\n" xml_string << "\n" xml_string << "\n" unless options[:no_annotation] xml_string << "\n" end xml_string << "\n" specimen_ids.size.times do |x| xml_string << " \n" end xml_string << "\n" xml_string << "\n" xml_string << "\n" xml_string << "beast.base.inference.distribution.Uniform\n" xml_string << "beast.base.inference.distribution.Exponential\n" xml_string << "beast.base.inference.distribution.LogNormalDistributionModel\n" xml_string << "beast.base.inference.distribution.Normal\n" xml_string << "beast.base.inference.distribution.Gamma\n" xml_string << "beast.base.inference.distribution.OneOnX\n" xml_string << "beast.base.inference.distribution.Prior\n" xml_string << "\n" xml_string << "\n" xml_string << "\n" xml_string << " \n" xml_string << " \n" if options[:tree] xml_string << " \n" else xml_string << " \n" end if analysis_type == "snapp" xml_string << " \n" elsif analysis_type == "snapper" xml_string << " \n" else puts "ERROR: Unknown analysis type '#{analysis_type}'!" exit(1) end xml_string << " \n" table_species.uniq.each do |s| xml_string << " \n" table_species.size.times do |x| if table_species[x] == s xml_string << " \n" end end xml_string << " \n" end xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " 0.1\n" xml_string << " 200.0\n" xml_string << " 0.01\n" xml_string << " \n" xml_string << "\n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << "\n" xml_string << " \n" constraint_count = 0 constraint_strings.each do |c| constraint_count += 1 constraint_ary = c.split unless constraint_ary.size == 3 puts "ERROR: Expected three character strings per line for each constraint specification, but found" puts " '#{c.strip}'," exit(1) end constraint_distribution = constraint_ary[0].downcase constraint_placement = constraint_ary[1].downcase constraint_clade = constraint_ary[2] constraint_clade_ary = constraint_clade.split(",") constraint_clade_ary.each do |s| unless table_species.include?(s) puts "ERROR: A constraint has been specified for a clade that includes species #{s}, however," puts " this species could not be found in file #{options[:table]}!" exit(1) end end unless ["stem","crown","na"].include?(constraint_placement) puts "ERROR: Expected 'stem', 'crown', or 'NA' (only for monophyly constraints without calibration)" puts " but found '#{constraint_placement}' as the second character string in '#{c}'!" exit(1) end if constraint_distribution.include?("(") == false and constraint_distribution != "monophyletic" puts "ERROR: Expected parameters in parentheses as part of the first character string in " puts " '#{c.strip}'," puts " but found '#{constraint_distribution}'!" exit(1) end if constraint_distribution == "monophyletic" constraint_type = "monophyletic" else constraint_type = constraint_distribution.split("(")[0].strip end unless ["normal","lognormal","uniform","exponential","cladeage","monophyletic"].include?(constraint_type) puts "ERROR: Expected 'normal', 'lognormal', 'uniform', 'exponential', 'cladeage', or 'monophyletic'" puts " as part of the first character string in '#{c}' but found '#{constraint_type}'!" exit(1) end unless constraint_type == "monophyletic" constraint_parameters = constraint_distribution.split("(")[1].strip.chomp(")").split(",") end if constraint_type == "normal" unless constraint_parameters.size == 3 puts "ERROR: Expected 3 parameters for normal distribution, but found #{constraint_parameters.size}!" exit(1) end elsif constraint_type == "lognormal" unless constraint_parameters.size == 3 puts "ERROR: Expected 3 parameters for lognormal distribution, but found #{constraint_parameters.size}!" exit(1) end elsif constraint_type == "uniform" unless constraint_parameters.size == 2 puts "ERROR: Expected 2 parameters for uniform distribution, but found #{constraint_parameters.size}!" exit(1) end elsif constraint_type == "exponential" unless constraint_parameters.size == 2 puts "ERROR: Expected 2 parameters for exponential distribution, but found #{constraint_parameters.size}!" exit(1) end elsif constraint_type == "cladeage" unless constraint_parameters.size == 8 puts "ERROR: Expected 8 parameters for cladeage distribution, but found #{constraint_parameters.size}!" exit(1) end else unless constraint_type == "monophyletic" puts "ERROR: Unexpected constraint type '#{constraint_type}'!" exit(1) end end constraint_id = constraint_count.to_s.rjust(4).gsub(" ","0") if constraint_type == "cladeage" xml_string << " \n" else xml_string << " \n" end xml_string << " \n" constraint_clade_ary.each do |s| xml_string << " \n" end xml_string << " \n" if constraint_type == "normal" xml_string << " \n" xml_string << " #{constraint_parameters[1]}\n" xml_string << " #{constraint_parameters[2]}\n" xml_string << " \n" elsif constraint_type == "lognormal" xml_string << " \n" xml_string << " #{constraint_parameters[1]}\n" xml_string << " #{constraint_parameters[2]}\n" xml_string << " \n" elsif constraint_type == "uniform" xml_string << " \n" elsif constraint_type == "exponential" xml_string << " \n" elsif constraint_type == "cladeage" xml_string << " \n" end xml_string << " \n" end unless options[:no_annotation] xml_string << " \n" end xml_string << " \n" xml_string << " \n" xml_string << " \n" unless options[:no_annotation] xml_string << " \n" end xml_string << " \n" xml_string << " \n" xml_string << " \n" unless options[:no_annotation] xml_string << " \n" end xml_string << " \n" unless options[:no_annotation] xml_string << " \n" end xml_string << " 1.0\n" xml_string << " 1.0\n" xml_string << " \n" xml_string << " \n" xml_string << " \n" if analysis_type == "snapp" xml_string << " \n" elsif analysis_type == "snapper" xml_string << " \n" else puts "ERROR: Unknown analysis type '#{analysis_type}'!" exit(1) end xml_string << " \n" if analysis_type == "snapp" xml_string << " \n" elsif analysis_type == "snapper" xml_string << " \n" else puts "ERROR: Unknown analysis type '#{analysis_type}'!" exit(1) end unless options[:no_annotation] xml_string << " \n" end xml_string << " 1.0\n" xml_string << " 1.0\n" if analysis_type == "snapper" xml_string << " \n" end xml_string << " \n" xml_string << " \n" unless options[:no_annotation] xml_string << " \n" end xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << "\n" xml_string << " \n" if options[:weight] > 0 unless options[:no_annotation] xml_string << " \n" end xml_string << " \n" end unless options[:no_annotation] xml_string << " \n" end xml_string << " \n" xml_string << " \n" unless options[:no_annotation] xml_string << " \n" end xml_string << " \n" unless options[:no_annotation] xml_string << " \n" end xml_string << " \n" xml_string << " \n" xml_string << "\n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << " \n" xml_string << "\n" xml_string << "\n" xml_string << "\n" xml_string << "\n" # Write the XML input file. snapp_file = File.open(options[:xml],"w") snapp_file.write(xml_string) puts "Wrote #{analysis_type.upcase} input in XML format to file #{options[:xml]}.\n\n"