#!/usr/bin/env Rscript
### Parse BLAST results and discard non-target OTUs (if any)

### Specify input file
 # BLAST 10 hits output from PipeCraft
blast_10_hits_file = "BLAST_10_best_hits.txt" 

### Specify target group(s) (if any)
 # Target taxonomic group(s) to keep
target = c("Fungi")
 # Taxonomic level to filter on: 
 #           Kingdom | Phylum | Class | Order | Family | Genus | Species
tax_level = "Kingdom"

### Specify sim_score thresholds for taxonomic levels
 # Minimum sim_score required for reliable assignment at each level
sim_score_thresholds = list(
  Class = 75,
  Order = 80,
  Family = 85,
  Genus = 90,
  Species = 97
)

### Specify e-value thresholds
 # e-value < e-50: reliable for kingdom assignment
 # e-value > e-20: mark as "unknown"
evalue_reliable = 1e-50  # Reliable threshold for kingdom assignment
evalue_unknown = 1e-20   # Unknown threshold, mark OTU as "unknown"  
#--------------------------------------#
#--------------------------------------#

# Load blast_10_hits_file
blast_10_hits = read.table("BLAST_10_best_hits.txt", 
                           header = TRUE, sep = "+", fill = TRUE)


########################################
### Make consensus from 10 best hits ###
########################################
# Description of the method:
# Consensus taxonomy is derived from up to 10 BLAST hits per OTU using the following steps:
#
# 1. E-value filtering:
#    - Hits with e-value > e-20 are excluded (marked as "unknown" if all hits exceed this)
#    - For Kingdom/Phylum: only hits with e-value < e-50 are considered (reliable threshold)
#    - For lower ranks (Class, Order, Family, Genus, Species): hits with e-value < e-20 are used
#
# 2. Sim_score quality filtering:
#    - consensus is built from high-quality hits only
#    - Only "good" hits are used: hits with sim_score within 5 points of the best hit
#
# 3. Rank-specific consensus (processed from Species → Genus → Family → Order → Class):
#    - For each rank, only hits meeting the sim_score threshold are considered:
#      * Species: sim_score >= 97
#      * Genus: sim_score >= 90
#      * Family: sim_score >= 85
#      * Order: sim_score >= 80
#      * Class: sim_score >= 75
#    - Frequency counting: counts how many hits assign each taxonomic value
#    - Score comparison: gets the best sim_score for each unique taxonomic value
#    - Consensus decision:
#      * If majority value has best score, assign it
#      * If minority value has score >2 points higher than majority, don't assign this rank
#        (check if all hits agree at a higher taxonomic rank instead)
#    - All ranks that can be assigned are assigned (not just the most specific)
#
# 4. Hierarchical assignment:
#    - If a rank cannot be assigned, all more specific ranks are set to NA
#    - If Species/Genus cannot be assigned, checks if all hits agree at Family/Order/Class level
#
# 5. Output:
#    - Returns consensus taxonomy with all ranks that meet their thresholds
#------------------------------------------------------------------------------#

# Parse semicolon-separated taxonomy string into a data frame
#     with separate columns for Accession, Kingdom, Phylum, Class, Order,
#     Family, Genus, and Species. Species is formatted as "Genus_species".
# Format: Accession;k__Kingdom;p__Phylum;
#           c__Class;o__Order;f__Family;g__Genus;s__Species
parse_taxonomy = function(taxonomy_string) {
  # Initialize result with NAs
  result = data.frame(
    Accession = NA, Kingdom = NA, Phylum = NA, Class = NA, 
    Order = NA, Family = NA, Genus = NA, Species = NA,
    stringsAsFactors = FALSE
  )
  
  # Handle NA/empty cases
  if (is.na(taxonomy_string) || 
      taxonomy_string == "" || 
      taxonomy_string == "*") {
    return(result)
  }
  
  # Split by semicolon
  ranks = strsplit(taxonomy_string, ";")[[1]]
  
  # Extract accession number (first field)
  if (length(ranks) > 0) {
    result$Accession = ranks[1]
  }
  
  # Parse each rank (format: prefix__taxon_name)
  for (rank in ranks) {
    if (grepl("^k__", rank)) {
      # Kingdom
      result$Kingdom = sub("^k__", "", rank)
    } else if (grepl("^p__", rank)) {
      # Phylum
      result$Phylum = sub("^p__", "", rank)
    } else if (grepl("^c__", rank)) {
      # Class
      result$Class = sub("^c__", "", rank)
    } else if (grepl("^o__", rank)) {
      # Order
      result$Order = sub("^o__", "", rank)
    } else if (grepl("^f__", rank)) {
      # Family
      result$Family = sub("^f__", "", rank)
    } else if (grepl("^g__", rank)) {
      # Genus
      result$Genus = sub("^g__", "", rank)
    } else if (grepl("^s__", rank)) {
      # Species: combine with genus as Genus_species
      species_epithet = sub("^s__", "", rank)
      if (!is.na(result$Genus) && result$Genus != "") {
        result$Species = paste0(result$Genus, "_", species_epithet)
      } else {
        result$Species = species_epithet
      }
    }
  }
  
  return(result)
}


# Function to get consensus taxonomy from multiple hits
get_consensus_taxonomy = function(hit_taxonomies, hit_scores, hit_evalues) {
  # hit_taxonomies: list of parsed taxonomy data.frames
  # hit_scores: vector of sim_score values for each hit
  # hit_evalues: vector of e-value values for each hit
  
  if (length(hit_taxonomies) == 0) {
    return(data.frame(
      Accession = NA, Kingdom = NA, Phylum = NA, Class = NA, 
      Order = NA, Family = NA, Genus = NA, Species = NA,
      stringsAsFactors = FALSE
    ))
  }
  
  # Check e-values: if all hits have e-value > e-20, mark as "unknown"
  if (all(hit_evalues > evalue_unknown, na.rm = TRUE)) {
    return(data.frame(
      Accession = NA, Kingdom = "unknown", Phylum = NA, Class = NA, 
      Order = NA, Family = NA, Genus = NA, Species = NA,
      stringsAsFactors = FALSE
    ))
  }
  
  # Filter hits: only use hits with e-value < e-20 for consensus
  valid_hits = hit_evalues < evalue_unknown
  if (sum(valid_hits, na.rm = TRUE) == 0) {
    return(data.frame(
      Accession = NA, Kingdom = "unknown", Phylum = NA, Class = NA, 
      Order = NA, Family = NA, Genus = NA, Species = NA,
      stringsAsFactors = FALSE
    ))
  }
  
  # Separate hits by e-value reliability
  # e-value < e-50: reliable for kingdom/phylum assignment
  reliable_hits = hit_evalues < evalue_reliable
  # e-value < e-20: can be used for lower ranks
  valid_hits_for_lower = hit_evalues < evalue_unknown
  
  # Use only valid hits
  hit_taxonomies_valid = hit_taxonomies[valid_hits_for_lower]
  hit_scores_valid = hit_scores[valid_hits_for_lower]
  hit_evalues_valid = hit_evalues[valid_hits_for_lower]
  reliable_mask = reliable_hits[valid_hits_for_lower]
  
  # Combine all taxonomies into one data frame
  all_tax = do.call(rbind, hit_taxonomies_valid)
  
  # Get consensus for each rank (most common, tie-break by best score)
  consensus = data.frame(
    Accession = NA, Kingdom = NA, Phylum = NA, Class = NA, 
    Order = NA, Family = NA, Genus = NA, Species = NA,
    stringsAsFactors = FALSE
  )
  
  ranks = c("Accession", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
  rank_thresholds = list(Class = 75, Order = 80, Family = 85, Genus = 90, Species = 97)
  
  # First, handle Kingdom and Phylum (using reliable hits only)
  for (rank in c("Kingdom", "Phylum")) {
    if (sum(reliable_mask, na.rm = TRUE) == 0) {
      consensus[[rank]] = NA
      if (rank == "Kingdom") {
        consensus$Kingdom = "unknown"
        return(consensus)
      }
      next
    }
    
    rank_values = all_tax[[rank]][reliable_mask]
    rank_scores = hit_scores_valid[reliable_mask]
    rank_values = rank_values[!is.na(rank_values) & rank_values != ""]
    
    if (length(rank_values) == 0) {
      consensus[[rank]] = NA
      if (rank == "Kingdom") {
        consensus$Kingdom = "unknown"
        return(consensus)
      }
      next
    }
    
    # Get consensus (most common, tie-break by best score)
    freq_table = table(rank_values)
    most_common = names(freq_table)[which.max(freq_table)]
    matching_indices = which(all_tax[[rank]] == most_common & reliable_mask)
    best_idx = matching_indices[which.max(hit_scores_valid[matching_indices])]
    
    consensus[[rank]] = most_common
    if (rank == "Kingdom" && hit_evalues_valid[best_idx] >= evalue_reliable) {
      consensus$Kingdom = "unknown"
      return(consensus)
    }
  }
  
  # Process ranks from Species down to Class
  # When there's disagreement, check if all hits agree at a higher rank
  taxonomic_ranks = c("Species", "Genus", "Family", "Order", "Class")
  
  # First, filter out hits that are >5 points below the best hit
  if (length(hit_scores_valid) > 0) {
    best_hit_score = max(hit_scores_valid, na.rm = TRUE)
    good_hits_mask = !is.na(hit_scores_valid) & hit_scores_valid >= (best_hit_score - 5)
    
    # Update valid hits to only include "good" hits
    hit_taxonomies_valid = hit_taxonomies_valid[good_hits_mask]
    hit_scores_valid = hit_scores_valid[good_hits_mask]
    hit_evalues_valid = hit_evalues_valid[good_hits_mask]
    reliable_mask = reliable_mask[good_hits_mask]
    
    # Rebuild all_tax with filtered hits
    all_tax = do.call(rbind, hit_taxonomies_valid)
  }
  
  # Process each rank and assign all that can be assigned
  for (rank in taxonomic_ranks) {
    threshold = rank_thresholds[[rank]]
    
    # Filter hits by sim_score threshold for this rank
    # Only use hits that meet the threshold for this rank
    score_threshold_mask = !is.na(hit_scores_valid) & hit_scores_valid >= threshold
    
    if (sum(score_threshold_mask, na.rm = TRUE) == 0) {
      # No hits meet the threshold for this rank
      consensus[[rank]] = NA
      # If this rank can't be assigned, all more specific ranks should also be NA
      current_idx = which(taxonomic_ranks == rank)
      if (current_idx > 1) {
        for (lower_idx in 1:(current_idx - 1)) {
          consensus[[taxonomic_ranks[lower_idx]]] = NA
        }
      }
      next
    }
    
    # Use only hits that meet the sim_score threshold
    rank_values = all_tax[[rank]][score_threshold_mask]
    rank_scores = hit_scores_valid[score_threshold_mask]
    
    # Filter out NA and empty values
    valid_idx = !is.na(rank_values) & rank_values != ""
    if (sum(valid_idx) == 0) {
      consensus[[rank]] = NA
      # If this rank can't be assigned, all more specific ranks should also be NA
      current_idx = which(taxonomic_ranks == rank)
      if (current_idx > 1) {
        for (lower_idx in 1:(current_idx - 1)) {
          consensus[[taxonomic_ranks[lower_idx]]] = NA
        }
      }
      next
    }
    
    rank_values_valid = rank_values[valid_idx]
    rank_scores_valid_subset = rank_scores[valid_idx]
    
    # Count frequencies and get best scores for each unique value
    unique_values = unique(rank_values_valid)
    value_stats = data.frame(
      value = unique_values,
      frequency = sapply(unique_values, function(v) sum(rank_values_valid == v)),
      best_score = sapply(unique_values, function(v) {
        idx = which(rank_values_valid == v)
        if (length(idx) > 0) max(rank_scores_valid_subset[idx], na.rm = TRUE) else NA
      }),
      stringsAsFactors = FALSE
    )
    
    # Sort by frequency (descending), then by best_score (descending)
    value_stats = value_stats[order(-value_stats$frequency, -value_stats$best_score), ]
    
    top_value = value_stats$value[1]
    top_freq = value_stats$frequency[1]
    top_score = value_stats$best_score[1]
    
    # Check if there's disagreement and minority has better score
    should_assign = TRUE
    if (nrow(value_stats) > 1) {
      second_best_score = value_stats$best_score[2]
      # If minority has a significantly better score, don't assign this rank
      if (!is.na(second_best_score) && !is.na(top_score) && 
          second_best_score > top_score + 2) {
        should_assign = FALSE
      }
    }
    
    if (should_assign) {
      # Find which hit has the top value with best score (from filtered hits)
      matching_indices_filtered = which(rank_values_valid == top_value)
      if (length(matching_indices_filtered) > 0) {
        # Get the best score from the filtered subset
        best_score_idx = matching_indices_filtered[which.max(rank_scores_valid_subset[matching_indices_filtered])]
        consensus_score = rank_scores_valid_subset[best_score_idx]
        
        # Threshold already applied in filtering, so consensus_score should meet it
        if (!is.na(consensus_score) && consensus_score >= threshold) {
          consensus[[rank]] = top_value
        } else {
          # Below threshold - don't assign this rank
          consensus[[rank]] = NA
          # If this rank can't be assigned, all more specific ranks should also be NA
          current_idx = which(taxonomic_ranks == rank)
          if (current_idx > 1) {
            for (lower_idx in 1:(current_idx - 1)) {
              consensus[[taxonomic_ranks[lower_idx]]] = NA
            }
          }
        }
      } else {
        consensus[[rank]] = NA
        # If this rank can't be assigned, all more specific ranks should also be NA
        current_idx = which(taxonomic_ranks == rank)
        if (current_idx > 1) {
          for (lower_idx in 1:(current_idx - 1)) {
            consensus[[taxonomic_ranks[lower_idx]]] = NA
          }
        }
      }
    } else {
      # Don't assign this rank due to disagreement
      consensus[[rank]] = NA
      # If this rank can't be assigned, all more specific ranks should also be NA
      current_idx = which(taxonomic_ranks == rank)
      if (current_idx > 1) {
        for (lower_idx in 1:(current_idx - 1)) {
          consensus[[taxonomic_ranks[lower_idx]]] = NA
        }
      }
    }
  }
  
  # If no rank was assigned at species/genus level, check if all hits agree at higher ranks
  if (is.na(consensus$Species) && is.na(consensus$Genus)) {
    for (rank in c("Family", "Order", "Class")) {
      if (!is.na(consensus[[rank]])) {
        # Already assigned, skip
        next
      }
      
      threshold = rank_thresholds[[rank]]
      
      # Filter hits by sim_score threshold for this rank
      score_threshold_mask = !is.na(hit_scores_valid) & hit_scores_valid >= threshold
      
      if (sum(score_threshold_mask, na.rm = TRUE) == 0) {
        next
      }
      
      # Use only hits that meet the sim_score threshold
      rank_values = all_tax[[rank]][score_threshold_mask]
      rank_values = rank_values[!is.na(rank_values) & rank_values != ""]
      
      if (length(unique(rank_values)) == 1 && length(rank_values) > 0) {
        # All hits (that meet threshold) agree at this rank - assign it
        matching_indices = which(all_tax[[rank]] == unique(rank_values)[1] & score_threshold_mask)
        if (length(matching_indices) > 0) {
          best_idx = matching_indices[which.max(hit_scores_valid[matching_indices])]
          consensus_score = hit_scores_valid[best_idx]
          
          if (!is.na(consensus_score) && consensus_score >= threshold) {
            consensus[[rank]] = unique(rank_values)[1]
          }
        }
      }
    }
  }
  
  return(consensus)
}

# Extract hits for each OTU from blast_10_hits
# Each row contains all 10 hits for one OTU
consensus_list = list()

# Process each row (each row is one OTU with up to 10 hits)
for (row_idx in 1:nrow(blast_10_hits)) {
  otu_row = blast_10_hits[row_idx, ]
  otu_id = otu_row$qseqid
  
  # Extract taxonomy strings, scores, and e-values for all hits (1st through 10th)
  hit_taxonomies = list()
  hit_scores = numeric()
  hit_evalues = numeric()
  
  # Column names: X1st_hit, X2nd_hit, ..., X10th_hit
  # Score columns: sim_score, sim_score.1, sim_score.2, ..., sim_score.9
  # E-value columns: evalue, evalue.1, evalue.2, ..., evalue.9
  hit_cols = c("X1st_hit", "X2nd_hit", "X3rd_hit", "X4th_hit", "X5th_hit",
               "X6th_hit", "X7th_hit", "X8th_hit", "X9th_hit", "X10th_hit")
  
  for (i in 1:10) {
    hit_col = hit_cols[i]
    
    # Check if this hit column exists and has data
    if (hit_col %in% colnames(otu_row)) {
      tax_string = otu_row[[hit_col]]
      
      # Skip if empty/NA
      if (!is.na(tax_string) && tax_string != "" && tax_string != "*") {
        # Parse taxonomy
        parsed = parse_taxonomy(tax_string)
        hit_taxonomies[[length(hit_taxonomies) + 1]] = parsed
        
        # Get sim_score for this hit
        if (i == 1) {
          score_col = "sim_score"
          evalue_col = "evalue"
        } else {
          score_col = paste0("sim_score.", i - 1)
          evalue_col = paste0("evalue.", i - 1)
        }
        
        # Try to get score, default to 100 if not found
        if (score_col %in% colnames(otu_row)) {
          hit_scores[length(hit_scores) + 1] = as.numeric(otu_row[[score_col]])
        } else {
          hit_scores[length(hit_scores) + 1] = 100
        }
        
        # Get e-value for this hit
        if (evalue_col %in% colnames(otu_row)) {
          hit_evalues[length(hit_evalues) + 1] = as.numeric(otu_row[[evalue_col]])
        } else {
          hit_evalues[length(hit_evalues) + 1] = 1e-100  # Very good default
        }
      }
    }
  }
  
  # Get consensus for this OTU
  consensus = get_consensus_taxonomy(hit_taxonomies, hit_scores, hit_evalues)
  consensus_list[[otu_id]] = consensus
}

# Convert to data frame
consensus_taxonomy = do.call(rbind, consensus_list)
rownames(consensus_taxonomy) = names(consensus_list)

# Combine with OTU identifiers and other info
blast_consensus = cbind(
  qseqid = names(consensus_list),
  consensus_taxonomy
)

# Remove Accession column 
if ("Accession" %in% colnames(blast_consensus)) {
  blast_consensus = blast_consensus[, !colnames(blast_consensus) %in% "Accession", drop = FALSE]
}

# Filter to target if specified
if (length(target) > 0 && !all(is.na(target)) && target[1] != "") {
  if (tax_level %in% colnames(blast_consensus)) {
    n_before = nrow(blast_consensus)
    blast_consensus = blast_consensus[blast_consensus[[tax_level]] %in% target, , drop = FALSE]
    n_after = nrow(blast_consensus)
    n_excluded = n_before - n_after
    cat("\n Consensus filtered to", tax_level, "level:", 
        paste(target, collapse = ", "), "\n")
    cat(" Excluded", n_excluded, "OTUs not matching target.\n")
    cat(" Remaining OTUs:", n_after, "\n")
  }
}

# Write consensus output
write.table(blast_consensus, file = "BLAST_consensus_parsed.txt", 
            sep = "\t", quote = FALSE, row.names = FALSE)