#######################################################################
# fix_gaf.R
#
# This script fixes issues in Gene Ontology Annotation (GAF) files
# where one database object ID has multiple gene symbols or names,
# which causes the readGAF() function to fail.
#
# Usage:
#   source("fix_gaf.R")
#   fix_gaf("input.gaf", "output_fixed.gaf")
#
# Author: Yichao Hua
# Date: 2025-04-28
#######################################################################

# Split GAF data lines on tabs and pull out the columns fix_gaf() needs.
#
# Only lines with at least 10 tab-separated fields are described in the
# result, which is a list of aligned elements:
#   line_num - index of each usable line within `data_lines`
#   fields   - list of split field vectors, one per usable line
#   db_id    - column 2 of each usable line
#   symbol   - column 3 of each usable line
#   name     - column 10 of each usable line
.gaf_parse_columns <- function(data_lines) {
  fields <- strsplit(data_lines, "\t", fixed = TRUE)
  keep <- which(lengths(fields) >= 10L)
  fields <- fields[keep]
  pick <- function(j) vapply(fields, `[`, character(1), j)
  list(
    line_num = keep,
    fields = fields,
    db_id = pick(2L),
    symbol = pick(3L),
    name = pick(10L)
  )
}

# Count the distinct IDs in `id` that occur with more than one distinct
# `value`. An ID conflicts exactly when some row's value differs from the
# value on the first row carrying that ID.
.gaf_count_conflicting_ids <- function(id, value) {
  first_value <- value[match(id, id)]
  length(unique(id[value != first_value]))
}

# Function to fix GAF files with multiple symbols/names for the same ID.
#
# For every DB object ID (column 2), the symbol (column 3) and name
# (column 10) found on the first data line carrying that ID are applied to
# all other lines with the same ID. Lines starting with "!" are treated as
# header lines and are written first; data lines with fewer than 10 fields
# are written back unchanged. Only base R is used; no package is installed
# or attached.
#
# Arguments:
#   input_file  - path to the GAF file to read.
#   output_file - path to write. If NULL, a trailing ".gaf" in `input_file`
#                 is replaced by "_fixed.gaf"; if `input_file` has no such
#                 suffix, "_fixed" is appended instead.
#
# Returns (invisibly) the output file path.
#
# Errors if the input file does not exist, if no data line has at least
# 10 fields, or if the file cannot be copied when no fix is needed.
# Warns if conflicts remain after the rewrite.
fix_gaf <- function(input_file, output_file = NULL) {
  # If output file not specified, create default name
  if (is.null(output_file)) {
    output_file <- gsub("\\.gaf$", "_fixed.gaf", input_file)
    if (output_file == input_file) {
      output_file <- paste0(input_file, "_fixed")
    }
  }

  # Check if input file exists
  if (!file.exists(input_file)) {
    stop("Error: Input file ", input_file, " does not exist.")
  }

  message("Fixing GAF file: ", input_file)
  message("Output will be saved to: ", output_file)

  # Read the GAF file
  message("Reading GAF file...")
  lines <- readLines(input_file)

  # Separate header lines and data lines
  is_header <- startsWith(lines, "!")
  header_lines <- lines[is_header]
  data_lines <- lines[!is_header]

  message("Parsing data lines...")
  parsed <- .gaf_parse_columns(data_lines)

  # If no data was parsed, return error
  if (length(parsed$line_num) == 0) {
    stop("No data lines could be parsed. Check file format.")
  }

  # Check for IDs that appear with different symbols or names
  n_dup_symbols <- .gaf_count_conflicting_ids(parsed$db_id, parsed$symbol)
  n_dup_names <- .gaf_count_conflicting_ids(parsed$db_id, parsed$name)

  message("Found ", n_dup_symbols, " DB IDs with multiple symbols")
  message("Found ", n_dup_names, " DB IDs with multiple names")

  # If no duplicates, just copy the file
  if (n_dup_symbols == 0 && n_dup_names == 0) {
    message("No duplicate DB ID-Symbol or DB ID-Name pairs found. File does not need fixing.")
    # file.copy(overwrite = TRUE) recreates the destination before copying,
    # so copying a file onto itself would empty it; skip the copy then.
    same_file <- file.exists(output_file) &&
      normalizePath(input_file) == normalizePath(output_file)
    # Overwrite an existing output (as the writeLines() path below does)
    # and fail loudly instead of reporting a copy that did not happen.
    if (!same_file &&
        !file.copy(input_file, output_file, overwrite = TRUE)) {
      stop("Failed to copy ", input_file, " to ", output_file, call. = FALSE)
    }
    message("Copied original file to ", output_file)
    return(invisible(output_file))
  }

  message("Processing ", length(unique(parsed$db_id)), " unique DB object IDs")

  # For each ID, the standard symbol and name are those of its first line
  message("Creating standardization mappings...")
  first_seen <- match(parsed$db_id, parsed$db_id)
  std_symbol <- parsed$symbol[first_seen]
  std_name <- parsed$name[first_seen]

  # Apply standardization to the data lines that actually differ; every
  # other line is written back byte-for-byte.
  message("Applying standardization to data lines...")
  needs_fix <- which(parsed$symbol != std_symbol | parsed$name != std_name)
  target <- parsed$line_num[needs_fix]
  rebuilt <- vapply(needs_fix, function(k) {
    f <- parsed$fields[[k]]
    f[3L] <- std_symbol[k]
    f[10L] <- std_name[k]
    paste(f, collapse = "\t")
  }, character(1))
  # strsplit() drops the empty field after a final tab, so re-joining would
  # lose that tab (and with it the last, empty, column); put it back.
  lost_tab <- endsWith(data_lines[target], "\t")
  rebuilt[lost_tab] <- paste0(rebuilt[lost_tab], "\t")

  fixed_lines <- data_lines
  fixed_lines[target] <- rebuilt

  # Write the fixed file
  message("Writing fixed file...")
  writeLines(c(header_lines, fixed_lines), output_file)
  message("Fixed file created at: ", output_file)

  # Verify the fix by re-parsing the fixed lines held in memory
  message("Verifying fix...")
  check <- .gaf_parse_columns(fixed_lines)
  if (length(check$line_num) == 0) {
    warning("Could not verify fix: Unable to parse fixed data lines.")
    return(invisible(output_file))
  }

  n_dup_symbols_after <- .gaf_count_conflicting_ids(check$db_id, check$symbol)
  n_dup_names_after <- .gaf_count_conflicting_ids(check$db_id, check$name)

  if (n_dup_symbols_after == 0 && n_dup_names_after == 0) {
    message("Verification successful! Fixed file has no duplicate DB ID-Symbol or DB ID-Name pairs.")
    message("The fixed file can now be used with readGAF() function.")
  } else {
    warning("Fixed file still has ", n_dup_symbols_after,
            " DB IDs with multiple symbols and ", n_dup_names_after,
            " DB IDs with multiple names.")
    message("Further investigation may be needed.")
  }

  invisible(output_file)
}