#' @title Format MS2 spectra for export in GNPS-MGF format #' #' @description #' #' Re-format MS2 spectrum information for export of the data in Mgf format #' supported by GNPS. In detail, the function replaces the acquisition number #' of each spectrum with the feature ID (expected to be present in the #' `"feature_id"` column of `mcols(x)`) converted to an integer by removing #' the ID's leading `"FT"`. #' #' @param x `Spectra`. #' #' @return `Spectra` with the acquisition number replaced. #' #' @author Johannes Rainer, Philippine Louail #' #' @md formatSpectraForGNPS <- function(x) { if (inherits(x, "Spectra")) { svs <- spectraVariables(x) if (!"feature_id" %in% svs) { stop("No 'feature_id' in spectra variables") } fid <- x$feature_id if ("chrom_peak_id" %in% svs) pid <- x$chrom_peak_id else pid <- NULL x <- selectSpectraVariables( x, c( "acquisitionNum", "rtime", "precursorMz", "precursorIntensity", "precursorCharge", "dataStorage", "dataOrigin", "scanIndex", "mz", "intensity" ) ) x$FEATURE_ID <- fid if (length(pid)) x$PEAK_ID <- pid x$acquisitionNum <- as.integer(sub("^FT", "", fid)) x } else { fids <- mcols(x)$feature_id if (!length(fids)) stop("No column named 'feature_id' present in 'mcols(x)'") fids <- as.integer(sub("^FT", "", fids)) mendoapply(x, fids, FUN = function(z, id) { z@acquisitionNum <- id z }) } } #' Select the peaks matrix with the highest intensity. #' Function to select the peaks matrix with the highest TIC (sum of #' intensities) #' #' @param x `list` of `numeric` peaks matrices (with columns `"mz"` and #' `"intensity"`) #' #' @param ... ignored #' #' @return single `numeric` matrix. maxTicPeaksData <- function(x, ...) { tics <- vapply(x, function(z) sum(z[, "intensity"]), NA_real_) x[[which.max(tics)]] } #' @title Plot multiple spectra into the same plot #' #' @description #' #' Plot multiple spectra into the same plot. #' #' @param x `Spectra` that should be plotted. #' #' @param col color to be used for the individual peaks. #' #' @param type `character(1)` defining the plot type. Defaults to `"h"` to plot #' vertical lines. For more details see documentation of `plot`. #' #' @param main `character(1)` defining the title. #' #' @param ... additional arguments to be passed to `points`. #' #' @author Johannes Rainer #' #' @md plotSpectra <- function(x, col = "#00000040", type = "h", main, ...) { plot(3, 3, pch = NA, xlab = "m/z", ylab = "intensity", xlim = range(mz(x)), ylim = range(intensity(x)), main = main) tmp <- lapply(x, function(z) points(mz(z), intensity(z), type = type, col = col, ...)) } #' @title Select spectrum with maximal intensity from a list of spectra #' #' @description #' #' `maxTic` can be used with the `combineSpectra` method to select the spectrum #' with the largest overall signal from a list of spectra. #' #' @param z `Spectra` object. #' #' @return `Spectrum` #' #' @author Johannes Rainer #' #' @md maxTic <- function(z) { z[[which.max(lapply(intensity(z), sum))]] } #' @title Convert CAMERA output to an edge list for GNPS #' #' @description #' #' `getEdgelist` takes the output from the `getPeaklist` function from `CAMERA` #' and converts it to a `data.frame` with edge definitions for *GNPS*. Each #' row in that `data.frame` contains in columns `"ID1"` and `"ID2"` the #' identifiers (i.e. `rownames` of the input `data.frame`) of the features. #' Column `"EdgeType"` is always `"MS1 annotation"` and column `"Score"` `NA` #' (since no score can be exported from `CAMERA`). Columns `"Annotation"` #' contains the adduct names and their difference in m/z if **both** edges #' (features) were predicted to be an adduct of the **same** compound. If #' isotope annotations are available, these are also added to the column. #' Column `"CorrelationGroup"` provides the information which #' features were grouped by `CAMERA` into the same group. #' #' @param peaklist `data.frame` as returned by the [getPeaklist()] function #' from `CAMERA` package or an `xsAnnotate` object. #' #' @return `data.frame` with edge definitions (see description for more #' details). #' #' @author Mar Garcia-Aloy #' #' @examples #' #' res <- getEdgelist(getPeaklist(xsaFA)) getEdgelist <- function(peaklist) { if (is(peaklist, "xsAnnotate")) { peaklist <- getPeaklist(peaklist) if (!nrow(peaklist)) stop("Got an empty peak list.") } pl <- split(peaklist, factor(peaklist$pcgroup, levels = unique(peaklist$pcgroup))) res <- do.call(rbind, lapply(pl, .process_pcgroup)) rownames(res) <- NULL res } #' @title Extract feature annotations from CAMERA results #' #' @description #' #' Similar to the `getEdgelist` function, this function extracts information #' from a `CAMERA` result for use in GNPS. #' #' @param x `xsAnnotate` object after calling `findAdducts`. #' #' @return #' #' `data.frame` with columns: #' - `"annotation network number"`: ion identity network (IIN) number. All #' features predicted by `CAMERA` to be an adduct of a (co-eluting) compound #' with the same mass are part of this IIN. If a feature was predicted to be #' an adduct of two different compounds (with different masses) the ID of the #' larger network is reported. All features for which no adduct annotation is #' available will have an `NA` in this column. #' - `"best ion"`: the adduct definition of the feature. #' - `"correlation group ID"`: this corresponds to the `"pcgroup"` column in #' `getPeaklist(x)`. #' - `"auto MS2 verify"`: always `NA`. #' - `"identified by n="`: the size of the IIN. #' - `"partners"`: all other features (rows in the feature table) of this IIN. #' - `"neutral M mass"`: the mass of the compound. #' #' @author Johannes Rainer #' #' @noRd getFeatureAnnotations <- function(x) { if (!length(x@annoID)) stop("No adduct information present. Please call 'findAdducts' on ", "the object.") corr_group <- rep(seq_along(x@pspectra), lengths(x@pspectra)) corr_group <- corr_group[order(unlist(x@pspectra, use.names = FALSE))] ## Get the all ids (feature rows) for which an adduct was defined ids <- unique(x@annoID[, "id"]) ## Note: @pspectra contains the "correlation groups", @annoID the ## adduct groups, but it can happen that two ids are in the same adduct ## group without being in the same correlation group! ## loop through the adduct definitions and build the output data.frame adduct_def <- lapply(ids, function(id) { ## IDs of the same correlation group ids_pcgroup <- x@pspectra[[corr_group[id]]] ## ID of the adduct annotation groups this id is part of anno_grp <- x@annoID[x@annoID[, "id"] == id, "grpID"] ## Subset the adduct annotation to rows matching the annotation group ## of the present ID and to ids present in the same correlation group. adduct_ann <- x@annoID[x@annoID[, "id"] %in% ids_pcgroup & x@annoID[, "grpID"] %in% anno_grp, , drop = FALSE] ## if we have more than one annotation group, select the bigger one if (length(anno_grp) > 1) { cnts <- table(adduct_ann[, "grpID"]) adduct_ann <- adduct_ann[ adduct_ann[, "grpID"] == names(cnts)[order(cnts, decreasing = TRUE)][1], , drop = FALSE] } grp_id <- adduct_ann[1, "grpID"] ## different adduct rules can match the same m/z - we're just taking ## the first one (with lower ruleID) rule_id <- adduct_ann[adduct_ann[, "id"] == id, "ruleID"][1] ids_grp <- adduct_ann[, "id"] df <- data.frame( `row ID` = id, `annotation network number` = unname(grp_id), `best ion` = as.character(x@ruleset[rule_id, "name"]), `identified by n=` = nrow(adduct_ann), `partners` = paste0(ids_grp[ids_grp != id], collapse = ";"), `neutral M mass` = unname( x@annoGrp[x@annoGrp[, "id"] == grp_id, "mass"]), stringsAsFactors = FALSE, check.names = FALSE) }) adduct_def <- do.call(rbind, adduct_def) res <- adduct_def[rep("other", length(corr_group)), ] res[, "row ID"] <- seq_along(corr_group) rownames(res) <- as.character(res[, "row ID"]) res[ids, ] <- adduct_def res$`correlation group ID` <- corr_group res$`auto MS2 verify` <- NA res } #' Helper function to extract the adduct annotation from a pair of adducts #' from the same *pcgroup*. #' #' @author Mar Garcia-Aloy #' #' @noRd .define_annot <- function(y) { if (any(y$adduct == "")) return(NA) mass_1 <- .extract_mass_adduct(y$adduct[1]) mass_2 <- .extract_mass_adduct(y$adduct[2]) mass <- intersect(mass_1, mass_2) if (length(mass)) { def1 <- unlist(strsplit(y$adduct[1], " ")) def2 <- unlist(strsplit(y$adduct[2], " ")) paste0(def1[grep(mass[1], def1) - 1], " ", def2[grep(mass[1], def2) - 1], " dm/z=", round(abs(y$mz[1] - y$mz[2]), 4)) } else NA } #' Helper function to extract the isotope annotation from a pair of adducts #' from the same *pcgroup*. #' #' @author Mar Garcia-Aloy #' #' @noRd .define_isotop <- function(w) { if (any(w$isotopes == "")) return(NA) if (unlist(strsplit(w$isotopes[1], '\\]\\[') )[1] == unlist(strsplit(w$isotopes[2], '\\]\\[') )[1]) { a = paste0("[", do.call(rbind, strsplit(w$isotopes, "\\]\\["))[, 2], collapse = " ") b = paste0(unlist(strsplit(w$isotopes[2], '\\]') )[1], "]") paste0(b, a, " dm/z=", round(abs(w$mz[1] - w$mz[2]), 4)) } else NA } #' Simple helper to extract the mass(es) from strings such as #' [M+NH4]+ 70.9681 [M+H]+ 87.9886 #' #' @author Johannes Rainer #' #' @noRd #' #' @examples #' #' .extract_mass_adduct("[M+NH4]+ 70.9681 [M+H]+ 87.9886") #' .extract_mass_adduct("some 4") .extract_mass_adduct <- function(x) { if (!length(x) || x == "") return(NA) spl <- unlist(strsplit(x, " ", fixed = TRUE)) spl[seq(2, by = 2, length.out = length(spl) / 2)] } #' Helper function to process features from the same *pcgroup* #' #' @author Mar Garcia-Aloy #' #' @noRd .process_pcgroup <- function(x) { if (nrow(x) > 1) { res <- combn(seq_len(nrow(x)), 2, FUN = function(z) { anno <- .define_annot(x[z, ]) iso <- .define_isotop(x[z, ]) if (is.na(anno[1])) anno <- character() if (is.na(iso[1])) iso <- character() data.frame(ID1 = rownames(x)[z[1]], ID2 = rownames(x)[z[2]], EdgeType = if (length(anno) || length(iso)) "MS1 annotation" else "MS1 correlation", Score = 0.0, Annotation = paste0(anno, iso, collapse = " "), CorrelationGroup = x$pcgroup[1], stringsAsFactors = FALSE) }, simplify = FALSE) do.call(rbind, res) } else NULL }