normalize = function(file_location = file.choose(new = FALSE), methods = c("SERRF", "nomis"), scatter_plot = T, detectcores_ratio = 0.1){
  # file_location = file.choose(new = FALSE); methods = "SERRF"; scatter_plot = F

  if(identical(methods, "all")){
    methods = c("mTIC", "sum", "median", "PQN", "contrast", "quantile", "linear", 'liwong', 'cubic', 'batch_ratio', 'batch_loess', 'SERRF', 'svm', 'nomis', 'bmis')
  }

  if(!"pacman" %in% rownames(installed.packages())){
    cat("Installing packages...\n")
    install.packages('pacman')
  }
  # data.table is included because fwrite()/data.table() are used when writing results.
  pacman::p_load(affy, parallel, ranger, caret, pcaMethods, ggplot2, tidyr, graphics, grDevices, Hmisc, gtools, cowplot, RColorBrewer, readr, plotly, stringr, GGally, dplyr, e1071, officer, bootstrap, pdist, metabolomics, data.table)

  # Drop values flagged as outliers by boxplot.stats(); return the kept values and the outlier indices.
  remove_outlier = function(v){
    out = boxplot.stats(v)$out
    return(list(value = v[!v %in% out], index = which(v %in% out)))
  }

  # Select a LOESS span by cross-validation. Each candidate span is scored by the
  # coefficient of variation of the ratio-corrected values, which is more robust
  # than squared error.
  loess_wrapper_extrapolate <- function(x, y, span.vals = seq(0.25, 1, by = 0.05), folds = 5){
    mean.abs.error <- numeric(length(span.vals))
    loess.model <- function(x, y, span){
      loess(y ~ x, span = span, control = loess.control(surface = "interpolate", statistics = 'exact'), family = "gaussian")
    }
    loess.predict <- function(fit, newdata){
      predict(fit, newdata = newdata)
    }
    span.index <- 0
    for(each.span in span.vals){
      span.index <- span.index + 1
      mean.abs.error[span.index] = tryCatch({
        y.hat.cv <- bootstrap::crossval(x, y, theta.fit = loess.model, theta.predict = loess.predict, span = each.span, ngroup = folds)$cv.fit
        non.empty.indices <- !is.na(y.hat.cv)
        diff = (y[non.empty.indices] / y.hat.cv[non.empty.indices]) * mean(y[non.empty.indices])
        sd(diff) / mean(diff)
      }, error = function(er){
        NA
      })
    }
    best.span <- span.vals[which.min(mean.abs.error)]
    if(length(best.span) == 0){
      best.span = 0.75
    }
    best.model <- loess(y ~ x, span = best.span, control = loess.control(surface = "interpolate", statistics = 'exact'), family = "gaussian")
    return(list(best.model, min(mean.abs.error, na.rm = TRUE), best.span))
  }

  # Shift normalized values so each compound keeps its original minimum.
  shiftData = function(ori, norm){
    ori.min = apply(ori, 1, min, na.rm = T)
    norm.min = apply(norm, 1, min, na.rm = T)
    return(norm - c(norm.min - ori.min))
  }

  # Per-compound relative standard deviation (sd/mean) after outlier removal.
  RSD = function(data){
    return(apply(data, 1, function(x){
      x = remove_outlier(x)[[1]]
      return(sd(x, na.rm = T) / mean(x, na.rm = T))
    }))
  }

  source("https://raw.githubusercontent.com/slfan2013/rcodes/master/read_data.R")
  data = read_data(file_location)
  e = data$e_matrix
  f = data$f
  p = data$p

  # Check that 'sampleType' is in the dataset.
  if(!"sampleType" %in% colnames(p)){
    stop("Your data must have 'sampleType'. Please see example data for more information.")
  }else{
    # Check that both 'qc' and 'sample' appear in the data.
    if(any(!c('qc', 'sample') %in% p$sampleType)){
      stop("The 'sampleType' must contain at least 'qc' and 'sample'. Please see example data for more information.")
    }
  }
  # Check that 'time' is in the datasheet.
  if(!"time" %in% colnames(p)){
    stop("Your data must have 'time'. Please see example data for more information.")
  }
  # Check that 'time' has no duplicated values.
  if(any(duplicated(p$time))){
    time_duplicated = duplicated(p$time)
    stop("Your dataset has ", sum(time_duplicated), " duplicated 'time' values. The 'time' of each sample should be unique. The duplicated 'time' values are ", paste0(p$time[time_duplicated], collapse = ', '), '.')
  }
  # Check that 'batch' is in the datasheet.
  if(!"batch" %in% colnames(p)){
    stop("Your data must have 'batch'. Please see example data for more information.")
  }
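  # A minimal sketch of the sample metadata the checks above expect (hypothetical
  # values; the real columns come from read_data()). 'sampleType' needs at least
  # 'qc' and 'sample' (any other label is treated as a validation sample type),
  # 'time' is a unique injection order, and 'batch' groups injections:
  #   p_example = data.frame(label      = c("QC01", "S01", "S02", "Val01"),
  #                          sampleType = c("qc", "sample", "sample", "validate"),
  #                          time       = c(1, 2, 3, 4),
  #                          batch      = c("A", "A", "B", "B"))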
  # Check for missing values.
  num_miss = sum(is.na(e))
  if(num_miss > 0){
    cat(paste0("Your dataset has ", num_miss, " missing values (i.e. empty cells). These missing values will be replaced by half of the minimum of the non-missing values for each compound.\n"))
    for(i in 1:nrow(e)){
      e[i, is.na(e[i,])] = 0.5 * min(e[i, !is.na(e[i,])])
    }
  }
  # Check for zero values.
  num_zero = sum(e == 0)
  if(num_zero > 0){
    cat(paste0("Your dataset has ", num_zero, " zeros. These zeros will be kept as zeros in the final normalized data. \n"))
  }
  cat(paste0("In your data, \n"))
  cat(paste0("Number of compounds: ", nrow(f), " \n"))
  cat(paste0("Number of samples: ", sum(p$sampleType == 'sample', na.rm = TRUE), " \n"))
  cat(paste0("Number of QCs: ", sum(p$sampleType == 'qc', na.rm = TRUE), " \n"))

  p$sample_index = paste0('p', 1:nrow(p))
  original_colnames_e = colnames(e)
  colnames(e) = p$sample_index

  # Samples with an empty sampleType are not used for normalization, but are put back in the final sheet.
  if(any(is.na(p$sampleType))){
    cat(paste0("There are ", sum(is.na(p$sampleType)), " empty cells in the 'sampleType'. They will not be used for normalization, but will be put back in the final sheet. \n"))
  }
  # !!! when there is no NA, this needs modification.
  p_empty_sampleType = p[is.na(p$sampleType), ]
  e_empty_sampleType = e[, is.na(p$sampleType)]
  if(class(e_empty_sampleType) == 'numeric'){
    e_empty_sampleType = matrix(e_empty_sampleType, ncol = 1)
  }
  colnames(e_empty_sampleType) = p_empty_sampleType$sample_index
  e = e[, !is.na(p$sampleType)]
  p = p[!is.na(p$sampleType), ]

  # Compounds that are entirely infinite are set aside and restored at the end.
  infinite_index = which(apply(e, 1, function(x){
    sum(is.infinite(x)) == length(x)
  }))
  if(!length(infinite_index) == 0){
    e_infinite = e[infinite_index,]
    f_infinite = f[infinite_index,]
    e = e[-infinite_index,]
    f = f[-infinite_index,]
  }

  qc_RSDs = list()
  normalized_dataset = list()
  calculation_times = list()
  with_validate = any(!p$sampleType %in% c('qc', 'sample'))

  # Split e and p by sample type.
  e_qc = e[, p$sampleType == 'qc']
  e_sample = e[, p$sampleType == 'sample']
  p_qc = p[p$sampleType == 'qc',]
  p_sample = p[p$sampleType == 'sample',]
  e_validates = list()
  p_validates = list()
  if(with_validate){
    val_RSDs = list()
    validate_types = unique(p$sampleType[!p$sampleType %in% c('qc', 'sample')])
    for(validate_type in validate_types){
      e_validates[[validate_type]] = e[, p$sampleType %in% validate_type]
      p_validates[[validate_type]] = p[p$sampleType %in% validate_type, ]
      val_RSDs[[validate_type]] = list()
    }
  }else{
    validate_types = NULL
  }

  # Reassemble QC, sample and validate columns into the original injection order.
  aggregate_e = function(e_qc, e_sample, e_validates){
    e = do.call('cbind', c(list(e_qc, e_sample), e_validates))
    e = e[, order(as.numeric(gsub("p", "", colnames(e))))]
    return(e)
  }

  # Baseline: no normalization.
  start = Sys.time()
  normalized_dataset[['none']] = aggregate_e(e_qc, e_sample, e_validates)
  qc_RSDs[['none']] = RSD(e_qc)
  calculation_times[['none']] = Sys.time() - start
  cat("\n")
  cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['none']], na.rm = TRUE), 4) * 100, "%.\n"))
  cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['none']] < 0.2, na.rm = TRUE), ".\n"))
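  # Throughout, quality is scored per compound as the relative standard deviation
  # (RSD = sd/mean) across QC injections, after boxplot-based outlier removal.
  # The summary printed for each method is the median RSD plus the count of
  # compounds under the 20% threshold; lower QC RSD after a method means more
  # technical variation was removed.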
  if(with_validate){
    # Report baseline RSDs for each type of validation sample.
    for(validate_type in validate_types){
      val_RSDs[[validate_type]][['none']] = RSD(e_validates[[validate_type]])
      cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['none']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['none']] < 0.2, na.rm = TRUE), ".\n"))
    }
  }

  # mTIC: scale each sample by its summed intensity over 'known' compounds only.
  if('mTIC' %in% methods){
    Sys.sleep(2)
    normalized_dataset[['mTIC']] = tryCatch({
      cat("\n")
      mTIC_skip = FALSE
      if(!'compoundType' %in% colnames(f)){
        cat(paste0("warning: 'compoundType' is not in the dataset. mTIC skipped.\n"))
        mTIC_skip = TRUE
      }else{
        mTIC_column = f[['compoundType']]
        if(!'known' %in% unique(f[['compoundType']])){
          cat(paste0("'known' (case-sensitive) is not found in the 'compoundType'. mTIC skipped.\n"))
          mTIC_skip = TRUE
        }
      }
      if(!mTIC_skip){
        start = Sys.time()
        index = mTIC_column %in% "known"
        sums = apply(e[index,], 2, sum, na.rm = T)
        mean_sums = mean(sums, na.rm = TRUE)
        e_norm = t(t(e) / (sums / mean_sums))
        qc_RSDs[['mTIC']] = RSD(e_norm[, p$sampleType == 'qc'])
        calculation_times[['mTIC']] = Sys.time() - start
        cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['mTIC']], na.rm = TRUE), 4) * 100, "%.\n"))
        cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['mTIC']] < 0.2, na.rm = TRUE), ".\n"))
        if(with_validate){
          for(validate_type in validate_types){
            val_RSDs[[validate_type]][['mTIC']] = RSD(e_norm[, p$sampleType %in% validate_type])
            cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['mTIC']], na.rm = TRUE), 4) * 100, "%.\n"))
            cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['mTIC']] < 0.2, na.rm = TRUE), ".\n"))
          }
        }
      }else{
        e_norm = NA
      }
      e_norm
    }, error = function(error){
      error
    })
  }

  # sum: scale each sample by its total intensity.
  if('sum' %in% methods){
    Sys.sleep(2)
    normalized_dataset[['sum']] = tryCatch({
      cat("\n")
      start = Sys.time()
      sums = colSums(e, na.rm = T)
      e_norm = t(t(e) / sums * mean(sums, na.rm = T))
      qc_RSDs[['sum']] = RSD(e_norm[, p$sampleType == 'qc'])
      calculation_times[['sum']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['sum']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['sum']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['sum']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['sum']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['sum']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }

  # median: scale each sample by its median intensity.
  if('median' %in% methods){
    Sys.sleep(2)
    normalized_dataset[['median']] = tryCatch({
      cat("\n")
      start = Sys.time()
      medians = apply(e, 2, median, na.rm = T)
      e_norm = t(t(e) / medians * median(medians, na.rm = T))
      qc_RSDs[['median']] = RSD(e_norm[, p$sampleType == 'qc'])
      calculation_times[['median']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['median']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['median']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['median']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['median']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['median']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }

  # PQN: probabilistic quotient normalization. Divide each sample by the median
  # quotient against the per-compound median reference profile.
  if('PQN' %in% methods){
    Sys.sleep(2)
    normalized_dataset[['PQN']] = tryCatch({
      cat("\n")
      start = Sys.time()
      reference <- apply(e, 1, median, na.rm = TRUE)
      reference[reference == 0] = 1
      quotient <- e / reference
      quotient.median <- apply(quotient, 2, median, na.rm = TRUE)
      e_norm <- t(t(e) / quotient.median)
      qc_RSDs[['PQN']] = RSD(e_norm[, p$sampleType == 'qc'])
      calculation_times[['PQN']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['PQN']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['PQN']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['PQN']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['PQN']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['PQN']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }

  # contrast: contrast normalization via affy::maffy.normalize, after flooring
  # non-positive values. Note that the flooring modifies e for all later methods too.
  if("contrast" %in% methods){
    Sys.sleep(2)
    normalized_dataset[['contrast']] = tryCatch({
      cat("\n")
      start = Sys.time()
      threshold = 1e-11
      e[e <= 0] <- threshold
      maffy.data <- maffy.normalize(data.matrix(e), subset = 1:nrow(e), span = 0.75, verbose = FALSE, family = "gaussian", log.it = FALSE)
      subtract <- function(x){
        t(t(x) - apply(x, 2, quantile, 0.1, na.rm = TRUE))
      }
      e_norm <- subtract(maffy.data)
      rownames(e_norm) = rownames(e)
      colnames(e_norm) = colnames(e)
      qc_RSDs[['contrast']] = RSD(e_norm[, p$sampleType == 'qc'])
      calculation_times[['contrast']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['contrast']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['contrast']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['contrast']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['contrast']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['contrast']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }

  # quantile: force all samples onto a common intensity distribution.
  if("quantile" %in% methods){
    Sys.sleep(2)
    normalized_dataset[['quantile']] = tryCatch({
      cat("\n")
      start = Sys.time()
      normalize.quantile <- get("normalize.quantiles", envir = asNamespace("affy"))
      e_norm <- normalize.quantile(data.matrix(e))
      rownames(e_norm) <- rownames(e)
      colnames(e_norm) <- colnames(e)
      qc_RSDs[['quantile']] = RSD(e_norm[, p$sampleType == 'qc'])
      calculation_times[['quantile']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['quantile']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['quantile']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['quantile']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['quantile']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['quantile']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }
  # linear: linear baseline scaling toward the mean of the per-compound medians.
  if('linear' %in% methods){
    Sys.sleep(2)
    normalized_dataset[['linear']] = tryCatch({
      cat("\n")
      start = Sys.time()
      linear.baseline <- apply(e, 1, median, na.rm = TRUE)
      baseline.mean <- mean(linear.baseline, na.rm = TRUE)
      sample.means <- apply(e, 2, mean, na.rm = TRUE)
      linear.scaling <- baseline.mean / sample.means
      e_norm <- t(t(e) * linear.scaling)
      qc_RSDs[['linear']] = RSD(e_norm[, p$sampleType == 'qc'])
      calculation_times[['linear']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['linear']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['linear']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['linear']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['linear']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['linear']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }

  # liwong: Li-Wong invariant-set normalization against the median-intensity sample.
  if('liwong' %in% methods){
    Sys.sleep(2)
    normalized_dataset[['liwong']] = tryCatch({
      cat("\n")
      start = Sys.time()
      average.intensity <- apply(e, 2, mean, na.rm = TRUE)
      median.number <- round(ncol(e) / 2 + 0.1)
      ordering <- order(average.intensity)
      median.sample.number <- ordering[median.number]
      median.sample <- e[, median.sample.number]
      e_norm <- vector()
      for(i in 1:ncol(e)){
        liwong.sample = tryCatch({
          liwong.model <- normalize.invariantset(data = e[,i], ref = median.sample, prd.td = c(0.003, 0.007))
          predict(liwong.model$n.curve$fit, e[,i])
        }, error = function(er){
          # Fall back to the raw values if the invariant-set fit fails.
          list(y = e[,i])
        })
        e_norm <- cbind(e_norm, liwong.sample$y)
      }
      colnames(e_norm) = colnames(e)
      qc_RSDs[['liwong']] = RSD(e_norm[, p$sampleType == 'qc'])
      calculation_times[['liwong']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['liwong']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['liwong']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['liwong']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['liwong']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['liwong']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }

  # cubic: cubic spline normalization (affy::normalize.qspline) against the mean profile.
  if("cubic" %in% methods){
    Sys.sleep(2)
    normalized_dataset[['cubic']] = tryCatch({
      cat("\n")
      start = Sys.time()
      e_norm <- normalize.qspline(e, samples = 0.02, target = apply(e, 1, mean, na.rm = TRUE), verbose = FALSE)
      rownames(e_norm) <- rownames(e)
      colnames(e_norm) <- colnames(e)
      qc_RSDs[['cubic']] = RSD(e_norm[, p$sampleType == 'qc'])
      calculation_times[['cubic']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['cubic']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['cubic']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['cubic']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['cubic']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['cubic']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }
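  # The remaining methods are batch- and/or QC-aware: batch_ratio rescales each
  # batch by its QC means, batch_loess and svm regress QC drift on injection
  # time, and SERRF models drift with random forests over correlated compounds.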
  # batch_ratio: rescale each batch so its QC mean matches the grand QC mean.
  if('batch_ratio' %in% methods){
    Sys.sleep(2)
    normalized_dataset[['batch_ratio']] = tryCatch({
      cat("\n")
      start = Sys.time()
      e_norm = matrix(NA, nrow = nrow(e), ncol = ncol(e))
      QC.index = p[["sampleType"]]
      batch = p[["batch"]]
      for(i in 1:nrow(f)){
        means = by(as.numeric(e[i, QC.index == 'qc']), batch[QC.index == 'qc'], mean, na.rm = T)
        mean_means = mean(means, na.rm = TRUE)
        for(b in 1:length(unique(batch))){
          e_norm[i, batch %in% unique(batch)[b]] = as.numeric(e[i, batch %in% unique(batch)[b]]) / (means[[unique(batch)[b]]] / mean_means)
        }
      }
      rownames(e_norm) = rownames(e)
      colnames(e_norm) = colnames(e)
      qc_RSDs[['batch_ratio']] = RSD(e_norm[, p$sampleType == 'qc'])
      calculation_times[['batch_ratio']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['batch_ratio']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['batch_ratio']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['batch_ratio']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['batch_ratio']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['batch_ratio']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }

  # batch_loess: fit a per-batch LOESS of QC intensity on injection time for each
  # compound, then divide by the fitted drift curve (scaled to the compound median).
  if('batch_loess' %in% methods){
    Sys.sleep(2)
    normalized_dataset[['batch_loess']] = tryCatch({
      cat("\n(This may take a while...)\n")
      start = Sys.time()
      e_norm = matrix(NA, nrow = nrow(e), ncol = ncol(e))
      QC.index = p[["sampleType"]]
      batch = p[["batch"]]
      time = as.numeric(p[["time"]])
      n_CV = 5
      seed = 1
      set.seed(seed)
      QC.index.train = QC.index.test = list()
      e_qc_only = e[, p$sampleType == 'qc']
      # Use a smaller training ratio when any batch has fewer than 7 QCs.
      if(any(table(p$batch[p$sampleType == 'qc']) < 7)){
        ratio = 0.7
      }else{
        ratio = 0.8
      }
      for(j in 1:n_CV){
        QC.index.train.temp = sample(1L:ncol(e_qc_only), round(ncol(e_qc_only) * ratio))
        QC.index.test.temp = c(1L:ncol(e_qc_only))[!c(1L:ncol(e_qc_only)) %in% QC.index.train.temp]
        QC.index.train. = rep('sample', ncol(e_qc_only))
        QC.index.test. = rep('sample', ncol(e_qc_only))
        QC.index.train.[QC.index.train.temp] = 'qc'
        QC.index.test.[QC.index.test.temp] = 'qc'
        QC.index.train[[j]] = QC.index.train.
        QC.index.test[[j]] = QC.index.test.
      }
      loess_fun_cv = function(e, train.index = QC.index, test.index = NULL, batch, time){
        cl = makeCluster(detectCores() * detectcores_ratio)
        e_norm = parSapply(cl, X = 1:nrow(e), function(i, e, train.index, batch, time, remove_outlier, loess_wrapper_extrapolate){
          e_norm = tryCatch({
            line = e[i,]
            for(b in 1:length(unique(batch))){
              outlier_remove = remove_outlier(e[i, (batch %in% unique(batch)[b]) & (train.index == 'qc')])
              if(length(outlier_remove$index) == 0){
                lm = loess_wrapper_extrapolate(x = time[(batch %in% unique(batch)[b]) & (train.index == 'qc')], y = e[i, (batch %in% unique(batch)[b]) & (train.index == 'qc')])[[1]]
              }else{
                lm = loess_wrapper_extrapolate(x = time[(batch %in% unique(batch)[b]) & (train.index == 'qc')][-outlier_remove$index], y = e[i, (batch %in% unique(batch)[b]) & (train.index == 'qc')][-outlier_remove$index])[[1]]
              }
              line[batch %in% unique(batch)[b]] = predict(lm, newdata = time[batch %in% unique(batch)[b]])
              # Impute non-predictable time points from the nearest predicted neighbor.
              if(length(which(is.na(line[batch %in% unique(batch)[b]]))) > 0){
                for(j in which(is.na(line[batch %in% unique(batch)[b]]))){
                  time_notNA = time[batch %in% unique(batch)[b]][-which(is.na(line[batch %in% unique(batch)[b]]))]
                  closest_time = time_notNA[which.min(abs(time_notNA - time[batch %in% unique(batch)[b]][j]))]
                  line[batch %in% unique(batch)[b]][j] = line[batch %in% unique(batch)[b]][which(time[batch %in% unique(batch)[b]] == closest_time)]
                }
              }
            }
            if(sum(line < 0) > (length(line) / 5)){
              stop("too many negative values. LOESS failed.")
            }else{
              line[line < 0] = runif(sum(line < 0), min = max(c(median(e[i,], na.rm = TRUE) - 0.1 * sd(e[i,], na.rm = TRUE), 0)), max = max(c(median(e[i,], na.rm = TRUE) + 0.1 * sd(e[i,], na.rm = TRUE), 1)))
            }
            e[i,] / (line / median(e[i,], na.rm = TRUE))
          }, error = function(er){
            e[i,]
          })
          return(e_norm)
        }, e, train.index, batch, time, remove_outlier, loess_wrapper_extrapolate)
        stopCluster(cl)
        e_norm = t(e_norm)
        if(!is.null(test.index)){
          rsd = RSD(e_norm[, test.index == 'qc'])
        }else{
          rsd = NULL
        }
        return(list(data = e_norm, rsd = rsd))
      }
      # Cross-validate on QCs only: train on a QC subset, score RSD on held-out QCs.
      rsds = matrix(NA, nrow = nrow(e), ncol = n_CV)
      for(i in 1:n_CV){
        rsds[,i] = loess_fun_cv(e = e_qc_only, train.index = QC.index.train[[i]], test.index = QC.index.test[[i]], batch = batch[p$sampleType == 'qc'], time = time[p$sampleType == 'qc'])[[2]]
      }
      qc_RSD = apply(rsds, 1, mean, na.rm = TRUE)
      e_norm = loess_fun_cv(e, train.index = QC.index, test.index = NULL, batch = batch, time = time)[[1]]
      rownames(e_norm) = rownames(e)
      colnames(e_norm) = colnames(e)
      qc_RSDs[['batch_loess']] = qc_RSD
      calculation_times[['batch_loess']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['batch_loess']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['batch_loess']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['batch_loess']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['batch_loess']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['batch_loess']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }
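  # SERRF (Systematic Error Removal using Random Forest): for each compound and
  # each batch, a random forest is trained on the QCs, using the compounds most
  # correlated with it (in both the QC and the target samples) as predictors.
  # The predictions act as the systematic-error pattern that the raw values are
  # divided by, so both time-dependent drift and batch effects are removed at once.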
  if('SERRF' %in% methods){
    normalized_dataset[['SERRF']] = tryCatch({
      cat("\n(This may take a while...)\n")
      e_norm = matrix(NA, nrow = nrow(e), ncol = ncol(e))
      QC.index = p[["sampleType"]]
      batch = p[["batch"]]
      time = p[["time"]]
      batch = factor(batch)
      num = 10
      start = Sys.time()
      cl = makeCluster(detectCores() * detectcores_ratio)
      # e_train = e_qc; e_target = e_validates[["Biorec"]]; num = 10
      # p_train = p_qc; p_target = p_validates[["Biorec"]]
      serrfR = function(train = e[, p$sampleType == 'qc'], target = e[, p$sampleType == 'sample'], num = 10, batch. = factor(c(batch[p$sampleType == 'qc'], batch[p$sampleType == 'sample'])), time. = c(time[p$sampleType == 'qc'], time[p$sampleType == 'sample']), sampleType. = c(p$sampleType[p$sampleType == 'qc'], p$sampleType[p$sampleType == 'sample']), cl){
        all = cbind(train, target)
        normalized = rep(0, ncol(all))
        # Replace zeros and NAs with small random values near each batch minimum.
        for(j in 1:nrow(all)){
          for(b in 1:length(unique(batch.))){
            current_batch = levels(batch.)[b]
            all[j, batch. %in% current_batch][all[j, batch. %in% current_batch] == 0] = rnorm(length(all[j, batch. %in% current_batch][all[j, batch. %in% current_batch] == 0]), mean = min(all[j, batch. %in% current_batch][!is.na(all[j, batch. %in% current_batch])]) + 1, sd = 0.1 * (min(all[j, batch. %in% current_batch][!is.na(all[j, batch. %in% current_batch])]) + .1))
            all[j, batch. %in% current_batch][is.na(all[j, batch. %in% current_batch])] = rnorm(length(all[j, batch. %in% current_batch][is.na(all[j, batch. %in% current_batch])]), mean = 0.5 * min(all[j, batch. %in% current_batch][!is.na(all[j, batch. %in% current_batch])]) + 1, sd = 0.1 * (min(all[j, batch. %in% current_batch][!is.na(all[j, batch. %in% current_batch])]) + .1))
          }
        }
        # Per-batch Spearman correlations among compounds, computed separately on
        # the training (QC) and the target samples.
        corrs_train = list()
        corrs_target = list()
        for(b in 1:length(unique(batch.))){
          current_batch = levels(batch.)[b]
          train_scale = t(apply(train[, batch.[sampleType. == 'qc'] %in% current_batch], 1, scale))
          if(is.null(target[, batch.[!sampleType. == 'qc'] %in% current_batch])){
            target_scale = t(apply(target[, batch.[!sampleType. == 'qc'] %in% current_batch], 1, scale))
          }else{
            target_scale = scale(target[, batch.[!sampleType. == 'qc'] %in% current_batch])
          }
          corrs_train[[current_batch]] = cor(t(train_scale), method = "spearman")
          corrs_target[[current_batch]] = cor(t(target_scale), method = "spearman")
        }
        pred = parSapply(cl, X = 1:nrow(all), function(j, all, batch., ranger, sampleType., time., num, corrs_train, corrs_target){
          normalized = rep(0, ncol(all))
          for(b in 1:length(levels(batch.))){
            current_batch = levels(batch.)[b]
            e_current_batch = all[, batch. %in% current_batch]
            corr_train = corrs_train[[current_batch]]
            corr_target = corrs_target[[current_batch]]
            corr_train_order = order(abs(corr_train[, j]), decreasing = TRUE)
            corr_target_order = order(abs(corr_target[, j]), decreasing = TRUE)
            # Select the num compounds most correlated with compound j in both
            # the training and the target samples.
            sel_var = c()
            l = num
            while(length(sel_var) < (num)){
              sel_var = intersect(corr_train_order[1:l], corr_target_order[1:l])
              sel_var = sel_var[!sel_var == j]
              l = l + 1
            }
            train.index_current_batch = sampleType.[batch. %in% current_batch]
            train_data_y = scale(e_current_batch[j, train.index_current_batch == 'qc'], scale = F)
            train_data_x = apply(e_current_batch[sel_var, train.index_current_batch == 'qc'], 1, scale)
            if(is.null(dim(e_current_batch[sel_var, !train.index_current_batch == 'qc']))){
              test_data_x = t(scale(e_current_batch[sel_var, !train.index_current_batch == 'qc']))
            }else{
              test_data_x = apply(e_current_batch[sel_var, !train.index_current_batch == 'qc'], 1, scale)
            }
            train_NA_index = apply(train_data_x, 2, function(x){
              sum(is.na(x)) > 0
            })
            train_data_x = train_data_x[, !train_NA_index]
            test_data_x = test_data_x[, !train_NA_index]
            if(!is.matrix(test_data_x)){
              test_data_x = t(test_data_x)
            }
            good_column = apply(train_data_x, 2, function(x){sum(is.na(x)) == 0}) & apply(test_data_x, 2, function(x){sum(is.na(x)) == 0})
            train_data_x = train_data_x[, good_column]
            test_data_x = test_data_x[, good_column]
            if(!is.matrix(test_data_x)){
              test_data_x = t(test_data_x)
            }
            train_data = data.frame(y = train_data_y, train_data_x)
            if(ncol(train_data) == 1){
              # Some compounds have constant QC values; leave them unchanged.
              norm = e_current_batch[j,]
              normalized[batch. %in% current_batch] = norm
            }else{
              colnames(train_data) = c("y", paste0("V", 1:(ncol(train_data) - 1)))
              model = ranger(y ~ ., data = train_data)
              test_data = data.frame(test_data_x)
              colnames(test_data) = colnames(train_data)[-1]
              norm = e_current_batch[j,]
              norm[train.index_current_batch == 'qc'] = e_current_batch[j, train.index_current_batch == 'qc'] / ((predict(model, data = train_data)$predictions + mean(e_current_batch[j, train.index_current_batch == 'qc'], na.rm = TRUE)) / mean(all[j, sampleType. == 'qc'], na.rm = TRUE))
              norm[!train.index_current_batch == 'qc'] = (e_current_batch[j, !train.index_current_batch == 'qc']) / ((predict(model, data = test_data)$predictions + mean(e_current_batch[j, !train.index_current_batch == 'qc'], na.rm = TRUE)) / (median(all[j, !sampleType. == 'qc'], na.rm = TRUE)))
              norm[!train.index_current_batch == 'qc'][norm[!train.index_current_batch == 'qc'] < 0] = e_current_batch[j, !train.index_current_batch == 'qc'][norm[!train.index_current_batch == 'qc'] < 0]
              # Re-center each group at its overall median, then patch non-finite
              # values and extreme outliers.
              norm[train.index_current_batch == 'qc'] = norm[train.index_current_batch == 'qc'] / (median(norm[train.index_current_batch == 'qc'], na.rm = TRUE) / median(all[j, sampleType. == 'qc'], na.rm = TRUE))
              norm[!train.index_current_batch == 'qc'] = norm[!train.index_current_batch == 'qc'] / (median(norm[!train.index_current_batch == 'qc'], na.rm = TRUE) / median(all[j, !sampleType. == 'qc'], na.rm = TRUE))
              norm[!is.finite(norm)] = rnorm(length(norm[!is.finite(norm)]), sd = sd(norm[is.finite(norm)], na.rm = TRUE) * 0.01)
              out = boxplot.stats(norm, coef = 3)$out
              norm[!train.index_current_batch == 'qc'][norm[!train.index_current_batch == 'qc'] %in% out] = ((e_current_batch[j, !train.index_current_batch == 'qc']) - ((predict(model, data = test_data)$predictions + mean(e_current_batch[j, !train.index_current_batch == 'qc'], na.rm = TRUE)) - (median(all[j, !sampleType. == 'qc'], na.rm = TRUE))))[norm[!train.index_current_batch == 'qc'] %in% out]
              norm[!train.index_current_batch == 'qc'][norm[!train.index_current_batch == 'qc'] < 0] = e_current_batch[j, !train.index_current_batch == 'qc'][norm[!train.index_current_batch == 'qc'] < 0]
              normalized[batch. %in% current_batch] = norm
            }
          }
          return(normalized)
        }, all, batch., ranger, sampleType., time., num, corrs_train, corrs_target)
        normed = t(pred)
        # Patch remaining NAs and negative values with small positive noise.
        normed_target = normed[, !sampleType. == 'qc']
        for(i in 1:nrow(normed_target)){
          normed_target[i, is.na(normed_target[i,])] = rnorm(sum(is.na(normed_target[i,])), mean = min(normed_target[i, !is.na(normed_target[i,])], na.rm = TRUE), sd = sd(normed_target[i, !is.na(normed_target[i,])]) * 0.1)
        }
        for(i in 1:nrow(normed_target)){
          normed_target[i, normed_target[i,] < 0] = runif(1) * min(normed_target[i, normed_target[i,] > 0], na.rm = TRUE)
        }
        normed_train = normed[, sampleType. == 'qc']
        for(i in 1:nrow(normed_train)){
          normed_train[i, is.na(normed_train[i,])] = rnorm(sum(is.na(normed_train[i,])), mean = min(normed_train[i, !is.na(normed_train[i,])], na.rm = TRUE), sd = sd(normed_train[i, !is.na(normed_train[i,])]) * 0.1)
        }
        for(i in 1:nrow(normed_train)){
          normed_train[i, normed_train[i,] < 0] = runif(1) * min(normed_train[i, normed_train[i,] > 0], na.rm = TRUE)
        }
        return(list(normed_train = normed_train, normed_target = normed_target))
      }
      serrf_normalized = e
      serrf_normalized_modeled = serrfR(train = e_qc, target = e_sample, num = num, batch. = factor(c(p_qc$batch, p_sample$batch)), time. = c(p_qc$time, p_sample$time), sampleType. = c(p_qc$sampleType, p_sample$sampleType), cl)
      serrf_qc = serrf_normalized_modeled$normed_train
      colnames(serrf_qc) = colnames(e_qc)
      serrf_sample = serrf_normalized_modeled$normed_target
      colnames(serrf_sample) = colnames(e_sample)
      # Cross-validation on the QCs: hold out a QC subset, normalize it as if it
      # were target samples, and score RSD on the held-out part.
      serrf_cross_validated_qc = e_qc
      cv = 5
      RSDs = list()
      if(any(table(p_qc$batch) < 7)){
        ratio = 0.7
      }else{
        ratio = 0.8
      }
      test_indexes = split(1L:nrow(p_qc), c(1L:nrow(p_qc)) %% cv)
      for(k in 1:cv){
        test_index = test_indexes[[k]]
        train_index = c(1L:nrow(p_qc))[-test_index]
        train_index = sample(1L:sum(p$sampleType == 'qc'), round(sum(p$sampleType == 'qc') * ratio))
        test_index = c(1L:sum(p$sampleType == 'qc'))[!(c(1L:sum(p$sampleType == 'qc')) %in% train_index)]
        # Assumption: resample until the held-out QCs cover every batch, then
        # score the fold on the held-out QCs.
        while(length(unique(p_qc$batch[test_index])) < length(unique(p_qc$batch))){
          train_index = sample(1L:sum(p$sampleType == 'qc'), round(sum(p$sampleType == 'qc') * ratio))
          test_index = c(1L:sum(p$sampleType == 'qc'))[!(c(1L:sum(p$sampleType == 'qc')) %in% train_index)]
        }
        serrf_cv = serrfR(train = e_qc[, train_index], target = e_qc[, test_index], num = num, batch. = factor(c(p_qc$batch[train_index], p_qc$batch[test_index])), time. = c(p_qc$time[train_index], p_qc$time[test_index]), sampleType. = c(rep('qc', length(train_index)), rep('sample', length(test_index))), cl)
        serrf_cross_validated_qc[, test_index] = serrf_cv$normed_target
        RSDs[[k]] = RSD(serrf_cv$normed_target)
      }
      qc_RSDs[['SERRF']] = apply(do.call('cbind', RSDs), 1, mean, na.rm = TRUE)
      # Validation samples (if any) are normalized with the QCs as the training
      # set, mirroring the commented e_train/e_target example above.
      serrf_validates = list()
      if(with_validate){
        for(validate_type in validate_types){
          serrf_validates[[validate_type]] = serrfR(train = e_qc, target = e_validates[[validate_type]], num = num, batch. = factor(c(p_qc$batch, p_validates[[validate_type]]$batch)), time. = c(p_qc$time, p_validates[[validate_type]]$time), sampleType. = c(p_qc$sampleType, rep('sample', nrow(p_validates[[validate_type]]))), cl)$normed_target
          colnames(serrf_validates[[validate_type]]) = colnames(e_validates[[validate_type]])
        }
      }
      stopCluster(cl)
      e_norm = aggregate_e(serrf_qc, serrf_sample, serrf_validates)
      rownames(e_norm) = rownames(e)
      calculation_times[['SERRF']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['SERRF']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['SERRF']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['SERRF']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['SERRF']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['SERRF']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }

  # svm: fit a per-batch linear SVM of QC intensity on injection time for each
  # compound, then divide by the fitted drift curve.
  if('svm' %in% methods){
    Sys.sleep(2)
    normalized_dataset[['svm']] = tryCatch({
      cat("\n(This may take a while...)\n")
      start = Sys.time()
      # https://pubs.rsc.org/en/content/articlepdf/2015/an/c5an01638j
      e_norm = matrix(NA, nrow = nrow(e), ncol = ncol(e))
      QC.index = p[["sampleType"]]
      batch = p[["batch"]]
      time = as.numeric(p[["time"]])
      n_CV = 5
      seed = 1
      set.seed(seed)
      QC.index.train = QC.index.test = list()
      e_qc_only = e[, p$sampleType == 'qc']
      if(any(table(p$batch[p$sampleType == 'qc']) < 7)){
        ratio = 0.7
      }else{
        ratio = 0.8
      }
      for(j in 1:n_CV){
        QC.index.train.temp = sample(1L:ncol(e_qc_only), round(ncol(e_qc_only) * ratio))
        QC.index.test.temp = c(1L:ncol(e_qc_only))[!c(1L:ncol(e_qc_only)) %in% QC.index.train.temp]
        QC.index.train. = rep('sample', ncol(e_qc_only))
        QC.index.test. = rep('sample', ncol(e_qc_only))
        QC.index.train.[QC.index.train.temp] = 'qc'
        QC.index.test.[QC.index.test.temp] = 'qc'
        QC.index.train[[j]] = QC.index.train.
        QC.index.test[[j]] = QC.index.test.
      }
      svm_fun_cv = function(e, train.index = QC.index, test.index = NULL, batch, time){
        cl = makeCluster(detectCores() * detectcores_ratio)
        e_norm = parSapply(cl, X = 1:nrow(e), function(i, e, train.index, batch, time, remove_outlier, trainControl, train){
          tryCatch({
            line = e[i,]
            for(b in 1:length(unique(batch))){
              outlier_remove = remove_outlier(e[i, (batch %in% unique(batch)[b]) & train.index == 'qc'])
              if(length(outlier_remove$index) == 0){
                dta = data.frame(x = time[(batch %in% unique(batch)[b]) & train.index == 'qc'], y = e[i, (batch %in% unique(batch)[b]) & train.index == 'qc'])
                lm = train(y ~ ., data = dta, method = "svmLinear", trControl = trainControl(method = "cv", savePred = T))
              }else{
                dta = data.frame(x = time[(batch %in% unique(batch)[b]) & train.index == 'qc'][-outlier_remove$index], y = e[i, (batch %in% unique(batch)[b]) & train.index == 'qc'][-outlier_remove$index])
                lm = train(y ~ ., data = dta, method = "svmLinear", trControl = trainControl(method = "cv", savePred = T))
              }
              line[batch %in% unique(batch)[b]] = predict(lm, newdata = data.frame(x = time[batch %in% unique(batch)[b]]))
            }
            # Impute non-predictable time points from the nearest predicted neighbor.
            if(length(which(is.na(line))) > 0){
              for(j in which(is.na(line))){
                time_notNA = time[-which(is.na(line))]
                closest_time = time_notNA[which.min(abs(time_notNA - time[j]))]
                line[j] = line[which(time == closest_time)]
              }
            }
            e[i,] / (line / median(e[i,], na.rm = TRUE))
          }, error = function(er){
            e[i,]
          })
        }, e, train.index, batch, time, remove_outlier, trainControl, train)
        stopCluster(cl)
        e_norm = t(e_norm)
        if(!is.null(test.index)){
          rsd = RSD(e_norm[, test.index == 'qc'])
        }else{
          rsd = NULL
        }
        return(list(data = e_norm, rsd = rsd))
      }
      rsds = matrix(NA, nrow = nrow(e), ncol = n_CV)
      for(i in 1:n_CV){
        rsds[,i] = svm_fun_cv(e = e_qc_only, train.index = QC.index.train[[i]], test.index = QC.index.test[[i]], batch = batch[p$sampleType == 'qc'], time = time[p$sampleType == 'qc'])[[2]]
      }
      qc_RSD = apply(rsds, 1, mean, na.rm = TRUE)
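      # As with batch_loess, the averaged held-out QC RSD above is what gets
      # reported for 'svm'; the final fit below retrains on all QCs to produce
      # the normalized matrix.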
      e_norm = svm_fun_cv(e, train.index = QC.index, test.index = NULL, batch = batch, time = time)[[1]]
      rownames(e_norm) = rownames(e)
      colnames(e_norm) = colnames(e)
      qc_RSDs[['svm']] = qc_RSD
      calculation_times[['svm']] = Sys.time() - start
      cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['svm']], na.rm = TRUE), 4) * 100, "%.\n"))
      cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['svm']] < 0.2, na.rm = TRUE), ".\n"))
      if(with_validate){
        for(validate_type in validate_types){
          val_RSDs[[validate_type]][['svm']] = RSD(e_norm[, p$sampleType %in% validate_type])
          cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['svm']], na.rm = TRUE), 4) * 100, "%.\n"))
          cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['svm']] < 0.2, na.rm = TRUE), ".\n"))
        }
      }
      e_norm
    }, error = function(error){
      error
    })
  }

  # nomis: remove variation explained by the internal standards (ISTDs).
  if('nomis' %in% methods){
    Sys.sleep(2)
    cat("\n")
    normalized_dataset[['nomis']] = tryCatch({
      cat("nomis normalization.\n")
      start = Sys.time()
      skip_NOMIS = FALSE
      if(!'compoundType' %in% colnames(f)){
        cat(paste0("warning: 'compoundType' is not found. NOMIS is skipped.\n"))
        skip_NOMIS = TRUE
      }else{
        IS_column = f[['compoundType']]
        if(!'istd' %in% unique(f[['compoundType']])){
          cat(paste0("'istd' (case-sensitive) is not found in the 'compoundType'. NOMIS is skipped.\n"))
          skip_NOMIS = TRUE
        }
      }
      if(!skip_NOMIS){
        IS_column = f[['compoundType']]
        istd_index = which(IS_column %in% 'istd')
        # Variance-stabilizing log transform before fitting NOMIS.
        inputdata = data.frame(Group = "A", t(log((e + sqrt(e^2 + 4)) * 0.5, base = exp(1))))
        colnames(inputdata) = c("Group", f$label)
        rownames(inputdata) = paste0("S", 1:nrow(inputdata))
        normed = exp(t(Normalise(inputdata, method = 'nomis', nc = istd_index)$output[,-1]))
        qc_RSDs[['nomis']] = RSD(normed[, p$sampleType == 'qc'])
        # The ISTD rows themselves are kept unnormalized on top of the NOMIS output.
        e_norm = rbind(e[istd_index,], normed)
        calculation_times[['nomis']] = Sys.time() - start
        cat(paste0("Average QC RSD:", signif(median(qc_RSDs[['nomis']], na.rm = TRUE), 4) * 100, "%.\n"))
        cat(paste0("Number of compounds less than 20% QC RSD:", sum(qc_RSDs[['nomis']] < 0.2, na.rm = TRUE), ".\n"))
        if(with_validate){
          for(validate_type in validate_types){
            val_RSDs[[validate_type]][['nomis']] = RSD(e_norm[, p$sampleType %in% validate_type])
            cat(paste0("Average ", validate_type, " RSD:", signif(median(val_RSDs[[validate_type]][['nomis']], na.rm = TRUE), 4) * 100, "%.\n"))
            cat(paste0("Number of compounds less than 20% ", validate_type, " RSD:", sum(val_RSDs[[validate_type]][['nomis']] < 0.2, na.rm = TRUE), ".\n"))
          }
        }
      }else{
        e_norm = NA
      }
      e_norm
    }, error = function(error){
      error
    })
  }

  # Put the all-infinite compounds back in their original rows.
  if(!length(infinite_index) == 0){
    e_full = rbind(e_infinite, e)
    f_full = rbind(f_infinite, f)
    e_full[infinite_index,] = NA
    e_full[-infinite_index,] = e
    e = e_full
    f_full[infinite_index,] = f_infinite
    f_full[-infinite_index,] = f
    f = f_full
  }
  comb_p = rbind(p_empty_sampleType, p)
  comb_e = cbind(e_empty_sampleType, e)
  comb_p$sample_index = as.numeric(substring(comb_p$sample_index, 2))
  comb_e = comb_e[, order(comb_p$sample_index)]
  if(!length(infinite_index) == 0){
    for(i in 1:length(normalized_dataset)){
      e_full = rbind(e_infinite, normalized_dataset[[i]])
      f_full = rbind(f_infinite, f)
      e_full[infinite_index,] = NA
      e_full[-infinite_index,] = normalized_dataset[[i]]
      normalized_dataset[[i]] = e_full
    }
  }
  normalized_dataset = normalized_dataset[!is.na(normalized_dataset)]
  for(i in 1:length(normalized_dataset)){
    normalized_dataset[[i]] = cbind(e_empty_sampleType, normalized_dataset[[i]])
    normalized_dataset[[i]] = normalized_dataset[[i]][, order(comb_p$sample_index)]
  }
  comb_p = comb_p[order(comb_p$sample_index),]

  # Build the output folder next to the input file (Windows-style path separators).
  if(grepl("\\\\", file_location)){
    comp = strsplit(file_location, "\\\\")[[1]]
  }else{
    comp = strsplit(file_location, "/")[[1]]
  }
  filename = gsub("\\.csv|\\.xlsx", "", comp[length(comp)])
  root = paste0(paste0(comp[-length(comp)], collapse = "\\"), "\\")
  dir = paste0(root, filename, " - normalization result")
  dir.create(dir)

  png(filename = paste0(dir, "\\", "Bar Plot and PCA plot.png"), width = 2000, height = 1000 * ifelse(with_validate, 3, 2))
  qc_RSD_performance = sapply(qc_RSDs, median, na.rm = TRUE)
  qc_RSD_performance = sort(qc_RSD_performance, decreasing = TRUE)
  qc_RSD_performance_color = rep("grey", length(qc_RSD_performance))
  qc_RSD_performance_color[length(qc_RSD_performance_color) - 1] = "red"
  qc_RSD_performance_color[length(qc_RSD_performance_color)] = "#ffbf00"
  qc_RSD_performance_color[names(qc_RSD_performance) == 'none'] = 'black'
  val_RSD_performances = list()
  val_RSD_performances_color = list()
  if(with_validate){
    for(validate_type in validate_types){
      val_RSD_performances[[validate_type]] = sapply(val_RSDs[[validate_type]], median, na.rm = TRUE)
      val_RSD_performances[[validate_type]] = sort(val_RSD_performances[[validate_type]], decreasing = TRUE)
      val_RSD_performances_color[[validate_type]] = rep("grey", length(val_RSD_performances[[validate_type]]))
      val_RSD_performances_color[[validate_type]][length(val_RSD_performances_color[[validate_type]]) - 1] = "red"
      val_RSD_performances_color[[validate_type]][length(val_RSD_performances_color[[validate_type]])] = "#ffbf00"
      val_RSD_performances_color[[validate_type]][names(val_RSD_performances[[validate_type]]) == 'none'] = 'black'
    }
    layout(matrix(c(rep(1:(length(validate_types) + 1), each = 2), length(validate_types) + 2, length(validate_types) + 3), ncol = 2, byrow = TRUE))
  }else{
    layout(matrix(c(1, 1, 2, 3), 2, 2, byrow = TRUE))
  }
  par(lty = 0)
  par(mar = c(5, 4, 4, 2) * 3)
  bp = barplot(qc_RSD_performance * 100, main = "QC RSD", xlab = "", ylab = "RSD (%)", col = qc_RSD_performance_color, width = 1, las = 2, cex.axis = 5, cex.names = 5, cex.main = 5)
  text(bp[which(names(qc_RSD_performance) == 'none'), 1], qc_RSD_performance['none'] * 100, paste0(signif(qc_RSD_performance['none'], 4) * 100, "%"), cex = 5, pos = 3)
  text(bp[nrow(bp), 1], qc_RSD_performance[length(qc_RSD_performance)] * 100, paste0(signif(qc_RSD_performance[length(qc_RSD_performance)], 4) * 100, "%"), cex = 5, pos = 3)
  if(with_validate){
    for(validate_type in validate_types){
      par(lty = 0)
      par(mar = c(5, 4, 4, 2) * 3)
      bp = barplot(val_RSD_performances[[validate_type]] * 100, main = paste0(validate_type, " Sample RSD"), xlab = "", ylab = "RSD (%)", col = val_RSD_performances_color[[validate_type]], width = 1, las = 2, cex.axis = 5, cex.names = 5, cex.main = 5)
      text(bp[which(names(val_RSD_performances[[validate_type]]) == 'none'), 1], val_RSD_performances[[validate_type]]['none'] * 100, paste0(signif(val_RSD_performances[[validate_type]]['none'], 4) * 100, "%"), cex = 5, pos = 3)
      text(bp[nrow(bp), 1], val_RSD_performances[[validate_type]][length(val_RSD_performances[[validate_type]])] * 100, paste0(signif(val_RSD_performances[[validate_type]][length(val_RSD_performances[[validate_type]])], 4) * 100, "%"), cex = 5, pos = 3)
    }
  }
  # PCA of the raw data versus the best method (lowest median QC RSD).
  comb_p$sampleType[is.na(comb_p$sampleType)] = "NA"
  pca_color = factor(comb_p$sampleType, levels = c('sample', 'qc', validate_types, "NA"))
  dots = c(1, 16, rep(16, length(validate_types)), 1)[as.numeric(pca_color)]
  if(!length(infinite_index) == 0){
    sds = apply(comb_e[-infinite_index,], 1, sd)
    pca_before = prcomp(t(comb_e[-infinite_index,][!sds == 0,]), scale. = TRUE)
  }else{
    sds = apply(comb_e, 1, sd)
    pca_before = prcomp(t(comb_e[!sds == 0,]), scale. = TRUE)
  }
  par(mar = c(4, 2, 4, 2) * 3)
  plot(pca_before$x[,1], pca_before$x[,2], col = pca_color, main = 'Before', xlab = 'raw data', cex.lab = 5, yaxt = 'n', cex.axis = 5, cex.main = 5, cex.sub = 5, ylab = "", xaxt = 'n', cex = 5, pch = dots)
  best_method_name = names(qc_RSD_performance)[length(qc_RSD_performance)]
  if(!length(infinite_index) == 0){
    sds = apply(normalized_dataset[[best_method_name]][-infinite_index,], 1, sd)
    pca_after = prcomp(t(normalized_dataset[[best_method_name]][-infinite_index,][!sds == 0,]), scale. = TRUE)
  }else{
    sds = apply(normalized_dataset[[best_method_name]], 1, sd)
    pca_after = prcomp(t(normalized_dataset[[best_method_name]][!sds == 0,]), scale. = TRUE)
  }
  plot(pca_after$x[,1], pca_after$x[,2], col = pca_color, main = 'After', xlab = best_method_name, cex.lab = 5, cex.axis = 5, cex.main = 5, cex.sub = 5, ylab = "", yaxt = 'n', xaxt = 'n', cex = 5, pch = dots)
  dev.off()

  dir.create(paste0(dir, "\\normalized datasets"))
  for(i in 1:length(normalized_dataset)){
    method = names(normalized_dataset[i])
    if(is.matrix(normalized_dataset[[i]])){
      colnames(normalized_dataset[[i]]) = comb_p$label
      normalized_dataset[[i]][data$e_matrix == 0] = 0
      normalized_dataset[[i]][is.na(data$e_matrix)] = NA
      fwrite(data.table(label = f$label, normalized_dataset[[i]]), paste0(dir, "\\normalized datasets\\normalized by - ", method, '.csv'))
    }else{
      cat("Error in ", method, ": ", conditionMessage(normalized_dataset[[i]]), "\n")
    }
  }
  # Pad the RSD vectors back to full length before writing them out.
  if(!length(infinite_index) == 0){
    for(i in 1:length(qc_RSDs)){
      qc_RSD_full = rep(0, nrow(f))
      qc_RSD_full[-infinite_index] = qc_RSDs[[i]]
      qc_RSDs[[i]] = qc_RSD_full
    }
  }
  fwrite(data.table(label = f$label, do.call('cbind', qc_RSDs)), paste0(dir, "//", "QC - RSD.csv"))
  if(with_validate){
    for(validate_type in validate_types){
      if(!length(infinite_index) == 0){
        for(i in 1:length(val_RSDs[[validate_type]])){
          val_RSD_full = rep(0, nrow(f))
          val_RSD_full[-infinite_index] = val_RSDs[[validate_type]][[i]]
          val_RSDs[[validate_type]][[i]] = val_RSD_full
        }
      }
      fwrite(data.table(label = f$label, do.call('cbind', val_RSDs[[validate_type]])), paste0(dir, "//", "Validate Samples - ", validate_type, " - RSD.csv"))
    }
  }
  if(scatter_plot){
    # Time consuming.
    comb_p$sampleType[is.na(comb_p$sampleType)] = "NA"
    cat("Generating scatter plots for each compound. Please be patient...\n")
    dir.create(paste0(dir, "\\scatter plots"))
    for(i in 1:length(normalized_dataset)){
      method = names(normalized_dataset)[i]
      if(is.matrix(normalized_dataset[[i]])){
        normalized = normalized_dataset[[i]]
        dir.create(paste0(dir, "\\scatter plots\\", method))
        e = normalized_dataset[['none']]
        for(j in 1:nrow(e)){
          if(!j %in% infinite_index){
            png(paste0(dir, "\\scatter plots\\", method, "\\", j, "th.png"), width = 480 * 2, height = 480)
            par(mfrow = c(1, 2))
            ylim = c(min(e[j,], normalized[j,]), max(e[j,], normalized[j,]))
            if(Inf %in% ylim){
              ylim[2] = max(e[j, !is.infinite(e[j,])], normalized[j, !is.infinite(normalized[j,])]) * 1.1
            }
            if(sum(is.na(ylim)) < 1){
              plot(comb_p$time, normalized_dataset[['none']][j,], col = factor(comb_p$sampleType, levels = c('sample', 'qc', validate_types, "NA")), ylim = ylim, main = f$label[j])
              plot(comb_p$time, normalized[j,], col = factor(comb_p$sampleType, levels = c('sample', 'qc', validate_types, "NA")), ylim = ylim)
            }
            dev.off()
          }
        }
      }
    }
  }
  if(!exists("serrf_cross_validated_qc")){
    serrf_cross_validated_qc = FALSE
  }
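  # The returned list holds every normalized matrix, the per-compound QC RSDs and
  # run times per method, and (when SERRF ran) the cross-validated QC matrix.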
  cat(paste0("Good Job! All the normalizations are finished!\nPlease check your folder: '", dir, "'.\n"))
  return(list(normalized_dataset = normalized_dataset, qc_RSDs = qc_RSDs, calculation_times = calculation_times, serrf_cross_validated_qc = serrf_cross_validated_qc))
}

# 'methods' can be either "all" or a c() that can include mTIC, sum, median, PQN,
# contrast, quantile, linear, liwong, cubic, batch_ratio, batch_loess, SERRF, svm,
# nomis or bmis.
o = normalize(methods = "all", scatter_plot = F, detectcores_ratio = 1)
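# A minimal sketch of inspecting the result (assumes the call above succeeded;
# the field names are those returned by normalize()): rank methods by median QC
# RSD and look at the winning normalized matrix.
qc_medians = sapply(o$qc_RSDs, median, na.rm = TRUE)
best_method = names(which.min(qc_medians))
cat("Best method by median QC RSD:", best_method, "\n")
str(o$normalized_dataset[[best_method]])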