########################################################################################
# name: Calculate vector combinations
# author: Martin Stoeter
# category: calculators

Calculates all combinations of the unique values of a selected column. Useful for generating a table for iterating XY-plots (e.g. A, B, C will be A vs B, A vs C, B vs C).
- unique_combinations: see above
- all_combinations: also A vs A, B vs B, C vs C
- combiSize: A, B, C, D with combiSize = 3 will be ABC, ABD, ACD, BCD (maximum of combiSize = 4 for all_combinations)
- pairwise_with_second_column: all combinations of the unique values of two columns (e.g. A, B, C and D, E, F will be A vs D, A vs E, ..., C vs F)
######

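Not part of the template code itself - a minimal stand-alone sketch of the three combination modes, using an invented data frame kIn with a column named 'treatment' and combiSize fixed to 2 for brevity:

# invented example table and settings
kIn <- data.frame(treatment = c("A", "B", "C", "A"), stringsAsFactors = FALSE)
combiColumn <- "treatment"
uniqueVals  <- unique(kIn[, combiColumn])          # "A" "B" "C"

# unique_combinations (combiSize = 2): A-B, A-C, B-C
uniqueCombis <- as.data.frame(t(combn(uniqueVals, 2)))

# all_combinations: additionally A-A, B-B, C-C (and both orders)
allCombis <- expand.grid(uniqueVals, uniqueVals)

# pairwise_with_second_column: cross with the unique values of a second (invented) column
secondVals <- c("D", "E")
pairwise   <- expand.grid(uniqueVals, secondVals)
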
$$$TEMPLATE_DESC$$$ #1.0 Parameter selection # 1) select the column for combination combiColumn = combiSize = # 2) select type of combination combiType = combiColumn2 = length(uniqueCombiColumn))) combiSize <- length(uniqueCombiColumn) #prevent too many combinations #2.3. calculate if ((combiType == "unique_combinations")) { rOut <- as.data.frame(t(combn(uniqueCombiColumn, combiSize))) names(rOut) <- paste(combiColumn, names(rOut), sep="_") } if ((combiType == "all_combinations")) { if (combiSize == 1) rOut <- expand.grid(uniqueCombiColumn) if (combiSize == 2) rOut <- expand.grid(uniqueCombiColumn, uniqueCombiColumn) if (combiSize == 3) rOut <- expand.grid(uniqueCombiColumn, uniqueCombiColumn, uniqueCombiColumn) if (combiSize >= 4) rOut <- expand.grid(uniqueCombiColumn, uniqueCombiColumn, uniqueCombiColumn, uniqueCombiColumn) names(rOut) <- paste(combiColumn, names(rOut), sep="_") } if ((combiType == "pairwise_with_second_column")) { rOut <- expand.grid(uniqueCombiColumn, unique(kIn[, combiColumn2])) names(rOut) <- c(combiColumn, combiColumn2) } ]]> ######################################################################################## # name: Robust linear scaling (Percentile normalization) # author: Marc Bickle # category: normalization Normalizes using a projection of the cell based data between 0 and 1 based on the 1st percentile and the 99th percentile [1] [1] http://www.ncbi.nlm.nih.gov/books/NBK126174/#hcsimage.4_Normalization_of_HCS_data ######

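For illustration only, a minimal sketch of the percentile projection described above on a made-up numeric vector (the template additionally handles readout selection, grouping and treatment columns):

# robust linear scaling between the 1st and 99th percentile
set.seed(42)
values <- rnorm(1000, mean = 10, sd = 3)           # invented readout values

p <- quantile(values, probs = c(0.01, 0.99), na.rm = TRUE)
scaled <- (values - p[1]) / (p[2] - p[1])          # ~0 at the 1st percentile, ~1 at the 99th

summary(scaled)
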
$$$TEMPLATE_DESC$$$

# Parameter selection
selReadouts = c()
Group = ;
Treatment = ;

########################################################################################
# name: Levene's test of equal variance
# author: Marc Bickle
# category: statistical hypothesis tests

Performs Levene's test of homoscedasticity for two groups of a population.
1) define the parameters to be tested
2) define the population column with the groups
3) choose how to compute the center of each population; the median is more robust
######

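An illustrative stand-alone sketch of the test (not the template code), assuming the 'car' package is installed for leveneTest(); the columns 'readout' and 'group' are invented for the example:

# Levene's test of equal variance for two groups, centered on the median
library(car)

df <- data.frame(
  readout = c(rnorm(50, sd = 1), rnorm(50, sd = 2)),
  group   = factor(rep(c("control", "treated"), each = 50))
)

leveneTest(readout ~ group, data = df, center = median)
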
Performs Levene's test of equal variance.

# Parameter selection
selReadouts = c()
Group = ;
Centering = ;

########################################################################################
# name: KS-test OR unpaired Wilcoxon rank test (nonparametric tests)
# author: Marc Bickle, Antje Janosch
# category: statistical hypothesis tests

Calculates the likelihood that two non-normal populations are drawn from the same distribution. Choose your readouts of interest and the column providing group information. The test will be performed for each readout and each combination of possible values of the group column. Choose whether you want to do a Kolmogorov-Smirnov test or an unpaired Wilcoxon rank test (Mann-Whitney U test).
######

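A minimal sketch of the two test choices on made-up data (the template additionally loops over all readouts and all pairs of group values):

# compare two groups with a KS test or an unpaired Wilcoxon (Mann-Whitney U) test
x <- rexp(100, rate = 1)               # group 1, non-normal
y <- rexp(100, rate = 0.5)             # group 2

ks.test(x, y)                          # Kolmogorov-Smirnov test
wilcox.test(x, y, paired = FALSE)      # unpaired Wilcoxon rank test
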
Calculates the likelihood that two non-normal populations are drawn from the same distribution. Choose your readouts of interest and the column providing group information. The test will be performed for each readout and each combination of possible values of the group column. Choose whether you want to do a Kolmogorov-Smirnov test or an unpaired Wilcoxon rank test (Mann-Whitney U test).

# Parameter selection
test = ;
selReadouts = c()
group = ;

########################################################################################
# name: Get gene info from NCBI
# author: Martin Stoeter
# category: bioinformatics
# preview:

Gets the gene name, symbol, NewlocusID, CurrentRecord, LastUpdate, locusID, and species from the NCBI database.
- define the column with gene IDs (all unique gene IDs will be queried)
- define the batch size for the query (larger queries often give a time-out error)
Info from the 'NCBI2R' package: Warning: These functions use NCBI's eutils and come with the same user requirements - if performing many queries, you must run the scripts during certain hours when the NCBI servers are not in high demand. Please see the package website for more details: http://NCBI2R.wordpress.com Violation of the terms described there, and the terms on the eutils website, may result in losing access to NCBI for your group.
Output: table from NCBI
NOTE: requires the R library NCBI2R
######

$$$TEMPLATE_DESC$$$

#1.0 Parameter selection
# 1) select the column for gene IDs
geneIDcolumn =
# 2) define batch size
batchSize =

#######################################################################################
# name: Calculate percent responders (2:1, for ArrayScan cell data)
# author: Martin Stoeter
# category: calculators
# preview:

Calculates percent "HIGH" and "LOW" similarly to the ArrayScan BioApplications. Using an upper and a lower threshold value, objects/cells will be classified as above or below these thresholds, and the percentages of all objects within a well will be calculated. This will be done for all wells and all parameters (use a loop for multiple plates).
THIS SNIPPET EXPECTS TWO TABLES: OBJECT/CELL DATA and A TABLE WITH THRESHOLDS PER PARAMETER:
Top input (object/cell data):
- select columns for row and column information
Bottom input (parameter-threshold table):
- select columns that contain parameter names
- select columns that contain upper and lower thresholds
Hint: generate the bottom table by aggregating reference data (e.g. negative control) as mean/median and sd of each parameter, then use a math node to calculate the upper and lower threshold from these values (e.g. mean +/- 2xsd).
ATTENTION: NEEDS 2:1 SNIPPET!!!
######

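A much-reduced sketch of the classification logic for a single well and parameter (the actual template reads the thresholds from the second input table and loops over wells and parameters); all values and thresholds below are invented:

# percent of objects above/below thresholds for one well and one parameter
objectValues <- c(0.2, 0.5, 1.8, 2.4, 0.9, 3.1, NA)
thrHigh <- 2.0
thrLow  <- 0.4

vals <- na.omit(objectValues)
percentHigh <- sum(vals > thrHigh) / length(vals) * 100
percentLow  <- sum(vals < thrLow)  / length(vals) * 100

c(percentHigh = percentHigh, percentLow = percentLow)
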
$$$TEMPLATE_DESC$$$ #1.0 Parameter selection # 1.1 select row and column data rowName = ; colName = ; # 1.2 select parameter column parameterColumn = ; # 1.3 select threshold columns thrHigh = ; thrLow = ; kIn2[which(kIn2[,parameterColumn] == currentParameter),thrHigh])) / length(na.omit(subdataCol[,currentParameter])) * 100 wellList[[wellID]] <- data.frame("parameter" = currentParameter, "plateRow" = row, "plateColumn" = col, "percentHigh" = percentHigh, "percentLow" = percentLow) wellID <- wellID +1 } } } rOut <- data.frame(do.call("rbind", wellList)) ]]> ######################################################################################## # name: Median and MAD # author: Marc Bickle # category: calculators Calculates the median and the median absolute deviation (MAD) of the specified parameters grouped by the specified metadata column. ######

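A minimal base-R sketch of the aggregation (the template works on the selected readouts and the chosen grouping column); the columns 'well' and 'readout' are invented:

# median and MAD of a readout per group
df <- data.frame(
  well    = rep(c("A1", "A2"), each = 5),
  readout = c(rnorm(5, 10), rnorm(5, 20))
)

aggregate(readout ~ well, data = df, FUN = median)
aggregate(readout ~ well, data = df, FUN = mad)    # MAD is scaled by 1.4826 by default
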
$$$TEMPLATE_DESC$$$

#1.0 Parameter selection
selReadouts = c()
Group = c()

########################################################################################
# name: R math / absolute values
# author: Martin Stoeter
# category: calculators

Calculates absolute values of the selected columns. Useful for turning a correlation matrix (e.g. Linear Correlation) containing correlation and anti-correlation into a matrix of absolute correlation values (anti-correlation = correlation).
- select the columns to be returned as absolute values; unselected columns will be returned as they are
- select whether values should be returned as 1-absolute(value), useful for hierarchical clustering, where highly correlating parameters should show a low distance (and therefore cluster together)
Available formulas: a) abs(x), b) 1-abs(x)
######

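A minimal sketch of the two formulas on a small correlation matrix, including the hierarchical-clustering use case mentioned above (the matrix is invented):

# turn a correlation matrix into absolute values, or into 1 - |r| as a distance
corMat <- cor(matrix(rnorm(100), ncol = 4))

absCor  <- abs(corMat)       # a) abs(x): anti-correlation counts as correlation
distCor <- 1 - abs(corMat)   # b) 1 - abs(x): small distance = strong (anti-)correlation

# e.g. feed the distance into hierarchical clustering
hc <- hclust(as.dist(distCor))
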
$$$TEMPLATE_DESC$$$ #1.0 Parameter selection # 1) select the columns parameters = match(c(), names(kIn)); # 2) select math mathForm = c() ######################################################################################## # name: Unique values # author: Antje Niederlein # category: calculators Returns unique values for all selected columns independently. If they differ in length, missing values will be added. ######

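A minimal sketch of the padding behaviour, assuming a small invented input table kIn:

# unique values per selected column, padded with NA to equal length
kIn <- data.frame(a = c(1, 1, 2, 3), b = c("x", "x", "x", "y"), stringsAsFactors = FALSE)

uniqueLists <- lapply(kIn, unique)
maxLen <- max(sapply(uniqueLists, length))
padded <- lapply(uniqueLists, function(v) c(v, rep(NA, maxLen - length(v))))

as.data.frame(padded, stringsAsFactors = FALSE)
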
$$$TEMPLATE_DESC$$$

# 1. Parameter selection
# 1) select the columns
colSelection = match(c(), names(kIn));

########################################################################################
# name: Mahalanobis Distance Calculator
# author: Antje Janosch
# category: distance measures

Calculates the squared Mahalanobis distances (MD) to the mean of a reference population (typically your negative controls).
Option 'Use inverted': If 'yes', the inverted covariance matrix will be used. If 'auto', the script will make a choice based on the determinant of the covariance matrix.
Option 'Outlier p-value': The given p-value is used to calculate an MD threshold. Data rows with an MD above this threshold can be considered outliers.
ATTENTION: This snippet expects a column named 'treatment' with nominal domain values! If it is available, a reference population can be chosen. It will be used if 'Reference population column' is set to -USE SELECTED TREATMENTS- (default). Alternatively, the message 'Error: Could not find attribute: treatment' can be ignored. A reference population column can be selected and the name of the reference population can be given as text.
######

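A minimal stand-alone sketch of the distance and outlier threshold (the template additionally offers the inverted-covariance option and the treatment-based reference selection); the data and the reference subset below are invented:

# squared Mahalanobis distance to a reference population, with an outlier cut-off
set.seed(1)
data <- matrix(rnorm(200 * 3), ncol = 3)              # 200 samples, 3 readouts
reference <- data[1:50, ]                             # e.g. the negative controls

center <- colMeans(reference)
covMat <- cov(reference)

md2 <- mahalanobis(data, center = center, cov = covMat)   # squared distances

# threshold from the chi-square distribution (df = number of readouts)
p_thresh <- 0.01
cutoff <- qchisq(1 - p_thresh, df = ncol(data))
outliers <- md2 > cutoff
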
$$$TEMPLATE_DESC$$$ #1.0 Parameter selection # 1) select the readouts that span the phenotypic space parameters = c() # 2) calculate mean and covariance of the negative controls treatment = p_thresh = as.numeric() useInverted = ###################################################################################### # name: Power transformation and Z rank transformation # author: Martin Stoeter, Caroline Reiche # category: distributions # preview: This rank-preserving data transformation is a pre-processing technique to stabilize the variance and to make the data more normal distribution-like. The Z rank transformation (project_to_Gaussian) projects the data to a random normal distribution. For each group and each parameter the values are transformed independently. - select type of normalization - select parameters that should be rank normalized - select a group column that should be used for ranking (e.g. barcode, date, project name,...) If subset data is chosen as reference, only this subset data is used to calculate the lambda for the transformation (either a) or b); not possible in Z rank!). a) select a column and give name of subset data used as reference b) select subset of column 'treatment' to be used as reference (needs a column with name 'treatment') ######

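A minimal sketch of the Box-Cox branch using the 'car' package functions the template relies on (powerTransform/bcPower); the input vector is invented and strictly positive, as Box-Cox requires:

# Box-Cox power transformation of one readout
library(car)

x <- rexp(500, rate = 0.2) + 0.1            # skewed, strictly positive data

lambda <- powerTransform(x)$lambda          # estimate lambda
xTrans <- bcPower(x, lambda)                # apply the Box-Cox transformation

# Yeo-Johnson works for data containing zero or negative values:
# lambdaYJ <- powerTransform(x, family = "yjPower")$lambda; yjPower(x, lambdaYJ)
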
$$$TEMPLATE_DESC$$$ #1.0 Parameter selection # 1.1 select type of transformation transformation = ; # 1.2 select the columns selectedParams = match(c(), names(kIn)); # 1.3 select group group = # 1.4 select subset subsetOption = subsetColumn = subsetTreatment = 2) { # apply the trafo switch(which(transformation == c('Box-Cox','Yeo-Johnson','basic')), { powers <- c() for (k in 1:ncol(dataMatrix)) { powers[k] <- powerTransform(dataMatrix[,k])$lambda } data[which(data[,group] == allGroups[i]), selectedParams] <- bcPower(data[which(data[,group] == allGroups[i]), selectedParams], powers) #Box-Cox transformedNames <- paste(names(data)[selectedParams], '.bcPower', sep = "")}, { powers <- c() for (k in 1:ncol(dataMatrix)) { powers[k] <- powerTransform(dataMatrix[,k],family="yjPower")$lambda } data[which(data[,group] == allGroups[i]), selectedParams] <- yjPower(data[which(data[,group] == allGroups[i]), selectedParams], powers) #Yeo-Johnson transformedNames <- paste(names(data)[selectedParams], '.yjPower', sep = "")}, { powers <- c() for (k in 1:ncol(dataMatrix)) { powers[k] <- powerTransform(dataMatrix[,k])$lambda } data[which(data[,group] == allGroups[i]), selectedParams] <- basicPower(data[which(data[,group] == allGroups[i]), selectedParams], powers) #basic power transformation transformedNames <- paste(names(data)[selectedParams], '.power', sep = "")}) } else { data[which(data[,group] == allGroups[i]), selectedParams] <- NA } } } #rename transformed data columns #switch(which(transformation == c('Box-Cox','Yeo-Johnson','basic','project_to_Gaussian')), { # transformedNames <- paste(names(data)[selectedParams], '.bcPower', sep = "")}, { #Box-Cox# # transformedNames <- paste(names(data)[selectedParams], '.yjPower', sep = "")}, { #Yeo-Johnson # transformedNames <- paste(names(data)[selectedParams], '.power', sep = "")}, { #basic power transformation # transformedNames <- paste(names(data)[selectedParams], '.zrank', sep = "")} #project_to_Gaussian # ) # bind both dataframes together newColNames <- c(names(kIn), transformedNames) rOut <- cbind(kIn, data[,selectedParams]) names(rOut) <- newColNames ]]> ######################################################################################## # name: Ranking Multiparametric Profiles (for Normalization Carpenter) # author: Marc Bickle # category: distributions Ranks the values of a parameter in a multi parametric profile within a dataset. Best practice is to use normalised data and rank over the entire data set. ######

$$$TEMPLATE_DESC$$$

#1. Parameter selection
selReadouts = c()
Dataset = c();

########################################################################################
# name: Shapiro-Wilk Normality Test (QQ-Plot)
# author: Holger Brandl, Martin Stoeter
# category: distributions
# preview: qq-plot.png

The Shapiro-Wilk normality test calculates statistics to judge the distribution of the data (with respect to a normal distribution). The QQ-plot visualizes the data with respect to a normal distribution.
- to use a flow variable, check the box and type in the variable name using the format: FLOWVAR(variable name)
- use the R-plot template "QQ-Plot grid" to do the QQ-plots
######

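A minimal sketch of the test including the 5000-value cap that the template also applies; the input vector is invented:

# Shapiro-Wilk normality test with sub-sampling for large vectors
x <- rlnorm(20000)                           # clearly non-normal data

if (length(x) > 5000) x <- sample(x, 5000)   # shapiro.test() accepts at most 5000 values

result <- shapiro.test(x)
data.frame(p.value = result$p.value, W = result$statistic, n = length(x))
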
$$$TEMPLATE_DESC$$$ #1. Parameter selection #Select the parameter of interest 5000) { data <- sample(data, 5000) } # do the test and add the p-value to the plot result <- shapiro.test(data) allResults[[i]] <- data.frame(parameter = selReadouts[i], "p-value" = result$p.value, "log10(p-value)" = log10(result$p.value), "p-value.text" = format(result$p.value, scientific = TRUE, digits = 4), W = result$statistic, n = length(data)) } rOut <- data.frame(do.call("rbind", allResults)) ]]> ######################################################################################## # name: Normalization Carpenter # author: Marc Bickle # category: normalization Normalizes using a projection of the cell based data between 0 and 1 based on the 1st percentile and the 99th percentile ######

$$$TEMPLATE_DESC$$$

#1.0 Parameter selection
selReadouts = c()
Group = ;
Treatment = ;

########################################################################################
# name: Correlation of parameters
# author: Martin Stoeter
# category: relations

Calculates the correlation between all chosen parameters. For all selected parameters a pairwise correlation will be calculated (P1 vs P2, P1 vs P3, P2 vs P3) over the entire table (missing values will be removed/ignored). n is the number of data points in the correlation. The type of correlation can be selected.
######

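A minimal stand-alone sketch of the pairwise correlation loop (the template builds a comparable result table from kIn); the column names P1..P3 are invented:

# pairwise correlation of selected parameters over the whole table
df <- data.frame(P1 = rnorm(100), P2 = rnorm(100), P3 = rnorm(100))
params <- c("P1", "P2", "P3")
method <- "pearson"                          # or "spearman", "kendall"

pairs <- t(combn(params, 2))
results <- lapply(seq_len(nrow(pairs)), function(i) {
  p  <- pairs[i, ]
  ok <- complete.cases(df[, p])              # drop rows with missing values
  ct <- cor.test(df[ok, p[1]], df[ok, p[2]], method = method)
  data.frame(parameter1 = p[1], parameter2 = p[2],
             n = sum(ok), coefficient = unname(ct$estimate), method = ct$method)
})
do.call(rbind, results)
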
$$$TEMPLATE_DESC$$$ # 1. Parameter selection # 1) select parameters for replicate correlation paramName = c(); # 2) select correlation method corrFunctionMethod = ; AB, AC, BC) for (i in 1:(length(repVector)-1)) { for (j in (i+1):length(repVector)) { corrTable <- as.matrix(kIn[,c(repVector[i], repVector[j])]) #remove NA corrTable <- corrTable[which(!is.na(corrTable[,repVector[i]]) & !is.na(corrTable[,repVector[j]])),] #check if XYtable match is bigger than 1 sampleIDmatch <- length(corrTable) if (sampleIDmatch > 1) { #calculate correlation correlation <- cor.test(corrTable[,repVector[i]], corrTable[,repVector[j]], method = corrFunctionMethod) corrEstimate <- correlation$"estimate" corrMethod <- correlation$"method" } else { corrEstimate <- as.numeric(NA) corrMethod <- "not enough samples for correlation" sampleIDmatch <- 0 } #assemble result pearsonResult <- data.frame(parameter1 = repVector[i], parameter2 = repVector[j], n = sampleIDmatch, coefficient = corrEstimate, method = corrMethod) if (exists("pearsonResultTable")) { pearsonResultTable <- data.frame(rbind(pearsonResultTable, pearsonResult)) } else { pearsonResultTable <- pearsonResult } } } rOut <- pearsonResultTable ]]> ######################################################################################## # name: Correlation of multi-parametric profiles # author: Martin Stoeter # category: relations Calculates the correlation between multi-parameteric profiles. Select columns for "Group identifier" (e.g., gene symbol) and "Sample identifier" (e.g. siRNA-ID or oligo letter). Use the check boxes to define which combinations should be calculated. a) all combinations of groups: e.g. oligos of gene A will be correlated with oligos of gene B b) all combinations of samples: e.g. oligo A will be correlated with oligo B Be aware that if you do all combinations the snippet could take a while! For all pairwise combinations of the correlation of selected profile parameters will be calculated (missing values will be removed/ignored). n is the number of parameters in correlation. The type of correlation can be selected. ######

$$$TEMPLATE_DESC$$$ # 1. Parameter selection # 1) select parameters for replicate correlation paramName = c(); # 2) select column that contains sample ID IDgene = # 3) select column that contains group ID IDoligo = # 4) select correlation method corrFunctionMethod = ; 1) { correlation <- cor.test(corrTable[,1], corrTable[,2], method = corrFunctionMethod) corrEstimate <- correlation$"estimate" corrMethod <- correlation$"method" } else { corrEstimate <- as.numeric(NA) corrMethod <- "not enough values for correlation" sampleIDmatch <- as.numeric(NA) } } else { corrEstimate <- as.numeric(NA) corrMethod <- "not enough profiles for correlation" sampleIDmatch <- as.numeric(NA) } pearsonResult <- rbind(pearsonResult, data.frame(groupID1 = combVec[comb,1], sampleID1 = kInXtable[xOligo,IDoligo], groupID2 = combVec[comb,2], sampleID1 = kInYtable[yOligo,IDoligo], n = sampleIDmatch, coefficient = corrEstimate, method = corrMethod)) } } } } #now do correlation between groups (geneA vs geneB) if (allCombiGene){ combVec <- t(combn(geneVector,2)) for (comb in 1:length(combVec[,1])){ # extract data rows to correlate kInXtable <- kIn[which(as.character(kIn[,IDgene]) == combVec[comb,1]),c(IDgene, IDoligo, paramName)] kInYtable <- kIn[which(as.character(kIn[,IDgene]) == combVec[comb,2]),c(IDgene, IDoligo, paramName)] for (xOligo in 1:length(kInXtable)){ for (yOligo in 1:length(kInYtable)){ #correlate all olives or only the same olives (oligoA vs oligoA, B, ...) if (allCombiOligo || (kInXtable[xOligo,IDoligo] == kInYtable[yOligo,IDoligo])){ corrTable <- t(as.matrix(rbind(kInXtable[xOligo,paramName], kInYtable[yOligo,paramName]))) #remove NA corrTable <- corrTable[which(!is.na(corrTable[,1]) & !is.na(corrTable[,2])),] sampleIDmatch <- length(corrTable[,1]) if (sampleIDmatch > 1) { correlation <- cor.test(corrTable[,1], corrTable[,2], method = corrFunctionMethod) corrEstimate <- correlation$"estimate" corrMethod <- correlation$"method" } else { corrEstimate <- as.numeric(NA) corrMethod <- "not enough values for correlation" sampleIDmatch <- as.numeric(NA) } #assemble result pearsonResult <- rbind(pearsonResult, data.frame(groupID1 = combVec[comb,1], sampleID1 = kInXtable[xOligo,IDoligo], groupID2 = combVec[comb,2], sampleID1 = kInYtable[yOligo,IDoligo], n = sampleIDmatch, coefficient = corrEstimate, method = corrMethod)) } } } } } rOut <- pearsonResult ]]> ######################################################################################## # name: Correlation of replicates # author: Martin Stoeter # category: relations Calculates the correlation between replicates for all chosen parameters. The table must contain a column of replicate information. Then another column must contain the sample identifier, which must be unique within a replicate (e.g. platenumber::well, but not barcode!), AND the same between the replicates. For the corresponding sample identifiers the correlation will be calculated pairwise between the replicates (A vs B, A vs C, B vs C) for the selected parameters (missing values will be removed/ignored). n is the number of sample identifiers per replicate in correlation. The type of correlation can be selected. 
Option: R squared can also be calculated from a linear regression.
Troubleshooting:
a) "sampleID not unique" -> more than 1 sampleID (e.g. wells) per replicate
b) "not enough samples for correlation" -> sampleID for a replicate = 0 (no data to correlate)
HINT: use GroupBy [Unique concatenate with count(replicate)] on the selected sampleID column; the replicate count must be 1 for each sampleID
######

$$$TEMPLATE_DESC$$$ # 1. Parameter selection # 1) select parameters for replicate correlation paramName = c(); # 2) select column that contains replicate annotation repColName = ; # 3) select column that contains sample ID IDname = ; # 4) select correlation method corrFunctionMethod = ; # 4) select correlation method useRegression = ; AB, AC, BC) for (i in 1:(length(repVector)-1)) { #define first table (A): e.g. kInA <- kIn[(which(kIn$"replicate"=="A")),c("sampleID","replicate","AvgIntenCh1","MinDistCh1","AreaCh1","CellNumber")] kInXtable <- kIn[which(as.character(kIn[,repColName]) == repVector[i]),c(IDname, repColName, paramName)] for (j in (i+1):length(repVector)) { #define second table (B) kInYtable <- kIn[which(as.character(kIn[,repColName]) == repVector[j]),c(IDname, repColName, paramName)] for (paramIteration in paramName) { #check if sampleIDs are unique, NA are ignored if (length(unique(as.character(na.omit(kInXtable[,IDname])))) == length(as.character(na.omit(kInXtable[,IDname]))) & length(unique(as.character(na.omit(kInYtable[,IDname])))) == length(as.character(na.omit(kInYtable[,IDname])))) { corrTable <- merge(kInXtable[,c(IDname, repColName, paramIteration)], kInYtable[,c(IDname, repColName, paramIteration)], by.x = IDname, by.y =IDname, all = FALSE) #remove NA => now not needed anymore because merge(... all=FALSE) means inner join and row with NA in joining/merging column are removed if(corrFunctionMethod != "pearson") corrTable <- corrTable[which(!is.na(corrTable[,paste(paramIteration, "x", sep="."),]) & !is.na(corrTable[,paste(paramIteration, "y", sep="."),])),] #check if XYtable match is bigger than 1 sampleIDmatch <- length(corrTable[,1]) if (sampleIDmatch > 1) { #calculate correlation correlation <- cor.test(corrTable[,paste(paramIteration, "x", sep=".")], corrTable[,paste(paramIteration, "y", sep=".")], method = corrFunctionMethod) corrEstimate <- correlation$"estimate" corrMethod <- correlation$"method" if(corrFunctionMethod == "pearson") { corrDF <- correlation$"parameter" + 2 #DF = degree of freedom = number of XY-pairs without NAs } else { corrDF <- sampleIDmatch } #corrPvalue <- correlation$"p.value" #is allways 0 in pearson correlation!? 
if(useRegression) { regression <- lm(corrTable[,paste(paramIteration, "x", sep=".")] ~ corrTable[,paste(paramIteration, "y", sep=".")]) rSquared <- summary(regression)$r.squared } } else { corrEstimate <- as.numeric(NA) corrMethod <- "not enough samples for correlation" #corrPvalue <- as.numeric(NA) if(useRegression) rSquared <- as.numeric(NA) corrDF <- 0 } } else { corrEstimate <- as.numeric(NA) corrMethod <- "sampleID not unique" #corrPvalue <- as.numeric(NA) if(useRegression) rSquared <- as.numeric(NA) corrDF <- as.numeric(NA) } #assemble result if(useRegression) { pearsonResult <- data.frame(parameter = paramIteration, replicate1 = repVector[i], replicate2 = repVector[j], n = corrDF, coefficient = corrEstimate, method = corrMethod, linear.regression.R.squared = rSquared) #p.value = corrPvalue, } else { pearsonResult <- data.frame(parameter = paramIteration, replicate1 = repVector[i], replicate2 = repVector[j], n = corrDF, coefficient = corrEstimate, method = corrMethod) } if (exists("pearsonResultTable")) { pearsonResultTable <- data.frame(rbind(pearsonResultTable, pearsonResult)) } else { pearsonResultTable <- pearsonResult } } } } rOut <- pearsonResultTable ]]> ######################################################################################## # name: Create column name table from selection # author: Holger Brandl, Martin Stoeter # category: utilities # preview: CreateColumnNameTableFromSelection.png Creates a table with a single column that contains the selected column headers. - select or type name of column for column headers - preview shows what the snippet is doing instead of using KNIME nodes - use check box to create two more columns with additional tags (for looping barplot) - if the selecsted column headers haver already the tag ' (Mean)' from GroupBy node, then this tag can be removed before processing (generate column table from aggregated data) ######

$$$TEMPLATE_DESC$$$

#1.0 Parameter selection
# 1) select the columns
columnHeaders = c()
# 2) select column name
# 3) make columns for barplot looping

########################################################################################
# name: Handle and filter NaN and infinite values
# author: Martin Stoeter
# category: utilities

NaN are missing values that are created by some nodes (e.g. HCS tools - Normalization) and that are different from KNIME's missing values ("?", in R = "NA"). Infinite values are positive or negative infinite values that are created by some nodes (e.g. HCS tools - Normalization) by division by zero. NaN can be filtered by a Row Filter with the pattern "NaN" on a number column; however, this is tedious for many columns. Infinite values can be filtered by a Row Filter using very high or low values; however, this is tedious for many columns. This R snippet allows handling of NaN and infinite values in the input table.
- select the type of output:
1. All column names and the number of NaN and infinite values found (count all NaN and infinite values per column)
2a. Only column names that contain NaN (list of NaN columns)
2b. Only column names that contain NaN or infinite values (list of NaN and infinite columns)
3a. Input table without NaN columns (filter NaN columns out)
3b. Input table without NaN and infinite columns (filter NaN and infinite columns out)
4a. Input table with NaN replaced by NA (NA = KNIME-understandable missing values)
4b. Input table with NaN and infinite values replaced by NA (NA = KNIME-understandable missing values)
######

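A minimal sketch of output types 1 and 4b on an invented table with numeric columns only (the template covers all of the output types listed above):

# count NaN/Inf per column and replace them with NA (KNIME missing values)
df <- data.frame(a = c(1, NaN, Inf, 4), b = c(0, -Inf, 2, NaN))

nanCount <- sapply(df, function(x) sum(is.nan(x)))
infCount <- sapply(df, function(x) sum(is.infinite(x)))
data.frame(column = names(df), numberOfNaN = nanCount, numberOfInfinite = infCount)

# replace NaN and infinite values by NA
df[] <- lapply(df, function(x) { x[is.nan(x) | is.infinite(x)] <- NA; x })
df
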
$$$TEMPLATE_DESC$$$ #1.0 Parameter selection # 2) select column name typeOfOutput = ; 0)] } #return the input table without column containing NaN or infinite if (typeOfOutput == "Input table without NaN or infinite columns") { NaNresult <- kIn[,which((!NaNresult[,2]>0) | (!NaNresult[,3]>0))] } #returns only the column names that contain NaN values if (typeOfOutput == "Only column names that contain NaN") { NaNresult <- NaNresult[which(NaNresult[,2]>0),] } #returns only the column names that contain NaN or infinite values if (typeOfOutput == "Only column names that contain NaN or infinite") { NaNresult <- NaNresult[which((NaNresult[,2]>0) | (NaNresult[,3]>0)),] } #replace NaN with NA and turn complete table if (typeOfOutput == "Input table with NaN replaced by NA") { for (i in 1:(length(kIn[1,]))) { if (NaNresult[i,"numberOfNaN"] > 0) { kIn[which(is.nan(kIn[,i])),i] <- NA } } NaNresult <- kIn } #replace NaN and infinite with NA and turn complete table if (typeOfOutput == "Input table with NaN and infinite replaced by NA") { for (i in 1:(length(kIn[1,]))) { if ((NaNresult[i,"numberOfNaN"] > 0) || (NaNresult[i,"numberOfInfinite"] > 0)) { kIn[which((is.nan(kIn[,i])) | (is.infinite(kIn[,i]))),i] <- NA } } NaNresult <- kIn } rOut<- NaNresult ]]> ######################################################################################## # name: Reference Column Filter (2:1) # author: Martin Stoeter # category: utilities # preview: ReferenceColumnFilter.png Filters the unselected columns of input port 1 (columns to be filtered) using a reference column of input port 2. - select columns to be kept in input 1 (filter is not applied) - select the column in input 2 that contains column headers for reference filter - select way of filtering (filter in = keep what is in reference, filter out = drop what is in reference) Preview shows what the snippet is doing instead of using KNIME nodes NEEDS R snippet (2:1) !!! ######

$$$TEMPLATE_DESC$$$ #1.0 Parameter selection # 1) select columns that should be kept keepColumns = c(); # 2) select column that contains replicate annotation refColumnName = ; # 3) type of filter filter = ; ####################################################################################### # name: Get info from R-server (installed packages, R version, system info) # author: Martin Stoeter # category: utilities # preview: This snippet lists the version and packages installed on your R-server. a) installed packages and their versions (installed.packages()) b) libraries/packages that can be loaded (library()) c) base packages (sessionInfo()["basePkgs"]) d) R version and platform (sessionInfo()[1]) e) other packages installed (not always present) f) system information (Sys.info()) ######

$$$TEMPLATE_DESC$$$ #1.0 Parameter selection # 1.1 select type of transformation getInfo = ######################################################################################## # name: Reshape to long format # category: utilities Creates a table with a single measurement column and several id columns. ###### library(reshape) rOut <- melt(kIn) ######################################################################################## # name: Convert numbers to defined text/string # author: Antje Niederlein, Martin Stoeter # category: utilities # preview: Converts selected column to a specified number format - select column to convert or define flow variable using the format: FLOWVAR(variable name) - define digits before and after digit point - if minus is checked and negative values are present, an additional digit will be added to make positive and negative values the same length (for sorting good!) - if automatic is checked, then number of digits before and after will be calculated from the data - example: [3,2,minus=true]: 5.5 -> 0005.50, -012.125 -> -012.13; [2,3,minus=false] -5.0 -> -05.000, 1234.5678 -> 1234.5678 ######

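A minimal sketch of the fixed-width formatting idea using formatC(), which the template also builds on; the widths and example values are invented:

# fixed-width, zero-padded number strings (sortable as text)
x <- c(5.5, -12.125, 0.75)

# 2 decimal places, total width 7; the "0" flag zero-pads after the sign
formatC(x, format = "f", digits = 2, width = 7, flag = "0")
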
$$$TEMPLATE_DESC$$$ #1.0 Parameter selection # 1) select the columns columnHeader = # 2) select the format 0) decimal <- TRUE width <- leadingWidth + decimalWidth if(decimal) width <- width + 1 if(minus) width <- width + 1 outChar <- "" if(decimal) { if(minus) outChar <- formatC(x, format = "f", digits = decimalWidth, width = width, flag = "00") else outChar <- formatC(x, format = "f", digits = decimalWidth, width = width, flag = "0") } else { if(minus) outChar <- formatC(x, format = "d", width = width, flag = "00") else outChar <- formatC(x, format = "d", width = width, flag = "0") } outChar } lWidth = 0 dWidth = 0 minus <- FALSE if(minusDigits && min(numVec) < 0) minus <- TRUE if(autoNumberDigits) { lWidth <- nchar(as.character(floor(max(abs(numVec))))) splitDec <- strsplit(as.character(numVec),".", fixed = TRUE) decPart <- sapply(splitDec[sapply(splitDec, length) > 1], "[[", 2) dWidth <- max(nchar(decPart)) } else { lWidth = digitVector[1] dWidth = digitVector[2] } charVec <- sapply(numVec, sortableNumberString, minus, lWidth, dWidth) if(appendColumn) { kIn <- cbind(kIn,charVec) names(kIn)[ncol(kIn)] <- paste(columnHeader, newColumnSuffix, sep = "") } else { kIn[,columnHeader] <- charVec } #sort(charVec) rOut <- kIn ]]> ###################################################################################### # name: Z rank transformation (beta) # author: Martin Stoeter # category: distributions # preview: This rank-preserving data transformation is a pre-processing technique to stabilize the variance and to make the data more normal distribution-like. The Z rank transformation (project_to_Gaussian) projects the data to a random normal distribution. For each group and each parameter the values are transformed independently. - select type of normalization - select parameters that should be rank normalized - select a group column that should be used for ranking (e.g. barcode, date, project name,...) If subset data is chosen as reference, only this subset data is used to calculate the lambda for the transformation (either a) or b); not possible in Z rank!). a) select a column and give name of subset data used as reference b) select subset of column 'treatment' to be used as reference (needs a column with name 'treatment') ######

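For illustration, a strongly simplified sketch of the rank-to-Gaussian idea using normal quantiles via qnorm(); note that the template itself samples an oversized random normal distribution and handles groups and reference subsets, so this is not its implementation:

# rank-preserving projection of one readout onto a standard normal distribution
x <- rexp(1000)                                   # skewed input data

r <- rank(x, ties.method = "average")
z <- qnorm((r - 0.5) / length(x))                 # map ranks to normal quantiles

# the order of the values is preserved, but the distribution is now ~N(0, 1)
cor(x, z, method = "spearman")                    # equals 1
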
$$$TEMPLATE_DESC$$$ #1.0 Parameter selection # 1.1 select type of transformation transformation = ; # 1.2 select the columns selectedParams = match(c(), names(kIn)); # 1.3 select group groupColumnName = # 1.4 select subset subsetOption = subsetColumn = subsetTreatment = % mutate(rowID = row.names(kIn)) #table <- table %>% filter(Metadata_barcode == "060AZ190218A-man-fixed") rOut <- table aggregatedGroupData <- head(table,0) randomDataTable <- NULL # data frame to store random normal distributions calculateTransformationModel <- TRUE debugAndPlot <- FALSE # set this TRUE for visualization and debugging storeRandomDistributionTable <- FALSE if (transformation == "project_to_Gaussian") { #calc z rank normalization per group for (group in 1:(length(allGroups))) { # ================================= for each group, (get subset data,) and calculate & store randome distribution ================ # get name of group curGroup <- allGroups[group] groupData <- table %>% filter(get(groupColumnName) == curGroup) # z rank normalization can be calculated on complete data set (still for each group independently, here = ELSE), or based on a subset of the group (e.g. untreated) and then applied on the complete data set of the group (here = IF) groupDataModel <- groupData %>% filter(0 > 1) # initialize table with 0 rows if(subsetOption == "a) subset column") { groupDataModel <- groupData %>% filter(get(subsetColumn) == subsetName) # get the subset table to calculate the normalization / transformation model data doApplyOnNormalDistribution <- TRUE } if(subsetOption == "b) subset of column 'treatment'" && subsetTreatment != "-NOT USED-") { subsetColumn <- "treatment" groupDataModel <- groupData %>% filter(get(subsetColumn) == subsetTreatment) # get the subset table to calculate the normalization / transformation model data doApplyOnNormalDistribution <- TRUE } if (subsetOption == "-NO SUBSET - USE ALL DATA IN GROUP!-" | nrow(groupDataModel) == 0) { groupDataModel <- groupData doApplyOnNormalDistribution <- FALSE } # generate data set with random normal distribution that is 100 bigger than the group data set (100 => less random sampling errors) nValues <- nrow(groupDataModel) # length of model data vector normData <- sort(rnorm(nValues*100, mean = 0, sd = 1)) # random normal distribution is sampled 100-fold more to avoid effects due to random data # store random distribution in table to export to output and use as model later... if (storeRandomDistributionTable) { if (is.null(randomDataTable)) randomDataTable <- data.frame(cbind(normData)) else { if (nrow(randomDataTable) > length(normData)) randomDataTable <- data.frame(cbind(randomDataTable, c(normData, rep(NA,nrow(randomDataTable)-length(normData))))) else { for (fillNA in (nrow(randomDataTable)+1):length(normData)) randomDataTable <- data.frame(rbind(randomDataTable, rep(NA, ncol(randomDataTable)))) randomDataTable <- data.frame(cbind(randomDataTable, normData)) } } names(randomDataTable)[group] <- levels(curGroup) } # end of if (storeRandomDistributionTable) # ================================= for each group, (get subset data,) and calculate & store randome distribution ================ # iterate through all selected parameter for (param in 1:(length(selectedParams))) { paramName <- colnames(groupDataModel)[selectedParams[param]] # get current parameter name groupData$normData <- NA # make column for transformed data and set all values to NA for now if(calculateTransformationModel) { # xxx is this if variable neccessary? 
rawValues <- pull(groupDataModel, paramName) rankVals <- as.integer(rank(rawValues)) transformedData <- normData[rankVals*100-50] temp <- data.frame(rowID = groupDataModel, values = rawValues, rank = rankVals, normData = transformedData) # xxx is this needed? } # if if (doApplyOnNormalDistribution) { groupData <- groupData %>% arrange_(paramName) rawValuesGroupDataSorted <- pull(groupData, paramName) i = 1 # varible for iteration through transformation model data j = 1 # varible for iteration through group data rawValuesSorted <- sort(rawValues) # xxx again sorting? yes, but same variable could be used... transformedData <- sort(transformedData) # xxx again sorting? #rawValuesSorted <- rawValuesSorted[1:5] intvervalLow <- rawValuesSorted[i] intvervalHigh <- rawValuesSorted[i + 1] intervalHighLow <- intvervalHigh - intvervalLow intervalNormData <- normData[(i+1)*100-50]-normData[i*100-50] intervalTransformedData <- transformedData[i + 1] - transformedData[i] if (debugAndPlot) { # initialize plot for debugging with first data point plot(x = rawValuesSorted[1], y = transformedData[1], ylim = c(-5,5), xlim = c(rawValuesGroupDataSorted[2],rawValuesGroupDataSorted[length(rawValuesGroupDataSorted)])) # first data: c(rawValuesGroupDataSorted[2],rawValuesGroupDataSorted[20]) ; end of the data: c(rawValuesGroupDataSorted[length(rawValuesGroupDataSorted)-20],rawValuesGroupDataSorted[length(rawValuesGroupDataSorted)]) abline(v = intvervalLow, lty = 2, col = "grey") } # debug # find data points that are lower smaller than model data of subset while (!intvervalLow <= rawValuesGroupDataSorted[j]) j = j + 1 if (j > 1) { # apply linear model on this data xData <- rawValuesSorted[1:(length(rawValuesSorted)*0.05)] # get the lowest 5% of the data for fitting a linear model yData <- transformedData[1:(length(rawValuesSorted)*0.05)] linearModelLow <- lm(yData ~ xData) differenceLastModelDataPoint <- predict.lm(linearModelLow, data.frame(xData = rawValuesGroupDataSorted[j-1])) - transformedData[1] # to ensure continuity / steadiness of the data a shift correction of the fitted data and the projected data is neccessary groupData$normData[1:(j-1)] <- predict.lm(linearModelLow, data.frame(xData = rawValuesGroupDataSorted[1:(j-1)])) - differenceLastModelDataPoint if (debugAndPlot) { points(x = xData, y = yData, col = "lightgrey") # plot the modelled data points(y = groupData$normData[1:(j-1)], x = rawValuesGroupDataSorted[1:(j-1)], col = "darkgrey") # plot fitted points; j-1 because value of j ist now already bigger/equal intervalLow #print(as.numeric(c(i,j,groupData$normData[1:(j-1)],rawValuesGroupDataSorted[1:(j-1)]))) #print(rawValuesSorted[1:10]) #print(transformedData[1:10]) abline(linearModelLow, col="darkgrey") # plot line to show linear model } # debug } # apply linear model #print(as.numeric(c( intvervalLow, intvervalHigh,intervalHighLow, intervalNormData, intervalTransformedData))) #print(as.numeric(c(i, intvervalLow, intvervalHigh, intervalHighLow, intervalNormData))) while (i < length(rawValuesSorted) && j <= length(rawValuesGroupDataSorted)) { if(intvervalLow <= rawValuesGroupDataSorted[j] && rawValuesGroupDataSorted[j] <= intvervalHigh) { groupData$normData[j] <- as.numeric(normData[i*100-50] + intervalNormData * (rawValuesGroupDataSorted[j] - intvervalLow) / intervalHighLow) if (debugAndPlot) { points(y = groupData$normData[j],x = rawValuesGroupDataSorted[j], col = (i %% 5) + 1) #if (i < 20) print(as.numeric(c(i,j,groupData$normData[j],rawValuesGroupDataSorted[j]))) # print first 20 #if (i > 
length(rawValuesSorted)-20) print(as.numeric(c(i,j,groupData$normData[j],rawValuesGroupDataSorted[j]))) # print last 20 } j = j + 1 } else { i = i + 1 if (i < length(rawValuesSorted)) { intvervalLow <- rawValuesSorted[i] intvervalHigh <- rawValuesSorted[i + 1] intervalHighLow <- intvervalHigh - intvervalLow intervalNormData <- normData[(i+1)*100-50] - normData[i*100-50] if (debugAndPlot) { abline(v = c(intvervalLow), lty = 2, col = (i %% 5) + 1) #if (i < 20) print(as.numeric(c( intvervalLow, intvervalHigh,intervalHighLow))) # print first 20 #if (i > length(rawValuesSorted)-20) print(as.numeric(c( intvervalLow, intvervalHigh,intervalHighLow))) # print last 20 #print(as.numeric(c(i, intvervalLow, intvervalHigh, intervalHighLow, intervalNormData))) } # debug } # if } # if else } # while if (debugAndPlot) abline(v = c(intvervalHigh), lty = 2, col = (i %% 5) + 1) # plot last data point / line of modelled subset data #print(as.numeric(c(i,j, length(rawValuesSorted), length(rawValuesGroupDataSorted)))) # apply linear model on data that is larger than model data of subset if (i >= length(rawValuesSorted) && j <= length(rawValuesGroupDataSorted)) { xData <- rawValuesSorted[(length(rawValuesSorted)*0.95):length(rawValuesSorted)] # get the highest 5% of the data for fitting a linear model yData <- transformedData[(length(rawValuesSorted)*0.95):length(rawValuesSorted)] #print("done") linearModelHigh <- lm(yData ~ xData) differenceLastModelDataPoint <- predict.lm(linearModelHigh, data.frame(xData = rawValuesGroupDataSorted[j])) - transformedData[i] # to ensure continuity / steadiness of the data a shift correction of the fitted data and the projected data is neccessary groupData$normData[(j):length(rawValuesGroupDataSorted)] <- predict.lm(linearModelHigh, data.frame(xData = rawValuesGroupDataSorted[(j):length(rawValuesGroupDataSorted)])) - differenceLastModelDataPoint if (debugAndPlot) { #print(tail(rawValuesSorted)) #print(tail(transformedData)) points(x = xData, y = yData, col = "lightgrey") # plot the modelled data points(y = groupData$normData[(j):length(rawValuesGroupDataSorted)], x = rawValuesGroupDataSorted[(j):length(rawValuesGroupDataSorted)], col = "darkgrey") # plot fitted points; j-1 because value of j ist now already bigger/equal intervalLow #print(as.numeric(c(i,j,groupData$normData[(j):length(rawValuesGroupDataSorted)],rawValuesGroupDataSorted[(j):length(rawValuesGroupDataSorted)]))) abline(linearModelHigh, col="darkgrey") # plot line to show linear model } # debug } # if } else { # if (doApplyOnNormalDistribution) groupData$normData <- transformedData # if groupDataModel = groupData, then just put transformed data into table } names(groupData)[ncol(groupData)] <- paste(paramName, '.zrank', sep = "") #project_to_Gaussian, rename current column name to .zrank } # for (param in 1:(length(selectedParams))) aggregatedGroupData <- bind_rows(aggregatedGroupData, groupData) } # for (group in 1:(length(allGroups))) rOut <- rOut %>% left_join(select(aggregatedGroupData, c("rowID",tail(names(aggregatedGroupData),length(selectedParams)))), by="rowID") %>% select(-one_of(c("rowID"))) rOut <- as.data.frame(rOut) row.names(rOut) <- row.names(kIn) } # if (transformation == "project_to_Gaussian") #print("done all loops") if (storeRandomDistributionTable) { randomDataTable <- cbind(randomDataTable, rank = row.names(randomDataTable)) rSave <- rOut if(nrow(rOut) < nrow(randomDataTable)) { rOut <- cbind(rOut, rowNames = row.names(rOut)) for (fillNA in (nrow(rOut)+1):nrow(randomDataTable)) rOut <- 
data.frame(rbind(rOut, rep(NA, ncol(rOut)))) rOut <- data.frame(cbind(rOut, randomDataTable)) } } ]]> ###################################################################################### # name: Uniform Manifold Approximation and Projection (UMAP) # author: Martin Stoeter # category: dimension_reduction # preview: Uniform Manifold Approximation and Projection (UMAP) is an algorithm for dimensional reduction. Originally describer here: https://arxiv.org/abs/1802.03426 This template uses R package 'umap'. More details here: https://cran.r-project.org/web/packages/umap/vignettes/umap.html ######

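A minimal sketch using the CRAN 'umap' package mentioned above; the configuration values are illustrative assumptions, not the template's defaults:

# dimensional reduction with the 'umap' package
library(umap)

data(iris)
features <- iris[, 1:4]

config <- umap.defaults
config$n_neighbors <- 15          # typical tuning parameters (assumed values)
config$min_dist    <- 0.1

embedding <- umap(features, config = config)
head(embedding$layout)            # two-dimensional coordinates per row
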
$$$TEMPLATE_DESC$$$

# 1. Parameter selection
# 1) select parameters for UMAP calculation
paramName = c();
# 2) UMAP configuration

########################################################################################
# name: Find Peaks Plot-Table
# author: Martin Stoeter
# category: analysis
# preview: Find_peaks_plot.png

Analyses a sequence of data points and detects peaks therein. The plot version visualizes the detected peaks; the snippet version only outputs the peak table.
######

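A minimal sketch of peak detection with pracma::findpeaks() on an invented signal (the template adds the onset/offset detection and the plotting options described below):

# peak detection on a simple signal
library(pracma)

x <- seq(0, 10, by = 0.1)
signal <- sin(x) + 0.1 * rnorm(length(x))

peaks <- findpeaks(signal, minpeakheight = 0.5, minpeakdistance = 10, sortstr = TRUE)
# one row per peak: height, index of the maximum, start index, end index
peaks
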
Custom template to create a line plot with peak detection https://www.rdocumentation.org/packages/pracma/versions/1.9.9/topics/findpeaks https://www.mathworks.com/help/signal/ref/findpeaks.html#bufbbs1-2 Arguments 1) x : numerical vector taken as a time series 2) nups : minimum number of increasing steps before a peak is reached 3) ndowns : minimum number of decreasing steps after the peak 4) zeros : can be "+", "-", or "0"; how to interprete succeeding steps of the same value: increasing, decreasing, or special 5) peakpat : define a peak as a regular pattern, such as the default pattern "[+]1,[-]1,"; if a pattern is provided, the parameters nups and ndowns are not taken into account 6) minpeakheight : the minimum (absolute) height a peak has to have to be recognized as such 7) minpeakdistance : the minimum distance (in indices) peaks have to have to be counted 8) threshold : the minimum height difference between a peak and its neighbors. Use this argument to have findpeaks return only those peaks that exceed their immediate neighboring values by at least the value of "threshold" 9) npeaks : the number of peaks to return 10) sortstr : logical; should the peaks be returned sorted in decreasing oreder of their maximum value Returns a matrix where each row represents one peak found. The first column gives the height, the second the position/index where the maximum is reached, the third and forth the indices of where the peak begins and ends --- in the sense of where the pattern starts and ends. Parameter: input data/signal - date series to be used for peak detection time series - column with time values for series; -USE INDEX- will return the index values (1 to end of series) signal peak threshold - minimum peak height threshold delta slope threshold - minimum difference threshold in slope detection (within chosen distance, onset and offset) delta slope distance - distance value (index) in slope between the difference is calculated To use a flow variable check box and type in variable names using the format: FLOWVAR(variable name) Layout Options: - plot points or lines or both - set size of points (default = [0.7,2.0,1.5,1.2]) - give plot title and title factor (default = 1.2) - adjust y-axis scale (enter min + max for y-axis) #1.0 Parameter selection #R debugging: delete all variables except kIn #rm(list=ls(all=TRUE)[which(ls(all=TRUE) != "kIn")]) # Define your numerical parameters for input signal signalValueColumn = # Define your numerical parameters for time series timeColumn = output = "table" # delete this line for snippet template, to diaable GUI choise # Define peak detection parameters # adjustment parameter, minimum peak height threshold and lower threshold minPeakHeight = as.numeric() threshold = as.numeric() # adjustment parameter, minimum difference in slope (within chosen distance) minDeltaSlope = as.numeric() # More/advanced peak definition parameters # adjustment parameter, distance value (index) in slope between the difference is calculated distanceDeltaSlope = as.numeric() # adjustment parameter, minimum number of increasing/decreasing steps before/after a peak is reached nUpsDowns = as.numeric() # adjustment parameter, the minimum (absolute) height / distance (indices) peaks has to have to be recognized minPeakDistance = as.numeric() #diagram options plotPointsLines = pointPointSizes = as.numeric() plotTitleSize = as.numeric(c()) = minDeltaSlope)) # now start moving left while (currentSlopeDelta >= minDeltaSlope & currentSlopePosition > 1) { currentSlopePosition = 
currentSlopePosition - 1 currentSlopeDelta <- pSignal[currentSlopePosition + distanceDeltaSlope] - pSignal[currentSlopePosition] #print(c(i, currentPeakPosition, currentSlopePosition, pSignal[currentSlopePosition + distanceDeltaSlope], pSignal[currentSlopePosition], currentSlopeDelta, currentSlopeDelta > minDeltaSlope)) } peakOnset[i] <- currentSlopePosition + 1 # this vector will contain all peak onset indices #print(peakOnset) # find peak offset and move from peak maximum rightwards (higher index) until currentSlopeDelta is smaller than distanceDeltaSlope currentSlopePosition <- currentPeakPosition + distanceDeltaSlope # currentSlopePosition is positon that is increased and checked iteratively if(currentSlopePosition >= length(pSignal)) currentSlopePosition = length(pSignal) # if peak is at right edge of signal currentSlopeDelta <- pSignal[currentPeakPosition] - pSignal[currentSlopePosition] #print(c(i, currentPeakPosition, currentSlopePosition, pSignal[currentPeakPosition], pSignal[currentSlopePosition], currentSlopeDelta, currentSlopeDelta >= minDeltaSlope)) # now start moving right while (currentSlopeDelta >= minDeltaSlope & currentSlopePosition < length(pSignal)) { currentSlopePosition = currentSlopePosition + 1 currentSlopeDelta <- pSignal[currentSlopePosition - distanceDeltaSlope] - pSignal[currentSlopePosition] #print(c(i, currentPeakPosition, currentSlopePosition, pSignal[currentSlopePosition - distanceDeltaSlope], pSignal[currentSlopePosition], currentSlopeDelta, currentSlopeDelta > minDeltaSlope)) } peakOffset[i] <- currentSlopePosition - 1 # this vector will contain all peak offset indices #print(peakOffset) } } # this is the final index-based result table peakTable <- cbind(peakTable, onset = peakOnset, offset = peakOffset) # make rOut data frame for returing data to KNIME rOut <- data.frame(peakTable) # replace index values (position start end onset offset) with real (time series) values from input table if (timeColumn != "-USE INDEX-") { if (rOut[1,"position"] != 0) { for (col in 2:6) rOut[, col] <- kIn[rOut[, col], timeColumn] } } #2.4.1 layout plot # plot signal values if (output == "plot") { # if this is plot template then true, if this is snippet template then false if (plotPointsLines == "points_and_lines") plotPointsLines <- "o" if (plotPointsLines == "points_only") plotPointsLines <- "p" if (plotPointsLines == "lines_only") plotPointsLines <- "l" plot(x = xValues, y = pSignal, type=plotPointsLines, col="grey40", cex = pointPointSizes[1], ylim = yAxisScale, main = plotTitle, xlab = xAxisTitle, ylab = signalValueColumn, cex.main = plotTitleSize) grid() #2.4.2 plot data # plot peaks, start, end and values between onset and offset if (peakTable[1,"position"] != 0) { for(i in 1:nrow(peakTable)) { points(kIn[peakTable[i,"onset"]:peakTable[i,"offset"], "xValues"], kIn[peakTable[i,"onset"]:peakTable[i,"offset"], signalValueColumn], pch=20, col="black", cex = pointPointSizes[4]) } points(kIn[peakTable[,"position"], "xValues"], kIn[peakTable[,"position"], signalValueColumn], pch=24, col="red", cex = pointPointSizes[2]) points(kIn[peakTable[, "start"], "xValues"], kIn[peakTable[, "start"], signalValueColumn], pch=22, col="darkorange", cex = pointPointSizes[3]) points(kIn[peakTable[, "end"], "xValues"], kIn[peakTable[, "end"], signalValueColumn], pch=23, col="darkred", cex = pointPointSizes[3]) } #2.4.3 plot options # adds horizontal lines if (plotThreshold) abline(h = threshold, lty = 2) if (plotMinPeakHeight) abline(h = minPeakHeight, lty = 3) #abline(h = hlines[2], lwd = 1.5) 
} # end plot data # debug #rOut #peakTable #findpeaks(pSignal, threshold=threshold, sortstr=TRUE) ]]>