#### Solutions: practical about data exploration #### ## Data loading ## We now dispose of the following tables, corresponding to the different steps of the pre-processing # - rawValues: raw values # - log2Values: log2-transformed values # - log2Filtered: filtered out the undetected features # - log2MedianCentered: median-based sample-wise centering # - log2Standardized: median-based centering and IQR-based scaling ## We will have to choose the appropriate data types depending on our purposes ## Choose one data type and copy it to a variable named x x <- log2Filtered ## Check the dimensions dim(x) names(x) head(x) # Histogram hist(unlist(log2Filtered), breaks=100) ## If this si fine you can go to the next sections of this script #### Compute feature-wise descriptive statistics ##### ## Compute the mean of each feature meanPerFt <- apply(X = x, MARGIN = 1, FUN = mean) length(meanPerFt) class(meanPerFt) head(meanPerFt) hist(meanPerFt, breaks = 100, main = paste(parameters$datatype, parameters$dataset, "\n", "mean per feature"), xlab = "log2(values)") varPerFt <- apply(X = x, MARGIN = 1, FUN = var) ## Mean-variance plot plot(meanPerFt, varPerFt) ## Create a data frame with one row per feature and one column per statistics featureStat <- data.frame( mean = apply(X = x, MARGIN = 1, FUN = mean), var = apply(X = x, MARGIN = 1, FUN = var) ) featureStat$sd <- apply(X = x, MARGIN = 1, FUN = sd) dim(featureStat) names(featureStat) head(featureStat) ## A vous de jouer : add columns to featureStat with the different statistics ## IQR(), min(), p05, p25, median(), p75, p95, max() ## Tip: compute the percentile with function quantile() ## I start again, in order to get the features in a meaningful order ## Estimator of central tendency featureStat <- data.frame( mean = apply(X = x, MARGIN = 1, FUN = mean), median = apply(X = x, MARGIN = 1, FUN = median) ) ## Estimators of dispersion featureStat$sd <- apply(X = x, MARGIN = 1, FUN = sd) featureStat$IQR <- apply(X = x, MARGIN = 1, FUN = IQR) featureStat$var <- apply(X = x, MARGIN = 1, FUN = var) ## Milestones percentiles featureStat$min <- apply(X = x, MARGIN = 1, FUN = min) featureStat$p05 <- apply(X = x, MARGIN = 1, FUN = quantile, prob = 0.05) featureStat$p25 <- apply(X = x, MARGIN = 1, FUN = quantile, prob = 0.25) featureStat$p75 <- apply(X = x, MARGIN = 1, FUN = quantile, prob = 0.75) featureStat$p90 <- apply(X = x, MARGIN = 1, FUN = quantile, prob = 0.90) featureStat$max <- apply(X = x, MARGIN = 1, FUN = max) ## Some representative plots xmin <- floor(min(x)) # compute the limits of all the values xmax <- ceiling(max(x)) par(mfrow = c(3,2)) hist(featureStat$mean, breaks = 100, main = paste(parameters$datatype, parameters$dataset, "\n", "mean"), xlab = "log2(value)", xlim = c(xmin, xmax)) hist(featureStat$median, breaks = 100, main = "median", xlab = "log2(value)", xlim = c(xmin, xmax)) hist(featureStat$min, breaks = 100, main = "min", xlab = "log2(value)", xlim = c(xmin, xmax)) hist(featureStat$max, breaks = 100, main = "max", xlab = "log2(value)", xlim = c(xmin, xmax)) hist(featureStat$sd, breaks = 100, main = "sd", xlab = "log2(value)") hist(featureStat$var, breaks = 100, main = "var", xlab = "log2(value)") par(mfrow = c(1,1)) #### Mean versus median plot #### ## As simple as possible plot(featureStat$mean, featureStat$median) ## A bit more readable ## We use the function densCols to color features according to the local density. plot(x = featureStat$mean, y = featureStat$median, las = 1, col = densCols(x = featureStat$mean, y = featureStat$median), main = paste(parameters$datatype, parameters$dataset, "\n", "Mean versus median"), xlab = "mean", ylab = "median") grid() abline(a = 0, b = 1) #### Mean versus variance plot #### plot(x = featureStat$mean, y = featureStat$var, las = 1, col = densCols(x = featureStat$mean, y = featureStat$var), main = paste(parameters$datatype, parameters$dataset, "\n", "Mean versus var"), xlab = "mean", ylab = "var") grid() #### Mean versus standard deviation plot #### plot(x = featureStat$mean, y = featureStat$sd, las = 1, col = densCols(x = featureStat$mean, y = featureStat$sd), main = paste(parameters$datatype, parameters$dataset, "\n", "Mean versus standard deviation"), xlab = "mean", ylab = "sd") grid() #### Standard deviation versus variance plot #### ## This is trivial: the SD is the square root of the variance plot(x = featureStat$sd, y = featureStat$var, las = 1, col = densCols(x = featureStat$sd, y = featureStat$var), main = paste(parameters$datatype, parameters$dataset, "\n", "Standard deviation versus var"), xlab = "sd", ylab = "var") grid() #### Standard deviation versus IQR plot #### plot(x = featureStat$sd, y = featureStat$IQR, las = 1, col = densCols(x = featureStat$sd, y = featureStat$IQR), main = paste(parameters$datatype, parameters$dataset, "\n", "Standard deviation versus IQR"), xlab = "sd", ylab = "IQR") grid() ## Note: the IQR of a normal distribution is ~1.35 whereas the standard dev is 1 normIQR <- qnorm(p = 0.75) - qnorm(p = 0.25) message("Standard normal IQR = ", normIQR) abline(a = 0, b = normIQR) ## Check what it gives with normal data z <- data.frame(matrix(nrow = 8000, ncol = 10, rnorm(800000))) zSD <- apply(z, 1, sd) zIQR <- apply(z, 1, IQR) #### Standard deviation versus IQR plot #### plot(x = zSD, y = zIQR, las = 1, col = densCols(x = zSD, y = zIQR), main = paste("rnorm data", "\n", "Standard deviation versus IQR"), xlab = "sd", ylab = "IQR") grid() abline(a = 0, b = normIQR) points(median(zSD), median(zIQR)) ## TO DO # - export featureStats in a tab-separate value file (.tsv) # - display the feature stat table with kable()