# ANOVA # # Calculates averages and fit several ANOVA models. # # References: # https://stats.stackexchange.com/questions/76815/multiple-regression-or-partial-correlation-coefficient-and-relations-between-th # https://stats.stackexchange.com/questions/32464/how-does-the-correlation-coefficient-differ-from-regression-slope # # Usage: # # Rscript anova_tests.R input.RData output.csv library(broom) # library(ppcor) # library for partial correlation coefs # Options ----------------------------------------------------------------- args <- commandArgs(trailingOnly = TRUE) # for kidney # args <- list("/projects/korstanje-lab/ytakemon/JAC_DO_Kidney/RNAseq_data/DO188b_kidney_noprobs.RData", "/projects/korstanje-lab/ytakemon/JAC_DO_Kidney/Anova_output/kidney_anova_table.csv") # # two arguments expected (input and output files) stopifnot(length(args)==2) input.file = args[[1]] stopifnot(file.exists(input.file)) output.file = args[[2]] load(input.file) # Averages per group ------------------------------------------------------- ## calculates mean expression for each group ## (for example mRNA average expression of Age==6 samples) averages.mrna <- NULL vars <- c("Age", "Sex", "Generation") for (v in vars) for (u in sort(unique(annot.samples[,v]))) { sel <- annot.samples[,v] == u new.average <- colMeans(expr.mrna[sel,1:N[["pairs"]]], na.rm=TRUE) averages.mrna <- cbind(averages.mrna, new.average) colnames(averages.mrna)[ncol(averages.mrna)] <- paste("average", "mRNA", v, u, sep=".") } averages.protein <- NULL vars <- c("Age", "Sex", "Generation") for (v in vars) for (u in sort(unique(annot.samples[,v]))) { sel <- annot.samples[,v] == u new.average <- colMeans(expr.protein[sel,1:N[["pairs"]]], na.rm=TRUE) averages.protein <- cbind(averages.protein, new.average) colnames(averages.protein)[ncol(averages.protein)] <- paste("average", "Prot", v, u, sep=".") } # ANOVA (p-values, normalized coefs, and slopes) ----------------------------------- ## test for dependence between Age/Sex and x (x is mRNA/Prot expression) anova_tests_2 <- function(x) { # full model without interaction tmp.full <- lm(x ~ Age + Sex + Generation, data=annot.samples) lm.full <- tidy(tmp.full) pvalues <- c(subset(lm.full, term=="Age")$p.value, subset(lm.full, term=="Sex")$p.value) # partial correlation coefficient equals normalized beta * CONST # see http://stats.stackexchange.com/questions/76815/multiple-regression-or-partial-correlation-coefficient-and-relations-between-th pres <- !is.na(x) # must be calculated on the same observations tmp.age <- lm(Age ~ Sex + Generation, data=annot.samples[pres,]) tmp.sex <- lm(Sex ~ Age + Generation, data=annot.samples[pres,]) sigma.full <- sd(tmp.full$resid, na.rm = TRUE) sigma.age <- sd(tmp.age$resid, na.rm = TRUE) sigma.sex <- sd(tmp.sex$resid, na.rm = TRUE) coefs <- c(subset(lm.full, term=="Age")$estimate * sigma.age, subset(lm.full, term=="Sex")$estimate * sigma.sex) / sigma.full slopes <- c(subset(lm.full, term=="Age")$estimate, subset(lm.full, term=="Sex")$estimate) return(c(pvalues,coefs,slopes)) } ## test for dependence between Age/Sex and x (x is mRNA/Prot expression) ## conditioned on y (y is Prot/mRNA expression) anova_tests_3 <- function(x,y) { # full model without interaction tmp.full <- lm(x ~ Age + Sex + Generation + y, data=annot.samples) lm.full <- tidy(tmp.full) # the model without Age tmp.noage <- lm(x ~ Sex + Generation + y, data=annot.samples) lm.sex <- tidy(tmp.noage) # the model without Sex tmp.nosex <- lm(x ~ Age + Generation + y, data=annot.samples) lm.age <- tidy(tmp.nosex) pvalues <- c(subset(lm.full, term=="Age")$p.value, subset(lm.full, term=="Sex")$p.value, subset(lm.full, term=="y")$p.value, subset(lm.sex, term=="y")$p.value, subset(lm.age, term=="y")$p.value) # partial correlation coefficient equals normalized beta * CONST # see http://stats.stackexchange.com/questions/76815/multiple-regression-or-partial-correlation-coefficient-and-relations-between-th pres <- !is.na(x) & !is.na(y) # must be calculated on the same observations tmp.age <- lm(Age ~ Sex + Generation + y[pres], data=annot.samples[pres,]) tmp.sex <- lm(Sex ~ Age + Generation + y[pres], data=annot.samples[pres,]) tmp.y <- lm(y[pres] ~ Sex + Age + Generation, data=annot.samples[pres,]) sigma.full <- sd(tmp.full$resid, na.rm = TRUE) sigma.age <- sd(tmp.age$resid, na.rm = TRUE) sigma.sex <- sd(tmp.sex$resid, na.rm = TRUE) sigma.y <- sd(tmp.y$resid, na.rm = TRUE) tmp.noage.y <- lm(y[pres] ~ Sex + Generation, data=annot.samples[pres,]) tmp.nosex.y <- lm(y[pres] ~ Age + Generation, data=annot.samples[pres,]) sigma.noage <- sd(tmp.noage$resid, na.rm = TRUE) sigma.noage.y <- sd(tmp.noage.y$resid, na.rm = TRUE) sigma.nosex <- sd(tmp.nosex$resid, na.rm = TRUE) sigma.nosex.y <- sd(tmp.nosex.y$resid, na.rm = TRUE) coefs <- c(subset(lm.full, term=="Age")$estimate * sigma.age / sigma.full, subset(lm.full, term=="Sex")$estimate * sigma.sex / sigma.full, subset(lm.full, term=="y")$estimate * sigma.y / sigma.full, subset(lm.sex, term=="y")$estimate * sigma.noage.y /sigma.noage, subset(lm.age, term=="y")$estimate * sigma.nosex.y / sigma.nosex) slopes <- c(subset(lm.full, term=="Age")$estimate, subset(lm.full, term=="Sex")$estimate, subset(lm.full, term=="y")$estimate, subset(lm.sex, term=="y")$estimate, subset(lm.age, term=="y")$estimate) return(c(pvalues,coefs,slopes)) } ## test for interaction between Age and Sex anova_tests_int <- function(x,y) { # full model with interaction lm.x <- tidy(lm(x ~ Age * Sex + Generation, data=annot.samples)) lm.y <- tidy(lm(y ~ Age * Sex + Generation, data=annot.samples)) lm.x.y <- tidy(lm(x ~ Age * Sex + Generation + y, data=annot.samples)) lm.y.x <- tidy(lm(y ~ Age * Sex + Generation + x, data=annot.samples)) pvalues <- c(subset(lm.x, term=="Age:SexM")$p.value, subset(lm.y, term=="Age:SexM")$p.value, subset(lm.x.y, term=="Age:SexM")$p.value, subset(lm.y.x, term=="Age:SexM")$p.value) slopes <- c(subset(lm.x, term=="Age:SexM")$estimate, subset(lm.y, term=="Age:SexM")$estimate, subset(lm.x.y, term=="Age:SexM")$estimate, subset(lm.y.x, term=="Age:SexM")$estimate) return(c(pvalues, slopes)) } cols <- c("p.mRNA_Age.Sex", "p.mRNA_Sex.Age", "r.mRNA_Age.Sex", "r.mRNA_Sex.Age", "m.mRNA_Age.Sex", "m.mRNA_Sex.Age", "p.Prot_Age.Sex", "p.Prot_Sex.Age", "r.Prot_Age.Sex", "r.Prot_Sex.Age", "m.Prot_Age.Sex", "m.Prot_Sex.Age", "p.mRNA_Age.SexProt", "p.mRNA_Sex.AgeProt", "p.mRNA_Prot.SexAge", "p.mRNA_Prot.Sex", "p.mRNA_Prot.Age", "r.mRNA_Age.SexProt", "r.mRNA_Sex.AgeProt", "r.mRNA_Prot.SexAge", "r.mRNA_Prot.Sex", "r.mRNA_Prot.Age", "m.mRNA_Age.SexProt", "m.mRNA_Sex.AgeProt", "m.mRNA_Prot.SexAge", "m.mRNA_Prot.Sex", "m.mRNA_Prot.Age", "p.Prot_Age.SexmRNA", "p.Prot_Sex.AgemRNA", "p.Prot_mRNA.SexAge", "p.Prot_mRNA.Sex", "p.Prot_mRNA.Age", "r.Prot_Age.SexmRNA", "r.Prot_Sex.AgemRNA", "r.Prot_mRNA.SexAge", "r.Prot_mRNA.Sex", "r.Prot_mRNA.Age", "m.Prot_Age.SexmRNA", "m.Prot_Sex.AgemRNA", "m.Prot_mRNA.SexAge", "m.Prot_mRNA.Sex", "m.Prot_mRNA.Age", "p.mRNA_Interaction", "p.Prot_Interaction", "p.mRNA_Interaction.Prot", "p.Prot_Interaction.mRNA", "m.mRNA_Interaction", "m.Prot_Interaction", "m.mRNA_Interaction.Prot", "m.Prot_Interaction.mRNA") result.table <- matrix(NA, N[["pairs"]], length(cols)) colnames(result.table) <- cols # interaction tests print("Testing for interaction between Age and Sex...") for (i in 1:N[["pairs"]]) { if (i %% 100 == 0) print(i) cols <- c("p.mRNA_Interaction","p.Prot_Interaction","p.mRNA_Interaction.Prot","p.Prot_Interaction.mRNA", "m.mRNA_Interaction", "m.Prot_Interaction","m.mRNA_Interaction.Prot", "m.Prot_Interaction.mRNA") result.table[i, which(colnames(result.table) %in% cols)] <- anova_tests_int(expr.mrna[,i], expr.protein[,i]) } ## normalize verything to mean=0, sd=1 ## for easy calculation of partial correlation coefficients for (i in 1:N[["pairs"]]) { expr.mrna[,i] <- scale(expr.mrna[,i]) expr.protein[,i] <- scale(expr.protein[,i]) } annot.samples$Age <- scale(annot.samples$Age) annot.samples$Sex <- scale(as.numeric(factor(annot.samples$Sex))) # ANOVA for genes print("Testing for dependence between mRNA and Age/Sex...") for (i in 1:N[["pairs"]]) { if (i %% 100 == 0) print(i) result.table[i, which(colnames(result.table) %in% c("p.mRNA_Age.Sex","p.mRNA_Sex.Age", "r.mRNA_Age.Sex","r.mRNA_Sex.Age", "m.mRNA_Age.Sex","m.mRNA_Sex.Age"))] <- anova_tests_2(expr.mrna[,i]) } # ANOVA for proteins print("Testing for dependence between mRNA and Age/Sex...") for (i in 1:N[["pairs"]]) { if (i %% 100 == 0) print(i) result.table[i, which(colnames(result.table) %in% c("p.Prot_Age.Sex","p.Prot_Sex.Age", "r.Prot_Age.Sex","r.Prot_Sex.Age", "m.Prot_Age.Sex","m.Prot_Sex.Age"))] <- anova_tests_2(expr.protein[,i]) } # ANOVA for mRNA | Protein expression print("Testing for dependence between mRNA and Age/Sex conditioned on Protein...") for (i in 1:N[["pairs"]]) { if (i %% 100 == 0) print(i) result.table[i, which(colnames(result.table) %in% c("p.mRNA_Age.SexProt","p.mRNA_Sex.AgeProt","p.mRNA_Prot.SexAge","p.mRNA_Prot.Sex","p.mRNA_Prot.Age", "r.mRNA_Age.SexProt","r.mRNA_Sex.AgeProt","r.mRNA_Prot.SexAge","r.mRNA_Prot.Sex","r.mRNA_Prot.Age", "m.mRNA_Age.SexProt","m.mRNA_Sex.AgeProt","m.mRNA_Prot.SexAge","m.mRNA_Prot.Sex","m.mRNA_Prot.Age"))] <- anova_tests_3(expr.mrna[,i], expr.protein[,i]) } # ANOVA for mRNA | Protein expression print("Testing for dependence between mRNA and Age/Sex conditioned on Protein...") for (i in 1:N[["pairs"]]) { if (i %% 100 == 0) print(i) result.table[i, which(colnames(result.table) %in% c("p.Prot_Age.SexmRNA","p.Prot_Sex.AgemRNA","p.Prot_mRNA.SexAge","p.Prot_mRNA.Sex","p.Prot_mRNA.Age", "r.Prot_Age.SexmRNA","r.Prot_Sex.AgemRNA","r.Prot_mRNA.SexAge","r.Prot_mRNA.Sex","r.Prot_mRNA.Age", "m.Prot_Age.SexmRNA","m.Prot_Sex.AgemRNA","m.Prot_mRNA.SexAge","m.Prot_mRNA.Sex","m.Prot_mRNA.Age"))] <- anova_tests_3(expr.protein[,i], expr.mrna[,i]) } # reorder columns - p-values first, corelation coefs second pcols <- grep("^p.", colnames(result.table)) rcols <- grep("^r.", colnames(result.table)) mcols <- grep("^m.", colnames(result.table)) result.table <- result.table[,c(pcols,rcols,mcols)] # LOD (Log-Likelihood) statistics ----------------------------------------- ## test of submodel: x ~ test + y + covar vs x ~ y + covar ## xy = c(x,y), test = age or sex ## to be run with parApply question.loddrop <- function(x, y, test, covar) { # fit models lm.m1 <- lm(x ~ test + y + covar) lm.m0 <- lm(x ~ y + covar) lm.m3 <- lm(x ~ test + covar) lm.m2 <- lm(x ~ covar) # lod <- c((logLik(lm.m1)-logLik(lm.m0))/log(10), (logLik(lm.m3)-logLik(lm.m2)) / log(10)) return(lod) } lods <- matrix(NA, N[["pairs"]], 4) colnames(lods) <- c("lod.Prot_Age.mRNASex", "lod.Prot_Age.Sex", "lod.Prot_Sex.mRNAAge", "lod.Prot_Sex.Age") test.age <- model.matrix(~ Age, data=annot.samples)[,-1] covar.age<- model.matrix(~ Sex + Generation, data=annot.samples)[,-1] test.sex <- model.matrix(~ Sex, data=annot.samples)[,-1] covar.sex<- model.matrix(~ Age + Generation, data=annot.samples)[,-1] # LOD statistics print("Is Age/Sex effect mediated by RNA...") for (i in 1:N[["pairs"]]) { if (i %% 100 == 0) print(i) lods[i, 1:2] <- question.loddrop(expr.protein[,i], expr.mrna[,i], test=test.age,covar=covar.age) lods[i, 3:4] <- question.loddrop(expr.protein[,i], expr.mrna[,i], test=test.sex,covar=covar.sex) } # Output ------------------------------------------------------------------ annot.cols <- c("id", "gene_id", "symbol", "chr", "start", "end", "strand", "biotype") output <- cbind(annot.protein[1:N[["pairs"]], annot.cols], result.table, round(lods,4), round(averages.mrna,4), round(averages.protein,4)) write.csv(output, output.file, row.names=FALSE)