--- title: "Gene expression and Disease" author: "Aparna Pandey and Stephan Peischl" date: "2024-05-17" format: html: toc: true code-tools: true engine: knitr --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # Introduction In this analysis, we simulate a biological dataset where gene expression levels influence the presence or absence of a disease trait. We generate synthetic gene expression data, create a response variable (`disease_presence`), and fit logistic regression and decision tree models. Finally, we evaluate and visualize the models. **Concept companions (short reads on the site):** [Module 01 — Foundations](../modules/module-01-foundations.qmd) (splits and generalization) and [Module 03 — Trees](../modules/module-03-trees-and-cultures.qmd) — this file is the **long gene lab** with all simulation and plots. # Summary of the Script ## Dataset Generation: - Number of Samples (n): 200 - Number of Features (p): 10 genes (named Gene1 to Gene10) - Gene Expression Data: Simulated as normally distributed random values. - Causal Genes: - Gene1: Positive effect with a coefficient of 0.5, plus interaction with Gene3. - Gene3: Positive effect with a coefficient of 0.8, plus interaction with Gene1. - Gene4: Negative effect with a coefficient of -0.3. - Response Variable (disease_presence): Binary (indicating the presence or absence of a disease trait) generated based on a linear combination of Gene1, Gene3, and Gene4 with some added noise. ## Modeling: - Logistic Regression: A generalized linear model (glm) with a binomial family was used to model the binary response variable (disease_presence). - Decision Tree (rpart): A recursive partitioning tree was used to classify the binary response variable. ## Visualization: - Pair Plot: Shows relationships among Gene1 to Gene4, colored by the response variable (disease_presence). - Decision Boundaries: Plots decision boundaries for logistic regression and the rpart model using Gene1 and Gene3 as features. # Data Generation We simulate gene expression data for 200 samples and 10 genes. `Gene1`, `Gene3`, and `Gene4` are causal genes influencing disease presence. ```{r data-generation, message=FALSE, warning=FALSE} # Load required libraries library(yardstick) library(rpart) library(ggplot2) library(rlang) library(GGally) # Set seed for reproducibility set.seed(123) # Number of samples n <- 200 # Number of genes (features) p <- 10 # Generate gene names gene_names <- paste0("Gene", 1:p) # Generate gene expression levels genes_data <- data.frame(matrix(rnorm(n * p), nrow = n)) colnames(genes_data) <- gene_names # True coefficients representing gene influence on disease presence true_coefficients <- c(0.5, rep(0, 1), 0.8, -0.3, rep(0, p - 3)) # Generate disease trait presence based on gene expression levels disease_presence <- ifelse(0.5 * genes_data$Gene1 + 0.8 * genes_data$Gene3 - 0.3 * genes_data$Gene4 - genes_data$Gene1*genes_data$Gene3 * 2 + rnorm(n, sd=0.5) > 0, 1, 0) # Convert disease_presence to a factor with consistent levels disease_presence <- factor(disease_presence, levels = c(0, 1), labels = c("Absent", "Present")) # Combine gene expression and disease presence into a single data frame bio_data <- data.frame(genes_data, disease_presence) ``` ## Visualizations ```{r} # Pair Plot of some features colored by disease presence pair_plot <- ggpairs(bio_data, columns = 1:5, aes(color = disease_presence)) + theme_minimal() print(pair_plot) ``` ## Modeling We fit logistic regression and decision tree models to classify disease presence. ```{r} # Using logistic regression logistic_model <- glm(disease_presence ~ ., data = bio_data, family = "binomial") # Using rpart for classification rpart_model <- rpart(disease_presence ~ ., data = bio_data, method = "class") ``` ## Evaluation We evaluate and compare the accuracy of logistic regression and decision tree models. ```{r} # Predictions from models logistic_pred <- predict(logistic_model, type = "response") rpart_pred <- predict(rpart_model, type = "class", newdata = bio_data) # Convert predictions to factors with levels logistic_pred <- factor(ifelse(logistic_pred > 0.5, "Present", "Absent"), levels = c("Absent", "Present")) rpart_pred <- factor(rpart_pred, levels = levels(disease_presence)) # Evaluate model performance (yardstick) logistic_accuracy <- mean(logistic_pred == bio_data$disease_presence) rpart_accuracy <- mean(rpart_pred == bio_data$disease_presence) cat("Logistic Regression Accuracy:", logistic_accuracy, "\n") cat("Decision Tree (rpart) Accuracy:", rpart_accuracy, "\n") ``` ```{r plot-decision-tree, fig.width=8, fig.height=8} # Plotting the decision tree library(rpart.plot) rpart.plot(rpart_model,type=5) ``` ## Summarize Logistic Regression Coefficients ```{r summarize-logistic-coefficients} # Summarizing logistic regression coefficients summary(logistic_model) ``` ```{r} # Decision boundary visualization plot_decision_boundary <- function(model, data, feature1, feature2, model_type) { # Create grid for visualization grid <- expand.grid( Feature1 = seq(min(data[[feature1]]) - 1, max(data[[feature1]]) + 1, length.out = 200), Feature2 = seq(min(data[[feature2]]) - 1, max(data[[feature2]]) + 1, length.out = 200) ) colnames(grid) <- c(feature1, feature2) # Add the remaining variables as means for (feature in setdiff(names(data), c(feature1, feature2, "disease_presence"))) { grid[[feature]] <- mean(data[[feature]]) } if (model_type == "logistic") { grid$response <- predict(model, newdata = grid, type = "response") grid$response <- factor(ifelse(grid$response > 0.5, "Present", "Absent"), levels = c("Absent", "Present")) } else { grid$response <- predict(model, newdata = grid, type = "class") } ggplot(data, aes(x = !!rlang::sym(feature1), y = !!rlang::sym(feature2))) + geom_point(aes(fill = disease_presence), alpha = 1, shape = 21, col = "black", size = 3) + geom_tile( data = grid, aes(x = !!rlang::sym(feature1), y = !!rlang::sym(feature2), fill = response), alpha = 0.3, inherit.aes = FALSE ) + scale_fill_manual(values = c("Absent" = "turquoise", "Present" = "magenta")) + scale_color_manual(values = c("Absent" = "turquoise", "Present" = "magenta")) + theme_minimal() + labs(title = paste("Decision Boundary for", model_type, "Model"), x = feature1, y = feature2) } # Decision boundary for Logistic Regression using Gene1 and Gene3 print(plot_decision_boundary(logistic_model, bio_data, "Gene1", "Gene3", "logistic")) # Decision boundary for rpart using Gene1 and Gene3 print(plot_decision_boundary(rpart_model, bio_data, "Gene1", "Gene3", "rpart")) ```