##########################################
## Solution for HW7 - RNA-Seq Analysis  ##
##########################################

library(Rsubread)
library(GenomicFeatures)
library(systemPipeR)
library(edgeR)

## ── Input setup ───────────────────────────────────────────────
## Run after completing the RNA-Seq workflow through HISAT2 mapping.
## The sal object and sorted BAM files must be available.
## The TxDb object (tair10.sqlite) is created from the GFF annotation
## during the workflow and loaded here with loadDb().

txdb <- loadDb("./data/tair10.sqlite")

## Exons grouped by gene — used as annotation for featureCounts
## in Parts A and B (equivalent to exonsBy(txdb, by="gene"))
eByg <- exonsBy(txdb, by = "gene")

## Helper: convert GRangesList/GRanges to featureCounts annotation data frame
grl_to_df <- function(grl) {
    gr <- unlist(grl)
    data.frame(GeneID = names(gr),
               Chr    = as.character(seqnames(gr)),
               Start  = start(gr),
               End    = end(gr),
               Strand = as.character(strand(gr)),
               stringsAsFactors = FALSE)
}

bam_files <- getColumn(sal, step = "hisat2_mapping",
                       "outfiles", column = "samtools_sort_bam")
## Alternative if sal is not available:
# bam_files <- list.files("results/hisat2_mapping/",
#                         pattern = "sorted.bam$", full.names = TRUE)

########################################################
## A. Unstranded and Strand-Specific Read Counting    ##
########################################################

############
## Task A1 ##
############

## 1. Unstranded — strandSpecific = 0
fc_unstranded <- featureCounts(
    files                  = bam_files,
    annot.ext              = grl_to_df(eByg),
    isGTFAnnotationFile    = FALSE,
    useMetaFeatures        = TRUE,
    strandSpecific         = 0,
    isPairedEnd            = TRUE,
    countMultiMappingReads = FALSE,
    nthreads               = 4
)
unstranded <- fc_unstranded$counts
unstranded[1:4, ]

## 2. Sense (positive) strand — strandSpecific = 1
fc_stranded_pos <- featureCounts(
    files                  = bam_files,
    annot.ext              = grl_to_df(eByg),
    isGTFAnnotationFile    = FALSE,
    useMetaFeatures        = TRUE,
    strandSpecific         = 1,
    isPairedEnd            = TRUE,
    countMultiMappingReads = FALSE,
    nthreads               = 4
)
stranded_pos <- fc_stranded_pos$counts
stranded_pos[1:4, ]

## 3. Antisense (negative) strand — strandSpecific = 2
fc_stranded_neg <- featureCounts(
    files                  = bam_files,
    annot.ext              = grl_to_df(eByg),
    isGTFAnnotationFile    = FALSE,
    useMetaFeatures        = TRUE,
    strandSpecific         = 2,
    isPairedEnd            = TRUE,
    countMultiMappingReads = FALSE,
    nthreads               = 4
)
stranded_neg <- fc_stranded_neg$counts
stranded_neg[1:4, ]

############
## Task A2 ##
############
## Test whether the two strand-specific count tables sum to very similar
## values as the unstranded count table. For paired-end data one can only
## expect very similar but not identical results. The following calculates
## for each strand-specific result the averaged mapping frequency in percent
## relative to the unstranded result.

perc_pos <- mean((stranded_pos / unstranded) * 100, na.rm = TRUE)
perc_neg <- mean((stranded_neg / unstranded) * 100, na.rm = TRUE)
perc_pos
perc_neg
## Since both perc_pos and perc_neg are close to 50%, the RNA-Seq data set
## used here is very likely unstranded.

############
## Task A3 ##
############
## Biological utility of the different strand-specific counting modes:
##
## 1. When to use strand-specific counting:
##    If an RNA-Seq library has been prepared in a strand-specific manner
##    (e.g. using dUTP or ligation-based strand-specific library prep kits),
##    read counting should be performed using the corresponding strandSpecific
##    mode (1 or 2) to correctly assign reads to genes. Using unstranded
##    counting on stranded data would undercount reads and reduce sensitivity.
##
## 2. Overlapping genes on opposite strands:
##    Strand-specific counting allows unambiguous assignment of reads to
##    overlapping genes encoded on opposite strands. Without strand information,
##    reads from two overlapping antisense genes cannot be reliably assigned
##    to one or the other, leading to ambiguous or discarded counts.
##
## 3. Antisense strand counting:
##    Counting reads on the antisense strand (strandSpecific = 2) can reveal
##    natural antisense transcripts (NATs) and other antisense regulation
##    events such as gene silencing via antisense non-coding RNAs. These
##    are biologically important but invisible in unstranded or sense-only
##    counting.
##
## 4. Determining library strandedness:
##    By comparing the sense and antisense strand-specific count tables with
##    the unstranded count table, one can determine whether an RNA-Seq
##    experiment used a strand-specific protocol. If both perc_pos and
##    perc_neg are close to ~50%, the library is unstranded. If one value
##    dominates (e.g. ~100% sense, ~0% antisense), the library is
##    strand-specific and the appropriate strandSpecific mode should be
##    used for downstream analysis.

##################################################
## B. Read Counting for Different Feature Types ##
##################################################

############
## Task B  ##
############

## 1. Genes — eByg = exonsBy(txdb, by="gene"), loaded from tair10.sqlite
fc_genes <- featureCounts(
    files                  = bam_files,
    annot.ext              = grl_to_df(eByg),
    isGTFAnnotationFile    = FALSE,
    useMetaFeatures        = TRUE,
    strandSpecific         = 1,
    isPairedEnd            = TRUE,
    countMultiMappingReads = FALSE,
    nthreads               = 4
)
gene_ranges_countDF <- fc_genes$counts
gene_ranges_countDF[1:4, ]

## 2. Exons — individual exon level using exons(txdb)
exon_ranges <- exons(txdb)
fc_exons <- featureCounts(
    files                  = bam_files,
    annot.ext              = grl_to_df(exon_ranges),
    isGTFAnnotationFile    = FALSE,
    useMetaFeatures        = FALSE,
    strandSpecific         = 1,
    isPairedEnd            = TRUE,
    countMultiMappingReads = FALSE,
    nthreads               = 4
)
exon_ranges_countDF <- fc_exons$counts
exon_ranges_countDF[1:4, ]

## 3. Exons by genes — eByg is directly equivalent to exonsBy(txdb, by="gene")
##    so we reuse the gene-level result
exonByg_ranges_countDF <- gene_ranges_countDF
exonByg_ranges_countDF[1:4, ]

## 4. Introns by transcripts — derive from txdb, pass GRangesList directly
intron_ranges <- intronsByTranscript(txdb, use.names = TRUE)
fc_introns <- featureCounts(
    files                  = bam_files,
    annot.ext              = grl_to_df(intron_ranges),
    isGTFAnnotationFile    = FALSE,
    strandSpecific         = 1,
    isPairedEnd            = FALSE,
    countMultiMappingReads = FALSE,
    nthreads               = 4
)
intron_ranges_countDF <- fc_introns$counts
intron_ranges_countDF[1:4, ]

## 5. 5'-UTRs by transcripts — same approach as introns
fiveUTR_ranges <- fiveUTRsByTranscript(txdb, use.names = TRUE)
fc_fiveUTR <- featureCounts(
    files                  = bam_files,
    annot.ext              = grl_to_df(fiveUTR_ranges),
    isGTFAnnotationFile    = FALSE,
    strandSpecific         = 1,
    isPairedEnd            = FALSE,
    countMultiMappingReads = FALSE,
    nthreads               = 4
)
fiveUTR_ranges_countDF <- fc_fiveUTR$counts
fiveUTR_ranges_countDF[1:4, ]

#####################
## C. DEG Analysis ##
#####################

############
## Task C  ##
############

## Retrieve sample comparisons from the workflow sal object
cmp <- readComp(stepsWF(sal)[["hisat2_mapping"]],
                format = "matrix", delim = "-")
## Alternative if sal is not available:
# cmp <- readComp(file = "targetsPE.txt", format = "matrix", delim = "-")

## DEG analysis with unstranded count table
edgeDF_unstranded <- run_edgeR(
    countDF     = unstranded,
    targets     = targetsWF(sal)[["hisat2_mapping"]],
    cmp         = cmp[[1]],
    independent = FALSE,
    mdsplot     = ""
)

## DEG analysis with sense strand count table
edgeDF_pos <- run_edgeR(
    countDF     = stranded_pos,
    targets     = targetsWF(sal)[["hisat2_mapping"]],
    cmp         = cmp[[1]],
    independent = FALSE,
    mdsplot     = ""
)

## (1) 4-way Venn diagram for unstranded count table
DEG_list_unstranded <- filterDEGs(
    degDF  = edgeDF_unstranded,
    filter = c(Fold = 2, FDR = 40),
    plot   = FALSE
)
vennsetup   <- overLapper(DEG_list_unstranded$Up[6:9],   type = "vennsets")
vennsetdown <- overLapper(DEG_list_unstranded$Down[6:9], type = "vennsets")
pdf("results/DEGcount_unstranded.pdf")
vennPlot(list(vennsetup, vennsetdown),
         mymain = "", mysub = "", colmode = 2,
         ccol = c("blue", "red"))
dev.off()

## (2) 4-way Venn diagram for sense strand count table
DEG_list_pos <- filterDEGs(
    degDF  = edgeDF_pos,
    filter = c(Fold = 2, FDR = 40),
    plot   = FALSE
)
vennsetup   <- overLapper(DEG_list_pos$Up[6:9],   type = "vennsets")
vennsetdown <- overLapper(DEG_list_pos$Down[6:9], type = "vennsets")
pdf("results/DEGcount_pos.pdf")
vennPlot(list(vennsetup, vennsetdown),
         mymain = "", mysub = "", colmode = 2,
         ccol = c("blue", "red"))
dev.off()

###################################################################
## D. Bonus Challenge — edgeR Native Mode, Time Course Design    ##
###################################################################

## The Howard et al. (2013) experiment is a 2-factor time course:
##   treatment: mock (M) vs Pseudomonas-infected (A)
##   time: 1h, 2h, 12h post-inoculation
## An interaction model ~ treatment + time + treatment:time
## captures whether the treatment effect changes over time —
## the biologically meaningful question in a time course experiment.

## Step 1 — Prepare sample metadata
targets           <- read.delim("targetsPE.txt", comment.char = "#")
targets$treatment <- ifelse(grepl("^M", targets$Factor), "mock", "infected")
targets$time      <- gsub("^[MA]", "", targets$Factor)
targets$time      <- factor(targets$time,      levels = c("1", "2", "12"))
targets$treatment <- factor(targets$treatment, levels = c("mock", "infected"))
targets[, c("SampleName", "Factor", "treatment", "time")]

## Step 2 — Create DGEList and filter low-count genes
countDF <- unstranded[, targets$SampleName]
countDF[is.na(countDF)] <- 0
dge     <- DGEList(counts = countDF, group = targets$Factor)

## Keep genes with CPM >= 1 in at least 2 samples
keep <- rowSums(cpm(dge) >= 1) >= 2
dge  <- dge[keep, , keep.lib.sizes = FALSE]
message(sum(keep), " genes retained after low-count filtering")

## TMM normalization
dge <- calcNormFactors(dge, method = "TMM")

## Step 3 — Build interaction design matrix
design_tc <- model.matrix(~ treatment + time + treatment:time,
                          data = targets)
colnames(design_tc)
## Intercept, treatmentinfected, time2, time12,
## treatmentinfected:time2, treatmentinfected:time12

## Step 4 — Estimate dispersion and fit QL GLM
dge    <- estimateDisp(dge, design_tc, robust = TRUE)
plotBCV(dge)
fit_tc <- glmQLFit(dge, design_tc, robust = TRUE)

## Step 5 — Test interaction terms (treatment effect changes over time)
interaction_cols <- grep("treatmentinfected:time", colnames(design_tc))
qlf_interaction  <- glmQLFTest(fit_tc, coef = interaction_cols)

topTags(qlf_interaction, n = 20)
summary(decideTests(qlf_interaction, p.value = 0.05))

tc_results <- as.data.frame(topTags(qlf_interaction, n = Inf))
write.csv(tc_results, "results/edgeR_timecourse_interaction.csv")

## Step 6 — Comparison with pairwise result from Part C
##
## The interaction model identifies genes whose response to Pseudomonas
## infection changes significantly over time — e.g. genes strongly induced
## at 12h but not at 1h or 2h. This is the core biological question of a
## time course experiment and tests all samples jointly in a single model.
##
## The pairwise approach in run_edgeR (Part C) compares mock vs infected
## at each time point independently. It asks "which genes differ between
## conditions at each time point?" but does not model the temporal trajectory.
##
## The interaction model is generally more powerful for detecting dynamic
## treatment responses because it borrows information across time points.
## It typically returns a different — often larger — biologically relevant
## DEG set. The two approaches answer complementary questions:
##   - Pairwise: "Is this gene DE at time X?"
##   - Interaction: "Does this gene's response to infection change over time?"
## Choice of design should always be driven by the biological question.