---
title: "Pcod Sex Markers"
output: html_notebook
---
```{r}
require(janitor)
require(readxl)
require(tidyverse)
require(qqman)
require(GenomicRanges)
require(IRanges)
require(plotly)
require(ggpubr)
require(knitr)
load(file="../pcod-juveniles-2023/references/pcod.gtf")
load(file = "../pcod-juveniles-2023/references/pcod.blast.GO")
`%!in%` = Negate(`%in%`)
source("biostats.R")
```
```{r, messages=F}
beagle.order <- read_delim(
file="../pcod-juveniles-2023/lcWGS/analysis-20240606/reference/pcod-refs_filtered_bamslist.txt", delim = "/t", col_names = "sample") %>%
mutate(sample=gsub("/home/lspencer/pcod-lcwgs-2023/analysis-20240606/reference/bamtools/pcod-refs_|_sorted_dedup_clipped.bam",
"", sample))
ref.sex <- beagle.order %>% left_join(
read_excel("../pcod-juveniles-2023/lcWGS/references/20230414_pcod_named.xlsx", na = c("---")) %>% clean_names() %>%
dplyr::select(ablg, sex, marine_region) %>% mutate(sample=paste0("ABLG", ablg)), by="sample") %>% filter(!is.na(sex)) %>% mutate(sex=as.factor(sex))
ref.sex %>% group_by(marine_region, sex) %>% tally() %>% kable(format="markdown")
ref.sex %>% group_by(sex) %>% tally() %>% knitr::kable(format="markdown")
ref.sex %>% select(sample) %>% write_delim(file="fish-ids.txt", col_names = F)
ref.sex %>% select(sex) %>% mutate(sex=as.numeric(sex=="female")) %>% write_delim(file="fish-sex.txt", col_names = F)
ref.sex %>% select(marine_region) %>% mutate(marine_region=as.numeric(as.factor(marine_region))) %>% write_delim(file="fish-region.txt", col_names = F)
# Load beagle that contains sexed fish, NOT Imputed probabilities
sex.beagle<-read.table(gzfile("pcod-sex.beagle.gz"), header = T, sep="\t")
```
### Read in LRT results
```{r}
#lrt<-read.table(gzfile("gwas-sex-imputed.out.lrt0.gz"), header=T, sep="\t")
lrt<-read.table(gzfile("gwas-sex.out.lrt0.gz"), header=T, sep="\t")
#we have a few LRT values that are -999, we should remove them.
length(lrt$LRT) # number of loci
length(which(lrt$LRT == -999)) #number that will be removed
length(which(lrt$LRT != -999))
#remove the values that are not -999 and that are negative
lrt_filt<-lrt[-c(which(lrt$LRT == -999),which(lrt$LRT <= 0)),]
#add snp "name" from rownumber
lrt_filt$SNP<-paste("r",1:length(lrt_filt$Chromosome), sep="")
# get pvalues from chisq
lrt_filt$pvalue<-pchisq(lrt_filt$LRT, df=1, lower=F)
hist(lrt_filt$LRT, breaks=50)
hist(lrt_filt$pvalue, breaks=50)
qqnorm(lrt_filt$pvalue)
# #we also need to make sure we don't have any tricky values like those below
# lrt_filt<-lrt_filt[-c(which(lrt_filt$pvalue == "NaN" ),
# which(lrt_filt$pvalue == "Inf"),
# which(lrt_filt$LRT == "Inf")),]
manhattan(lrt_filt %>% mutate(chr=as.numeric(as.factor(Chromosome))),
chr="chr", bp="Position", p="pvalue", main="Sex-association, GWAS")
lrt_filt %>% mutate(logp=-log10(pvalue)) %>% filter(logp>4) %>% arrange(desc(LRT))
#lrt_filt %>% mutate(fdr_pvalue=p.adjust(pvalue, method="fdr")) %>% filter(fdr_pvalue<0.05) %>% arrange(desc(LRT))
lrt_filt %>% mutate(logp=-log10(pvalue)) %>% filter(logp>4) %>% arrange(desc(LRT)) %>%
unite("marker", Chromosome:Position, sep="_") %>% select(marker)%>% write_delim(file="sex-markers-logp4.txt", col_names = F)
```
### Where are putative sex markers in relation to annotated genes?
```{r}
# Prepare gene annotation ranges
genes4gwas <- pcod.gtf %>% mutate(ncbi_id=paste0("GeneID:", ncbi_id)) %>%
filter(feature=="gene") %>%
mutate(start_flank=start-2000) %>%
mutate(start_flank=as.integer(case_when(start_flank<0~0, TRUE~start_flank))) #%>%
gene_ranges <- GRanges(
seqnames =genes4gwas$seqname,
ranges=IRanges(
start=genes4gwas$start_flank,
# start = genes4gwas$start,
end=genes4gwas$end),
ncbi_id=genes4gwas$ncbi_id,
gene_id=genes4gwas$gene_id)
# Prepare putative sex marker sites locations
sex.sites <- lrt_filt %>% mutate(logp=-log10(pvalue)) %>% filter(logp>4) %>% mutate(marker=paste0(Chromosome, "_", Position))
snp_ranges <- GRanges(
seqnames = sex.sites$Chromosome,
ranges=IRanges(
start = sex.sites$Position,
end = sex.sites$Position),
pvalue=sex.sites$pvalue,
Major=sex.sites$Major,
Minor=sex.sites$Minor)
# Find overlaps between gene flanks and SNP sites
overlaps <- findOverlaps(gene_ranges, snp_ranges)
# Extract matching rows
overlapping_genes <- genes4gwas[queryHits(overlaps), ]
overlapping_snps <- sex.sites[subjectHits(overlaps), ]
# Combine results into a single data frame
result <- cbind(overlapping_genes, overlapping_snps) %>%
mutate(feature=case_when(Position>=start~"gene", Position%
dplyr::select(-c(score, frame, attributes,biotype, exon)) %>%
# dplyr::select(feature, seqname, strand, start_flank, start, end, Position, gene_id, ncbi_id, gene_id) %>%
left_join(pcod.blast.GO[c("ncbi_id", "protein_names", "spid")] %>% mutate(ncbi_id=paste0("GeneID:", ncbi_id)), by="ncbi_id")
# what percent of SNPs are located inside
result %>% dplyr::select(-Position) %>% distinct() %>% group_by(feature) %>% tally() %>%
mutate(total=nrow(genes4gwas)) %>% mutate(perc=round(100*n/total, 2))
# require(clipr)
# (result %>% select(Chromosome, Position, spid) %>% distinct() %>% filter(!is.na(spid)))$spid %>% write_clip() #sex-associated genes
# (pcod.blast.GO %>% filter(!is.na(spid)))$spid %>% write_clip()
# result %>% arrange(desc(LRT)) %>% select(Chromosome, Position, Major, Minor, LRT, SE, pvalue, gene_id, protein_names) %>%
# mutate(across(c(LRT:pvalue), ~ signif(.x, 3))) %>% write_clip()
```
```{r, message=F, warning=F, error=F}
ref.beagle.sex.sites <-
sex.beagle %>% filter(marker %in% sex.sites$marker) %>%
pivot_longer(ABLG1968_AA:ABLG2689_BB, names_to = "individual.allele", values_to = "probability") %>%
separate(individual.allele, into=c("individual", "allele"), sep = "_") %>%
mutate(allele_base=case_when(
allele=="AA" ~ paste0(allele1, "/", allele1),
allele=="AB" ~ paste0(allele1, "/", allele2),
allele=="BB" ~ paste0(allele2, "/", allele2))) %>%
mutate(allele_base = allele_base %>%
str_replace_all("0", "A") %>%
str_replace_all("1", "C") %>%
str_replace_all("2", "G") %>%
str_replace_all("3", "T")) %>%
mutate(probability=round(as.numeric(probability), digits=4)) %>%
group_by(marker, individual) %>%
filter(!(n() == 3 & all(probability == 0.3333))) %>%
slice_max(probability,n=1) %>%
left_join(ref.sex, by=c("individual"="sample")) %>%
mutate(allele_base_num=case_when(
allele=="AA"~ 1,
allele=="AB"~0.5,
allele=="BB"~0))
sex.markers <- (sex.sites %>% arrange(desc(LRT)))$marker
for (i in 1:10) {
meta <- ref.beagle.sex.sites %>% filter(marker==sex.markers[i]) %>% ungroup() %>%
select(marker) %>% distinct() %>%
left_join(result %>% mutate(marker=paste0(Chromosome, "_", Position)))
print(ref.beagle.sex.sites %>% filter(marker==sex.markers[i]) %>%
# filter(!is.na(allele_base)) %>% droplevels() %>%
ggplot() +
geom_point(aes(x=sex, y=allele_base_num, color=sex), position=position_jitter(w = 0.25,h = 0.1)) +
theme_minimal() +
facet_wrap(~marine_region, scales="free_y", nrow = 1) +
scale_color_manual(name="Sex", values=c("male"="#2c7bb6", "female"="#d7191c")) +
#scale_color_manual(name=NULL, values=c("NBS"="blue2", "Aleutians"="orange2", "eGOA"="gray25", "wGOA"="green4", "EBS"="purple")) +
ggtitle(paste0("Marker: ", meta$marker, "\n Gene ID: ", meta$gene_id, "\n Protein: ", meta$protein_names)))
}
```
### Use PCA to look for sex-associated structure
```{r}
# Make dataframe with sample IDs and sex go generate sex-specific allele frequencies
ref.sex %>% select(sample, sex) %>% write_delim(file="fish-ids-sex.txt", col_names = F)
```
```{r}
# Read in covariance matrix and add sample IDs
sex.cov <- read_delim(file="pcod-sex.cov", col_names = ref.sex$sample) %>%
as.matrix() %>%
`rownames<-`(ref.sex$sample)
# Run PCA
pca.sex <- prcomp(sex.cov, scale=F) #scale=F for variance-covariance matrix
#pca.eigenval(pca.princomp) #The Proporation of Variance = %variance
pc.percent <- pca.eigenval(pca.sex)[2,1:6]*100 #PC % for axes 1-6
screeplot(pca.sex, bstick=FALSE) #inspect scree plot, which axes influential?
pc.percent[1:2] %>% sum() # total percent explained by PCs 1 & 2
#### Generate dataframe with prcomp results
pc.scores.sex <- data.frame(sample.id = colnames(sex.cov),
PC1 = pca.sex$rotation[,1], # the first eigenvector
PC2 = pca.sex$rotation[,2], # the second eigenvector
PC3 = pca.sex$rotation[,3], # the third eigenvector
PC4 = pca.sex$rotation[,4], # the fourth eigenvector
PC5 = pca.sex$rotation[,5], # the fourth eigenvector
PC6 = pca.sex$rotation[,6], # the fourth eigenvector
stringsAsFactors = FALSE)
# Add metadata
pc.scores.sex <- left_join(pc.scores.sex, ref.sex,by=c("sample.id"="sample"))
axes <- data.frame("pc.x"=c(1,1,2),
"pc.y"=c(2,3,3)) %>%
mutate(pc.x=paste("PC", pc.x, sep=""), pc.y=paste("PC", pc.y, sep=""))
# Variance explained by each PC axis
variance <- pc.percent %>% as.data.frame() %>% set_names("variance") %>% rownames_to_column("axis") %>%
filter(axis %in% c("PC1", "PC2", "PC3")) #%>%
# Plot PC biplots for axes 1-3
#ggplotly(
ggscatter(pc.scores.sex,
group=c("sex"), col="sex", text="sample.id",
x=axes[1,"pc.x"], y=axes[1,"pc.y"], size=1.5, alpha=0.85,
ellipse = T, star.plot = F) +
theme_minimal() + ggtitle("Global gene expression PC1xPC2") +
ylab(paste(axes[1, "pc.y"], " (", round(variance[variance$axis==axes[1, "pc.y"], "variance"], digits = 2), "%)", sep="")) +
xlab(paste(axes[1, "pc.x"], " (", round(variance[variance$axis==axes[1, "pc.x"], "variance"], digits = 2), "%)", sep="")) +
theme(legend.position = "bottom", legend.text=element_text(size=8), legend.title=element_text(size=9)) +
scale_color_manual(name="Sex", values=c("male"="#2c7bb6", "female"="#d7191c")) +
scale_fill_manual(guide=F, values=c("male"="#2c7bb6", "female"="#d7191c")) +
ggtitle(paste("PCA using putative sex markers in Pacific cod\n", axes[1, "pc.y"], "x", axes[1, "pc.x"], sep=" ")) + facet_wrap(~marine_region) #,
# hoverinfo = list("sex", "sample.id", "marine_region"), tooltip = list("treatment", "sample.id", "marine_region"))
#ggplotly(
ggscatter(pc.scores.sex,
group=c("sex"), col="sex", text="sample.id",
x=axes[2,"pc.x"], y=axes[2,"pc.y"], size=1.5, alpha=0.85,
ellipse = T, star.plot = F) +
theme_minimal() + ggtitle("Global gene expression PC1xPC3") +
ylab(paste(axes[2, "pc.y"], " (", round(variance[variance$axis==axes[2, "pc.y"], "variance"], digits = 2), "%)", sep="")) +
xlab(paste(axes[2, "pc.x"], " (", round(variance[variance$axis==axes[2, "pc.x"], "variance"], digits = 2), "%)", sep="")) +
theme(legend.position = "bottom", legend.text=element_text(size=8), legend.title=element_text(size=9)) +
scale_color_manual(name="Sex", values=c("male"="#2c7bb6", "female"="#d7191c")) +
scale_fill_manual(guide=F, values=c("male"="#2c7bb6", "female"="#d7191c")) +
ggtitle(paste(axes[2, "pc.y"], "x", axes[2, "pc.x"], sep=" "))+ facet_wrap(~marine_region) #,
# hoverinfo = list("sex", "sample.id", "marine_region"), tooltip = list("treatment", "sample.id", "marine_region"))
```
### WGSassign?
```{r}
```
### Any sex-associated microhaplotype on Chromosome 11 (NC_082392.1)
```{r}
result %>% arrange(desc(LRT))
ref.beagle.sex.sites %>% filter(str_detect(marker, "NC_082392.1")) %>%
separate(marker, into=c("a","b","position"), remove = F, sep = "_") %>% select(-a, -b) %>% ungroup() %>%
mutate(position=paste0("pos.", position)) %>% dplyr::select(individual,sex,marine_region,position,allele_base) %>%
pivot_wider(names_from = position, values_from = allele_base) %>% mutate(across(everything(), ~ifelse(is.na(.), "NA", .))) %>%
unite("microhap", pos.13871084,pos.13891711, sep=".") %>% filter(!str_detect(microhap,"NA")) %>%
mutate(microhap=as.factor(microhap), sex=as.factor(sex)) %>%
ggplot() +
geom_bar(aes(x = sex, y = microhap, fill=microhap), position="stack", stat="identity", color=NA) +
theme_minimal() + facet_wrap(~marine_region)
```