--- title: "Pcod Sex Markers" output: html_notebook --- ```{r} require(janitor) require(readxl) require(tidyverse) require(qqman) require(GenomicRanges) require(IRanges) require(plotly) require(ggpubr) require(knitr) load(file="../pcod-juveniles-2023/references/pcod.gtf") load(file = "../pcod-juveniles-2023/references/pcod.blast.GO") `%!in%` = Negate(`%in%`) source("biostats.R") ``` ```{r, messages=F} beagle.order <- read_delim( file="../pcod-juveniles-2023/lcWGS/analysis-20240606/reference/pcod-refs_filtered_bamslist.txt", delim = "/t", col_names = "sample") %>% mutate(sample=gsub("/home/lspencer/pcod-lcwgs-2023/analysis-20240606/reference/bamtools/pcod-refs_|_sorted_dedup_clipped.bam", "", sample)) ref.sex <- beagle.order %>% left_join( read_excel("../pcod-juveniles-2023/lcWGS/references/20230414_pcod_named.xlsx", na = c("---")) %>% clean_names() %>% dplyr::select(ablg, sex, marine_region) %>% mutate(sample=paste0("ABLG", ablg)), by="sample") %>% filter(!is.na(sex)) %>% mutate(sex=as.factor(sex)) ref.sex %>% group_by(marine_region, sex) %>% tally() %>% kable(format="markdown") ref.sex %>% group_by(sex) %>% tally() %>% knitr::kable(format="markdown") ref.sex %>% select(sample) %>% write_delim(file="fish-ids.txt", col_names = F) ref.sex %>% select(sex) %>% mutate(sex=as.numeric(sex=="female")) %>% write_delim(file="fish-sex.txt", col_names = F) ref.sex %>% select(marine_region) %>% mutate(marine_region=as.numeric(as.factor(marine_region))) %>% write_delim(file="fish-region.txt", col_names = F) # Load beagle that contains sexed fish, NOT Imputed probabilities sex.beagle<-read.table(gzfile("pcod-sex.beagle.gz"), header = T, sep="\t") ``` ### Read in LRT results ```{r} #lrt<-read.table(gzfile("gwas-sex-imputed.out.lrt0.gz"), header=T, sep="\t") lrt<-read.table(gzfile("gwas-sex.out.lrt0.gz"), header=T, sep="\t") #we have a few LRT values that are -999, we should remove them. length(lrt$LRT) # number of loci length(which(lrt$LRT == -999)) #number that will be removed length(which(lrt$LRT != -999)) #remove the values that are not -999 and that are negative lrt_filt<-lrt[-c(which(lrt$LRT == -999),which(lrt$LRT <= 0)),] #add snp "name" from rownumber lrt_filt$SNP<-paste("r",1:length(lrt_filt$Chromosome), sep="") # get pvalues from chisq lrt_filt$pvalue<-pchisq(lrt_filt$LRT, df=1, lower=F) hist(lrt_filt$LRT, breaks=50) hist(lrt_filt$pvalue, breaks=50) qqnorm(lrt_filt$pvalue) # #we also need to make sure we don't have any tricky values like those below # lrt_filt<-lrt_filt[-c(which(lrt_filt$pvalue == "NaN" ), # which(lrt_filt$pvalue == "Inf"), # which(lrt_filt$LRT == "Inf")),] manhattan(lrt_filt %>% mutate(chr=as.numeric(as.factor(Chromosome))), chr="chr", bp="Position", p="pvalue", main="Sex-association, GWAS") lrt_filt %>% mutate(logp=-log10(pvalue)) %>% filter(logp>4) %>% arrange(desc(LRT)) #lrt_filt %>% mutate(fdr_pvalue=p.adjust(pvalue, method="fdr")) %>% filter(fdr_pvalue<0.05) %>% arrange(desc(LRT)) lrt_filt %>% mutate(logp=-log10(pvalue)) %>% filter(logp>4) %>% arrange(desc(LRT)) %>% unite("marker", Chromosome:Position, sep="_") %>% select(marker)%>% write_delim(file="sex-markers-logp4.txt", col_names = F) ``` ### Where are putative sex markers in relation to annotated genes? ```{r} # Prepare gene annotation ranges genes4gwas <- pcod.gtf %>% mutate(ncbi_id=paste0("GeneID:", ncbi_id)) %>% filter(feature=="gene") %>% mutate(start_flank=start-2000) %>% mutate(start_flank=as.integer(case_when(start_flank<0~0, TRUE~start_flank))) #%>% gene_ranges <- GRanges( seqnames =genes4gwas$seqname, ranges=IRanges( start=genes4gwas$start_flank, # start = genes4gwas$start, end=genes4gwas$end), ncbi_id=genes4gwas$ncbi_id, gene_id=genes4gwas$gene_id) # Prepare putative sex marker sites locations sex.sites <- lrt_filt %>% mutate(logp=-log10(pvalue)) %>% filter(logp>4) %>% mutate(marker=paste0(Chromosome, "_", Position)) snp_ranges <- GRanges( seqnames = sex.sites$Chromosome, ranges=IRanges( start = sex.sites$Position, end = sex.sites$Position), pvalue=sex.sites$pvalue, Major=sex.sites$Major, Minor=sex.sites$Minor) # Find overlaps between gene flanks and SNP sites overlaps <- findOverlaps(gene_ranges, snp_ranges) # Extract matching rows overlapping_genes <- genes4gwas[queryHits(overlaps), ] overlapping_snps <- sex.sites[subjectHits(overlaps), ] # Combine results into a single data frame result <- cbind(overlapping_genes, overlapping_snps) %>% mutate(feature=case_when(Position>=start~"gene", Position% dplyr::select(-c(score, frame, attributes,biotype, exon)) %>% # dplyr::select(feature, seqname, strand, start_flank, start, end, Position, gene_id, ncbi_id, gene_id) %>% left_join(pcod.blast.GO[c("ncbi_id", "protein_names", "spid")] %>% mutate(ncbi_id=paste0("GeneID:", ncbi_id)), by="ncbi_id") # what percent of SNPs are located inside result %>% dplyr::select(-Position) %>% distinct() %>% group_by(feature) %>% tally() %>% mutate(total=nrow(genes4gwas)) %>% mutate(perc=round(100*n/total, 2)) # require(clipr) # (result %>% select(Chromosome, Position, spid) %>% distinct() %>% filter(!is.na(spid)))$spid %>% write_clip() #sex-associated genes # (pcod.blast.GO %>% filter(!is.na(spid)))$spid %>% write_clip() # result %>% arrange(desc(LRT)) %>% select(Chromosome, Position, Major, Minor, LRT, SE, pvalue, gene_id, protein_names) %>% # mutate(across(c(LRT:pvalue), ~ signif(.x, 3))) %>% write_clip() ``` ```{r, message=F, warning=F, error=F} ref.beagle.sex.sites <- sex.beagle %>% filter(marker %in% sex.sites$marker) %>% pivot_longer(ABLG1968_AA:ABLG2689_BB, names_to = "individual.allele", values_to = "probability") %>% separate(individual.allele, into=c("individual", "allele"), sep = "_") %>% mutate(allele_base=case_when( allele=="AA" ~ paste0(allele1, "/", allele1), allele=="AB" ~ paste0(allele1, "/", allele2), allele=="BB" ~ paste0(allele2, "/", allele2))) %>% mutate(allele_base = allele_base %>% str_replace_all("0", "A") %>% str_replace_all("1", "C") %>% str_replace_all("2", "G") %>% str_replace_all("3", "T")) %>% mutate(probability=round(as.numeric(probability), digits=4)) %>% group_by(marker, individual) %>% filter(!(n() == 3 & all(probability == 0.3333))) %>% slice_max(probability,n=1) %>% left_join(ref.sex, by=c("individual"="sample")) %>% mutate(allele_base_num=case_when( allele=="AA"~ 1, allele=="AB"~0.5, allele=="BB"~0)) sex.markers <- (sex.sites %>% arrange(desc(LRT)))$marker for (i in 1:10) { meta <- ref.beagle.sex.sites %>% filter(marker==sex.markers[i]) %>% ungroup() %>% select(marker) %>% distinct() %>% left_join(result %>% mutate(marker=paste0(Chromosome, "_", Position))) print(ref.beagle.sex.sites %>% filter(marker==sex.markers[i]) %>% # filter(!is.na(allele_base)) %>% droplevels() %>% ggplot() + geom_point(aes(x=sex, y=allele_base_num, color=sex), position=position_jitter(w = 0.25,h = 0.1)) + theme_minimal() + facet_wrap(~marine_region, scales="free_y", nrow = 1) + scale_color_manual(name="Sex", values=c("male"="#2c7bb6", "female"="#d7191c")) + #scale_color_manual(name=NULL, values=c("NBS"="blue2", "Aleutians"="orange2", "eGOA"="gray25", "wGOA"="green4", "EBS"="purple")) + ggtitle(paste0("Marker: ", meta$marker, "\n Gene ID: ", meta$gene_id, "\n Protein: ", meta$protein_names))) } ``` ### Use PCA to look for sex-associated structure ```{r} # Make dataframe with sample IDs and sex go generate sex-specific allele frequencies ref.sex %>% select(sample, sex) %>% write_delim(file="fish-ids-sex.txt", col_names = F) ``` ```{r} # Read in covariance matrix and add sample IDs sex.cov <- read_delim(file="pcod-sex.cov", col_names = ref.sex$sample) %>% as.matrix() %>% `rownames<-`(ref.sex$sample) # Run PCA pca.sex <- prcomp(sex.cov, scale=F) #scale=F for variance-covariance matrix #pca.eigenval(pca.princomp) #The Proporation of Variance = %variance pc.percent <- pca.eigenval(pca.sex)[2,1:6]*100 #PC % for axes 1-6 screeplot(pca.sex, bstick=FALSE) #inspect scree plot, which axes influential? pc.percent[1:2] %>% sum() # total percent explained by PCs 1 & 2 #### Generate dataframe with prcomp results pc.scores.sex <- data.frame(sample.id = colnames(sex.cov), PC1 = pca.sex$rotation[,1], # the first eigenvector PC2 = pca.sex$rotation[,2], # the second eigenvector PC3 = pca.sex$rotation[,3], # the third eigenvector PC4 = pca.sex$rotation[,4], # the fourth eigenvector PC5 = pca.sex$rotation[,5], # the fourth eigenvector PC6 = pca.sex$rotation[,6], # the fourth eigenvector stringsAsFactors = FALSE) # Add metadata pc.scores.sex <- left_join(pc.scores.sex, ref.sex,by=c("sample.id"="sample")) axes <- data.frame("pc.x"=c(1,1,2), "pc.y"=c(2,3,3)) %>% mutate(pc.x=paste("PC", pc.x, sep=""), pc.y=paste("PC", pc.y, sep="")) # Variance explained by each PC axis variance <- pc.percent %>% as.data.frame() %>% set_names("variance") %>% rownames_to_column("axis") %>% filter(axis %in% c("PC1", "PC2", "PC3")) #%>% # Plot PC biplots for axes 1-3 #ggplotly( ggscatter(pc.scores.sex, group=c("sex"), col="sex", text="sample.id", x=axes[1,"pc.x"], y=axes[1,"pc.y"], size=1.5, alpha=0.85, ellipse = T, star.plot = F) + theme_minimal() + ggtitle("Global gene expression PC1xPC2") + ylab(paste(axes[1, "pc.y"], " (", round(variance[variance$axis==axes[1, "pc.y"], "variance"], digits = 2), "%)", sep="")) + xlab(paste(axes[1, "pc.x"], " (", round(variance[variance$axis==axes[1, "pc.x"], "variance"], digits = 2), "%)", sep="")) + theme(legend.position = "bottom", legend.text=element_text(size=8), legend.title=element_text(size=9)) + scale_color_manual(name="Sex", values=c("male"="#2c7bb6", "female"="#d7191c")) + scale_fill_manual(guide=F, values=c("male"="#2c7bb6", "female"="#d7191c")) + ggtitle(paste("PCA using putative sex markers in Pacific cod\n", axes[1, "pc.y"], "x", axes[1, "pc.x"], sep=" ")) + facet_wrap(~marine_region) #, # hoverinfo = list("sex", "sample.id", "marine_region"), tooltip = list("treatment", "sample.id", "marine_region")) #ggplotly( ggscatter(pc.scores.sex, group=c("sex"), col="sex", text="sample.id", x=axes[2,"pc.x"], y=axes[2,"pc.y"], size=1.5, alpha=0.85, ellipse = T, star.plot = F) + theme_minimal() + ggtitle("Global gene expression PC1xPC3") + ylab(paste(axes[2, "pc.y"], " (", round(variance[variance$axis==axes[2, "pc.y"], "variance"], digits = 2), "%)", sep="")) + xlab(paste(axes[2, "pc.x"], " (", round(variance[variance$axis==axes[2, "pc.x"], "variance"], digits = 2), "%)", sep="")) + theme(legend.position = "bottom", legend.text=element_text(size=8), legend.title=element_text(size=9)) + scale_color_manual(name="Sex", values=c("male"="#2c7bb6", "female"="#d7191c")) + scale_fill_manual(guide=F, values=c("male"="#2c7bb6", "female"="#d7191c")) + ggtitle(paste(axes[2, "pc.y"], "x", axes[2, "pc.x"], sep=" "))+ facet_wrap(~marine_region) #, # hoverinfo = list("sex", "sample.id", "marine_region"), tooltip = list("treatment", "sample.id", "marine_region")) ``` ### WGSassign? ```{r} ``` ### Any sex-associated microhaplotype on Chromosome 11 (NC_082392.1) ```{r} result %>% arrange(desc(LRT)) ref.beagle.sex.sites %>% filter(str_detect(marker, "NC_082392.1")) %>% separate(marker, into=c("a","b","position"), remove = F, sep = "_") %>% select(-a, -b) %>% ungroup() %>% mutate(position=paste0("pos.", position)) %>% dplyr::select(individual,sex,marine_region,position,allele_base) %>% pivot_wider(names_from = position, values_from = allele_base) %>% mutate(across(everything(), ~ifelse(is.na(.), "NA", .))) %>% unite("microhap", pos.13871084,pos.13891711, sep=".") %>% filter(!str_detect(microhap,"NA")) %>% mutate(microhap=as.factor(microhap), sex=as.factor(sex)) %>% ggplot() + geom_bar(aes(x = sex, y = microhap, fill=microhap), position="stack", stat="identity", color=NA) + theme_minimal() + facet_wrap(~marine_region) ```