MiSeq-MethylKit-genome2.0

Load libraries

list.of.packages <- c("tidyverse", "reshape2", "here", "methylKit", "ggforce", "matrixStats", "Pstat", "viridis", "plotly", "readr", "GenomicRanges", "vegan", "factoextra", "PerformanceAnalytics", "corrplot", "janitor") #add new libraries here 

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

Obtain session information

sessionInfo()

## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.15.7
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] janitor_2.0.1              corrplot_0.84             
##  [3] PerformanceAnalytics_2.0.4 xts_0.12-0                
##  [5] zoo_1.8-8                  factoextra_1.0.7          
##  [7] vegan_2.5-6                lattice_0.20-41           
##  [9] permute_0.9-5              plotly_4.9.2.1            
## [11] viridis_0.5.1              viridisLite_0.3.0         
## [13] Pstat_1.2                  matrixStats_0.56.0        
## [15] ggforce_0.3.1              methylKit_1.8.1           
## [17] GenomicRanges_1.34.0       GenomeInfoDb_1.18.2       
## [19] IRanges_2.16.0             S4Vectors_0.20.1          
## [21] BiocGenerics_0.28.0        here_0.1                  
## [23] reshape2_1.4.4             forcats_0.5.0             
## [25] stringr_1.4.0              dplyr_0.8.5               
## [27] purrr_0.3.4                readr_1.3.1               
## [29] tidyr_1.0.3                tibble_3.0.1              
## [31] ggplot2_3.3.0              tidyverse_1.3.0           
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1            ellipsis_0.3.0             
##  [3] mclust_5.4.5                rprojroot_1.3-2            
##  [5] snakecase_0.11.0            qvalue_2.14.1              
##  [7] XVector_0.22.0              fs_1.4.1                   
##  [9] rstudioapi_0.11             farver_2.0.3               
## [11] ggrepel_0.8.2               fansi_0.4.1                
## [13] mvtnorm_1.1-0               lubridate_1.7.8            
## [15] xml2_1.3.2                  splines_3.5.1              
## [17] R.methodsS3_1.8.0           knitr_1.28                 
## [19] polyclip_1.10-0             jsonlite_1.6.1             
## [21] Rsamtools_1.34.1            broom_0.5.6                
## [23] cluster_2.1.0               dbplyr_1.4.3               
## [25] R.oo_1.23.0                 compiler_3.5.1             
## [27] httr_1.4.1                  backports_1.1.6            
## [29] lazyeval_0.2.2              assertthat_0.2.1           
## [31] Matrix_1.2-18               limma_3.38.3               
## [33] cli_2.0.2                   tweenr_1.0.1               
## [35] htmltools_0.4.0             tools_3.5.1                
## [37] coda_0.19-3                 gtable_0.3.0               
## [39] glue_1.4.0                  GenomeInfoDbData_1.2.0     
## [41] Rcpp_1.0.4.6                bbmle_1.0.23.1             
## [43] Biobase_2.42.0              cellranger_1.1.0           
## [45] vctrs_0.3.0                 Biostrings_2.50.2          
## [47] nlme_3.1-143                rtracklayer_1.42.2         
## [49] xfun_0.13                   fastseg_1.28.0             
## [51] rvest_0.3.5                 lifecycle_0.2.0            
## [53] gtools_3.8.2                XML_3.99-0.3               
## [55] zlibbioc_1.28.0             MASS_7.3-51.6              
## [57] scales_1.1.1                hms_0.5.3                  
## [59] SummarizedExperiment_1.12.0 yaml_2.2.1                 
## [61] gridExtra_2.3               emdbook_1.3.12             
## [63] bdsmatrix_1.3-4             stringi_1.4.6              
## [65] BiocParallel_1.16.6         rlang_0.4.6                
## [67] pkgconfig_2.0.3             bitops_1.0-6               
## [69] evaluate_0.14               htmlwidgets_1.5.1          
## [71] GenomicAlignments_1.18.1    tidyselect_1.1.0           
## [73] plyr_1.8.6                  magrittr_1.5               
## [75] R6_2.4.1                    generics_0.0.2             
## [77] DelayedArray_0.8.0          DBI_1.1.0                  
## [79] mgcv_1.8-31                 pillar_1.4.4               
## [81] haven_2.2.0                 withr_2.2.0                
## [83] RCurl_1.98-1.2              modelr_0.1.7               
## [85] crayon_1.3.4                rmarkdown_2.1              
## [87] grid_3.5.1                  readxl_1.3.1               
## [89] data.table_1.12.8           reprex_0.3.0               
## [91] digest_0.6.25               numDeriv_2016.8-1.1        
## [93] R.utils_2.9.2               munsell_0.5.0              
## [95] quadprog_1.5-8

Check current directory

getwd()

## [1] "/Volumes/Bumblebee/C.magister_methyl-oa/notebooks"

Create list of all bismark data files, which have reads that are already trimmed and aligned. These BAM files are also sorted and indexed.

IMPORTANT NOTE: The location of these files depends on where the user saved them. I (Laura) used an external hard drive.

These files are from the original analysis using the old genome, and were transferred from Hyak on December 27th, 2020 using rsync.

# file.list=list("/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-06_S1_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-14_S2_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-22_S3_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-38_S4_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-04_S5_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-15_S6_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-33_S7_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-01_S8_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-06_S9_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-21_S10_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-24_S11_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-06_S12_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-11_S13_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-24_S14_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-02_S15_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-13_S16_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-28_S17_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-01_S18_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-08_S19_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
#                "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-11_S20_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam")

These files are from the re-run using the new genome, and transferred from Hyak on February 3rd, 2021 using rsync.

file.list=list("/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH01-06_S1_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH01-14_S2_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH01-22_S3_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH01-38_S4_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH03-04_S5_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH03-15_S6_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH03-33_S7_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH05-01_S8_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH05-06_S9_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH05-21_S10_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH05-24_S11_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH07-06_S12_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH07-11_S13_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH07-24_S14_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH09-02_S15_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH09-13_S16_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH09-28_S17_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH10-01_S18_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH10-08_S19_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark-genome2.0/CH10-11_S20_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam")

Read in MiSeq bismark summary with sample treatment info

summary <- read_delim(file="../qc-processing/MiSeq-QC/mbdall.txt", delim="\t") %>%
  arrange(sample_mbd) %>% filter(sample_mbd != "NA")

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   sample = col_character(),
##   Low_CpGmeth = col_character(),
##   treatment = col_character(),
##   treatment_high_lowCO2 = col_character(),
##   developmental_stage = col_character(),
##   notes = col_character(),
##   DNAiso_batch = col_character(),
##   `pool for library?` = col_character()
## )

## See spec(...) for full column specifications.

head(summary)

## # A tibble: 6 x 30
##   sample sample_mbd Low_CpGmeth  tank treatment treatment_high_…
##   <chr>       <dbl> <chr>       <dbl> <chr>     <chr>           
## 1 CH01-…          1 no              3 LC        L               
## 2 CH01-…          2 no              3 LC        L               
## 3 CH01-…          3 no              1 LB        L               
## 4 CH01-…          4 no              1 LB        L               
## 5 CH03-…          5 yes             1 LB        L               
## 6 CH03-…          6 yes             1 LB        L               
## # … with 24 more variables: developmental_stage <chr>, notes <chr>,
## #   DNAiso_batch <chr>, conc_ng.uL <dbl>, yield_ug <dbl>,
## #   MBD_concentration_ng.uL <dbl>, Total_recovery_ng <dbl>,
## #   Percent_recovery <dbl>, MBD_date <dbl>,
## #   library_bioanlyzer_mean_fragment_size_bp <dbl>,
## #   library_bioanalyzer_molarity_pmol.L <dbl>, `pool for library?` <chr>,
## #   qubit_concentration_ng.uL <dbl>, qubit_molarity_nM <dbl>,
## #   library_4nM_uL <dbl>, Total_Reads <dbl>, Aligned_Reads <dbl>,
## #   Unaligned_Reads <dbl>, Ambiguously_Aligned_Reads <dbl>, Unique_reads <dbl>,
## #   perc_totalread_unique <dbl>, CpGs_Meth <dbl>, CHGs_Meth <dbl>,
## #   CHHs_Meth <dbl>

Read in Bismark summary stats from the genome2.0 alignment and merge with library prep stats

summary2.0 <- read_delim(file="../qc-processing/MiSeq-QC/mbd_key.txt", delim="\t") %>% clean_names() %>%
  right_join(
    read_delim(file="../qc-processing/MiSeq-QC/bismark-genome2.0/bismark_summary_report.txt", delim="\t") %>% mutate(File = str_replace_all(File, '_.*', "")), 
    by=c("sample"="File")) %>%
  rename(Total_Reads=`Total Reads`, Aligned_Reads=`Aligned Reads`, Unaligned_Reads=`Unaligned Reads`, 
         Ambiguously_Aligned_Reads=`Ambiguously Aligned Reads`, 
         Unique_reads=`Unique Reads (remaining)`, Low_CpGmeth=low_cp_gmeth) %>%
  mutate(perc_totalread_unique=Unique_reads/Total_Reads, 
         CpGs_Meth=100*`Methylated CpGs`/(`Methylated CpGs`+`Unmethylated CpGs`), 
         CHGs_Meth=100*`Methylated chgs`/(`Methylated chgs`+`Unmethylated chgs`), 
         CHHs_Meth=100*`Methylated CHHs`/(`Methylated CHHs`+`Unmethylated CHHs`))

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   sample = col_character(),
##   Low_CpGmeth = col_character(),
##   treatment = col_character(),
##   treatment_high_lowCO2 = col_character(),
##   developmental_stage = col_character(),
##   notes = col_character(),
##   DNAiso_batch = col_character(),
##   `pool for library?` = col_character()
## )

## See spec(...) for full column specifications.

## Parsed with column specification:
## cols(
##   File = col_character(),
##   `Total Reads` = col_double(),
##   `Aligned Reads` = col_double(),
##   `Unaligned Reads` = col_double(),
##   `Ambiguously Aligned Reads` = col_double(),
##   `No Genomic Sequence` = col_double(),
##   `Duplicate Reads (removed)` = col_double(),
##   `Unique Reads (remaining)` = col_double(),
##   `Total Cs` = col_double(),
##   `Methylated CpGs` = col_double(),
##   `Unmethylated CpGs` = col_double(),
##   `Methylated chgs` = col_double(),
##   `Unmethylated chgs` = col_double(),
##   `Methylated CHHs` = col_double(),
##   `Unmethylated CHHs` = col_double()
## )

investigate possible factors influencing low %CpGmeth

cor(summary2.0 %>% dplyr::select_if(is.numeric)) %>% corrplot(tl.cex=.45)

#### Look closer at factors that correlate with CpGs_Meth (dark(ish) blue dots)

chart.Correlation(summary2.0 %>% 
                    select(CpGs_Meth, library_bioanlyzer_mean_fragment_size_bp, qubit_concentration_ng_u_l,
                           Total_Reads, Aligned_Reads, perc_totalread_unique, CHHs_Meth), 
                  histogram=F, pch=19)

#### Plot meth ~ fragment size (bioanalyzer)

ggplot(summary2.0, 
                aes(x=library_bioanlyzer_mean_fragment_size_bp, y=CpGs_Meth, 
                    label=sample_mbd, color=Low_CpGmeth)) + 
           geom_point(size=3) + scale_color_manual(values=c("black", "red")) +
           geom_text(aes(label=sample_mbd), vjust = -0.8)

Plot meth ~ Total Read

ggplot(summary2.0, 
                aes(x=Total_Reads, y=CpGs_Meth, 
                    label=sample_mbd, color=Low_CpGmeth)) + 
           geom_point(size=3) + scale_color_manual(values=c("black", "red")) +
           geom_text(aes(label=sample_mbd), vjust = -0.8)

Plot meth ~ Aligned Reads

ggplot(summary2.0, 
                aes(x=Aligned_Reads, y=CpGs_Meth, 
                    label=sample_mbd, color=Low_CpGmeth)) + 
           geom_point(size=3) + scale_color_manual(values=c("black", "red")) +
           geom_text(aes(label=sample_mbd), vjust = -0.8)

Plot meth ~ total reads that are unique

ggplot(summary2.0, 
                aes(x=perc_totalread_unique, y=CpGs_Meth, 
                    label=sample_mbd, color=Low_CpGmeth)) + 
           geom_point(size=3) + scale_color_manual(values=c("black", "red")) +
           geom_text(aes(label=sample_mbd), vjust = -0.8)

Plot meth ~ CHH methylation

ggplot(summary2.0, 
                aes(x=CHHs_Meth, y=CpGs_Meth, 
                    label=sample_mbd, color=Low_CpGmeth)) + 
           geom_point(size=3) + scale_color_manual(values=c("black", "red")) +
           geom_text(aes(label=sample_mbd), vjust = -0.8)

#### The following command reads sorted BAM files and calls methylation percentage per base, and creates a methylRaw object for CpG methylation. It also assigns minimum coverage of 2x and the treatments (in this case, the CO2 treatment)

myobj_MiSeq = processBismarkAln(location = file.list, sample.id = list("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18", "19", "20"), assembly = "pilon_scaffolds.fasta", read.context="CpG", mincov=2, treatment = as.numeric(as.factor(summary2.0$treatment_high_low_co2)))

Save the MethylKit object; re-doing the previous step is memory/time intensive, so best to use the saved object moving forward.

#getwd()
#save(myobj_MiSeq, file = "../qc-processing/MiSeq-QC/myobj_MiSeq")

Load object if needed

load("../qc-processing/MiSeq-QC/myobj_MiSeq")

What does the resulting object look like?

# Check out data for sample #1 
head(myobj_MiSeq[[1]])

##               chr start   end strand coverage numCs numTs
## 1  contig_1_pilon 24401 24401      -        2     0     2
## 2  contig_1_pilon 24409 24409      -        2     0     2
## 3  contig_1_pilon 24419 24419      -        2     0     2
## 4 contig_10_pilon 34002 34002      +        2     0     2
## 5 contig_10_pilon 51920 51920      -        2     0     2
## 6 contig_10_pilon 55069 55069      +        2     0     2

Check the basic stats about the methylation data - coverage and percent methylation. Index the object to look through each sample

# First look at % CpG methylation (panels)
for (i in 1:20) {
getMethylationStats(myobj_MiSeq[[i]],plot=T,both.strands=TRUE)
     mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

# Now look at coverage 
for (i in 1:20) {
 getCoverageStats(myobj_MiSeq[[i]],plot=TRUE,both.strands=TRUE) 
   mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Column bind all samples, and destrand.

meth_unite=methylKit::unite(myobj_MiSeq, destrand=TRUE, min.per.group=1L)
#save(meth_unite, file = "../qc-processing/MiSeq-QC/meth_unite") #save object to file

Reshape data to do various filtering and calculations

meth_unite_reshaped <- melt(data=meth_unite, id=c("chr", "start", "end", "strand"), value.name = "count") %>%
 tidyr::separate(col = "variable" , into = c("variable", "sample"), sep = "(?<=[a-zA-Z])\\s*(?=[0-9])") %>%
  dcast(chr+start+end+strand+sample ~  variable) %>%
  drop_na(coverage) %>% 
  mutate(percMeth = 100*(numCs/coverage)) %>% 
  mutate(sample=as.numeric(sample))

#save(meth_unite_reshaped, file = "../qc-processing/MiSeq-QC/meth_unite_reshaped")

# Load reshaped object if needed
load(file = "../qc-processing/MiSeq-QC/meth_unite_reshaped") 
head(meth_unite_reshaped)

##                 chr start   end strand sample coverage numCs numTs percMeth
## 1 contig_1003_pilon 25122 25122      +      1       10    10     0      100
## 2 contig_1003_pilon 25143 25143      +      1        7     7     0      100
## 3 contig_1003_pilon 25146 25146      +      1        3     3     0      100
## 4 contig_1003_pilon 25162 25162      +      1        5     5     0      100
## 5 contig_1003_pilon 25166 25166      +      1        5     3     2       60
## 6 contig_1003_pilon 25186 25186      +      1        7     7     0      100

No coverage threshold, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  group_by(sample) %>%
  summarize(mean = mean(percMeth), nloci = n())

## # A tibble: 20 x 3
##    sample  mean  nloci
##     <dbl> <dbl>  <int>
##  1      1  77.9 307273
##  2      2  72.6 372596
##  3      3  78.3 299331
##  4      4  72.7 333387
##  5      5  24.1  51783
##  6      6  21.6  60153
##  7      7  74.9 431389
##  8      8  65.5 139884
##  9      9  71.0 337914
## 10     10  74.8 414587
## 11     11  56.7 206273
## 12     12  76.0 280847
## 13     13  72.7 491751
## 14     14  72.3 258637
## 15     15  74.8 416299
## 16     16  35.2 132251
## 17     17  48.9  64462
## 18     18  77.0 347761
## 19     19  68.5 280665
## 20     20  17.8  31092

5x coverage, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  filter(coverage>=5) %>% 
  group_by(sample) %>%
  summarize(mean = mean(percMeth,), nloci = n())

## # A tibble: 20 x 3
##    sample  mean  nloci
##     <dbl> <dbl>  <int>
##  1      1 83.5  150500
##  2      2 78.6  165889
##  3      3 83.7  121088
##  4      4 80.9  158257
##  5      5  2.41   2773
##  6      6  2.18   3054
##  7      7 81.5  224040
##  8      8 69.3   10767
##  9      9 80.6  158567
## 10     10 80.2  240812
## 11     11 59.6   16414
## 12     12 81.4   97256
## 13     13 79.4  288343
## 14     14 78.8  113501
## 15     15 79.9  190444
## 16     16  9.69   6078
## 17     17 61.0    5172
## 18     18 82.3  197277
## 19     19 76.4   53183
## 20     20  1.84   1418

10x coverage, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  filter(coverage>=10) %>% 
  group_by(sample) %>%
  summarize(mean = mean(percMeth), nloci = n())

## # A tibble: 20 x 3
##    sample  mean  nloci
##     <dbl> <dbl>  <int>
##  1      1 84.7   75634
##  2      2 80.1   48507
##  3      3 85.2   36303
##  4      4 82.7   78369
##  5      5  2.29    864
##  6      6  1.88    919
##  7      7 83.3   85681
##  8      8 54.1     417
##  9      9 83.9   82386
## 10     10 81.6  127130
## 11     11 21.0    1281
## 12     12 82.7   25360
## 13     13 81.1  132739
## 14     14 80.0   56056
## 15     15 81.4   66130
## 16     16  3.15   1908
## 17     17 51.5     846
## 18     18 83.5  113142
## 19     19 72.6    3528
## 20     20  1.38    366

What is mean % methylated with 15x coverage?

meth_unite_reshaped %>% 
  filter(coverage>=15) %>% 
  group_by(sample) %>%
  summarize(mean = mean(percMeth), n = n())

## # A tibble: 20 x 3
##    sample  mean     n
##     <dbl> <dbl> <int>
##  1      1 85.2  39878
##  2      2 80.4  11926
##  3      3 85.5   9520
##  4      4 83.5  41526
##  5      5  1.96   379
##  6      6  1.49   419
##  7      7 84.1  27937
##  8      8 36.3     74
##  9      9 85.3  52617
## 10     10 82.3  66801
## 11     11 14.0    465
## 12     12 83.0   5656
## 13     13 82.0  52841
## 14     14 80.6  30239
## 15     15 81.8  19507
## 16     16  2.64   910
## 17     17 29.8    214
## 18     18 84.3  68228
## 19     19 54.9    336
## 20     20  1.03   154

Is % methylation a function of coverage within a sample?

The following samples have low CpG methylation: 5, 6, 9, 13, 14, kind of 19. Check out each sample individually

meth_unite_reshaped %>% 
  ggplot() + geom_point(aes(x=coverage, y=percMeth)) +
  facet_wrap(~sample)

Generate CpG methylation histograms after filtering for coverage

Filter out loci where coverage is less than 5x or greater than 100x.

MiSeq_5x=filterByCoverage(myobj_MiSeq,lo.count=5,lo.perc=NULL,
                                      hi.count=100,hi.perc=NULL)
save(MiSeq_5x, file="../qc-processing/MiSeq-QC/MiSeq_5x")

Now look at CpG methylation after filtering for 5x coverage

for (i in 1:20) {
getMethylationStats(MiSeq_5x[[i]],plot=T,both.strands=TRUE)
  mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Filter out loci where coverage is less than 10x or greater than 100x.

MiSeq_10x=filterByCoverage(myobj_MiSeq,lo.count=10,lo.perc=NULL,
                                      hi.count=100,hi.perc=NULL)
save(MiSeq_10x, file="../qc-processing/MiSeq-QC/MiSeq_10x")

Now look at CpG methylation after filtering for 10x coverage

for (i in 1:20) {
getMethylationStats(MiSeq_10x[[i]],plot=T,both.strands=TRUE)
  mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Out of interest, do cluster analysis

clusterSamples(meth_unite, dist="correlation", method="ward", plot=TRUE)

## The "ward" method has been renamed to "ward.D"; note new "ward.D2"

## 
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
## 
## Cluster method   : ward.D 
## Distance         : pearson 
## Number of objects: 20

PCASamples(meth_unite)