Chapter 13 Gallery of reproducible examples
13.1 Visualizing pairwise nucleotide sequence distance with phylogenetic tree
This example reproduces Fig. 1 of (Chen et al. 2017). It extracts accession numbers from tip labels of the HPV58 tree and calculates pairwise nucleotide sequence distances. The distance matrix is visualized as dot and line plots. This example demonstrates the abilities of adding multiple layers to a specific panel. As illustrated on Figure 13.1, the facet_plot
function displays sequence distances as a dot plot and then adds a layer of line plot to the same panel, i.e. Sequence Distance. In addition, the tree in facet_plot
can be fully annotated with multiple layers (clade labels, bootstrap support values, etc). The source code of this example was firstly published in Supplemental File of (Yu et al. 2018).
library(tibble)
library(tidyr)
library(Biostrings)
library(treeio)
library(ggplot2)
library(ggtree)
read.tree("data/HPV58.tree")
tree <-
c(A3 = 92, A1 = 94, A2 = 108, B1 = 156, B2 = 159, C = 163, D1 = 173, D2 = 176)
clade <- groupClade(tree, clade)
tree <- c(A1 = "#EC762F", A2 = "#CA6629", A3 = "#894418", B1 = "#0923FA",
cols <-B2 = "#020D87", C = "#000000", D1 = "#9ACD32",D2 = "#08630A")
## visualize the tree with tip labels and tree scale
ggtree(tree, aes(color = group), ladderize = FALSE) %>% rotate(rootnode(tree)) +
p <- geom_tiplab(aes(label = paste0("italic('", label, "')")), parse = TRUE, size = 2.5) +
geom_treescale(x = 0, y = 1, width = 0.002) +
scale_color_manual(values = c(cols, "black"), na.value = "black", name = "Lineage",
breaks = c("A1", "A2", "A3", "B1", "B2", "C", "D1", "D2")) +
guides(color = guide_legend(override.aes = list(size = 5, shape = 15))) +
theme_tree2(legend.position = c(.1, .88))
## Optional
## add labels for monophyletic (A, C and D) and paraphyletic (B) groups
p + geom_cladelabel(94, "italic(A1)", color = cols[["A1"]], offset = .003, align = TRUE,
p <-offset.text = -.001, barsize = 1.2, extend = c(0, 0.5), parse = TRUE) +
geom_cladelabel(108, "italic(A2)", color = cols[["A2"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = 0.5, parse = TRUE) +
geom_cladelabel(131, "italic(A3)", color = cols[["A3"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = c(0.5, 0), parse = TRUE) +
geom_cladelabel(92, "italic(A)", color = "darkgrey", offset = .00315, align = TRUE,
offset.text = 0.0002, barsize = 2, fontsize = 5, parse = TRUE) +
geom_cladelabel(156, "italic(B1)", color = cols[["B1"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = c(0, 0.5), parse = TRUE) +
geom_cladelabel(159, "italic(B2)", color = cols[["B2"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = c(0.5, 0), parse = TRUE) +
geom_strip(65, 71, "italic(B)", color = "darkgrey", offset = 0.00315, align = TRUE,
offset.text = 0.0002, barsize = 2, fontsize = 5, parse = TRUE) +
geom_cladelabel(163, "italic(C)", color = "darkgrey", offset = .0031, align = TRUE,
offset.text = 0.0002, barsize = 3.2, fontsize = 5, parse = TRUE) +
geom_cladelabel(173, "italic(D1)", color = cols[["D1"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = c(0, 0.5), parse = TRUE) +
geom_cladelabel(176, "italic(D2)", color = cols[["D2"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = c(0.5, 0), parse = TRUE) +
geom_cladelabel(172, "italic(D)", color = "darkgrey", offset = .00315, align = TRUE,
offset.text = 0.0002, barsize = 2, fontsize = 5, parse = TRUE)
## Optional
## display support values
p + geom_nodelab(aes(subset = (node == 92), label = "*"),
p <-color = "black", nudge_x = -.001, nudge_y = 1) +
geom_nodelab(aes(subset = (node == 155), label = "*"),
color = "black", nudge_x = -.0003, nudge_y = -1) +
geom_nodelab(aes(subset = (node == 158), label = "95/92/1.00"),
color = "black", nudge_x = -0.0001, nudge_y = -1, hjust = 1) +
geom_nodelab(aes(subset = (node == 162), label = "98/97/1.00"),
color = "black", nudge_x = -0.0001, nudge_y = -1, hjust = 1) +
geom_nodelab(aes(subset = (node == 172), label = "*"),
color = "black", nudge_x = -.0003, nudge_y = -1)
## extract accession numbers from tip labels
tree$tip.label
tl <- sub("\\w+\\|", "", tl)
acc <-names(tl) <- acc
## read sequences from GenBank directly into R
## and convert the object to DNAStringSet
ape::read.GenBank(acc) %>% as.character %>%
tipseq <- lapply(., paste0, collapse = "") %>% unlist %>%
DNAStringSet
## align the sequences using muscle
muscle::muscle(tipseq)
tipseq_aln <- DNAStringSet(tipseq_aln) tipseq_aln <-
## calculate pairwise hamming distances among sequences
stringDist(tipseq_aln, method = "hamming")
tipseq_dist <-
## calculate percentage of differences
as.matrix(tipseq_dist) / width(tipseq_aln[1]) * 100
tipseq_d <-
## convert the matrix to tidy data frame for facet_plot
as_tibble(tipseq_d)
dd <-$seq1 <- rownames(tipseq_d)
dd gather(dd,seq2, dist, -seq1)
td <-$seq1 <- tl[td$seq1]
td$seq2 <- tl[td$seq2]
td
p$data$group
g <-names(g) <- p$data$label
$clade <- g[td$seq2]
td
## visualize the sequence differences using dot plot and line plot
## and align the sequence difference plot to the tree using facet_plot
facet_plot(p, panel = "Sequence Distance", data = td, geom_point,
p2 <-mapping = aes(x = dist, color = clade, shape = clade), alpha = .6) %>%
facet_plot(panel = "Sequence Distance", data = td, geom = geom_path,
mapping=aes(x = dist, group = seq2, color = clade), alpha = .6) +
scale_shape_manual(values = 1:8, guide = FALSE)
print(p2)
13.2 Displaying different symbolic points for bootstrap values.
We can cut the bootstrap values into several intervals, e.g. to indicate whether the clade is high, moderate or low support. Then we can use these intervals as categorical variable to set different color or shape of symbolic points to indicate the bootstrap values belong to which category.
library(treeio)
library(ggplot2)
library(ggtree)
read.newick("data/RMI.phy_phyml_tree_rooted_labeled.txt", node.label='support')
tree <- rootnode(tree)
root <-ggtree(tree, color="black", size=1.5, linetype=1, right=TRUE) +
geom_tiplab(size=4.5, hjust = -0.060, fontface="bold") + xlim(0, 0.09) +
geom_point2(aes(subset=!isTip & node != root,
fill=cut(support, c(0, 700, 900, 1000))),
shape=21, size=4) +
theme_tree(legend.position=c(0.2, 0.2)) +
scale_fill_manual(values=c("white", "grey", "black"), guide='legend',
name='Bootstrap Percentage(BP)',
breaks=c('(900,1e+03]', '(700,900]', '(0,700]'),
labels=expression(BP>=90,70 <= BP * " < 90", BP < 70))
13.3 Highlighting different groups.
This example reproduces Figure 1 of (Larsen et al. 2019). It used groupOTU
to add grouping information of chicken CTLDcps. The branch linetype and color are defined based on this grouping information. Two groups of CTLDcps are highlighted in different background colors using geom_hilight
(red for Group II and green for Group V). The avian-specific expansion of group V with subgroup of A and B- are labelled using geom_cladelabel
(Figure 13.3).
read.tree("data/Tree 30.4.19.nwk")
mytree <-
# Define nodes for coloring later on
mytree$tip.label
tiplab <- tiplab[grep("^ch", tiplab)]
cls <- groupOTU(mytree, cls)
labeltree <-
ggtree(labeltree, aes(color=group, linetype=group), layout="circular") +
p <- scale_color_manual(values = c("#efad29", "#63bbd4")) +
geom_nodepoint(color="black", size=0.1) +
geom_tiplab(size=2, color="black")
flip(p, 136, 110) %>%
p2 <- flip(141, 145) %>%
rotate(141) %>%
rotate(142) %>%
rotate(160) %>%
rotate(164) %>%
rotate(131)
### Group V and II coloring
p2 + geom_hilight(node = 110, fill = "#229f8a", alpha = 0.2, extend = 0.43) +
p3 <- geom_hilight(node = 88, fill = "#229f8a", alpha = 0.2, extend = 0.445) +
geom_hilight(node = 156, fill = "#229f8a", alpha = 0.2, extend = 0.35) +
geom_hilight(node = 136, fill = "#f9311f", alpha = 0.2, extend = 0.512)
### Putting on a label on the avian specific expansion
p3 + geom_cladelabel(node = 113, label = "Avian-specific expansion",
p4 <-align = TRUE, angle = -35, offset.text = 0.05,
hjust = "center", fontsize = 2, offset = 0.2, barsize = 0.2)
### Adding the bootstrap values with subset used to remove all bootstraps < 50
p4 + geom_text2(aes(label=label,
p5 <-subset = !is.na(as.numeric(label)) & as.numeric(label) > 50),
size = 2, color = "black",nudge_y = 0.7, nudge_x = -0.05)
### Putting labels on the subgroups
p5 + geom_cladelabel(node = 114, label = "Subgroup A", align = TRUE,
p6 <-angle = -55, offset.text = .03, hjust = "center",
offset = 0.05, barsize = 0.2, fontsize = 2) +
geom_cladelabel(node = 121, label = "Subgroup B", align = TRUE,
angle = -15, offset.text = .03, hjust = "center",
offset = 0.05, barsize = 0.2, fontsize = 2) +
theme(legend.position="none",
plot.margin=grid::unit(c(-15,-15,-15,-15), "mm"))
print(p6)
13.4 Phylogenetic tree with genome locus structure
The geom_motif
is defined in ggtree
and it is a wrapper layer of gggenes::geom_gene_arrow
. The geom_motif
can automatically adjust genomic alignment by selective gene (via the on
parameter) and can label genes via the label
parameter. In the following example, we use example_genes
dataset provided by gggenes. As the dataset only provide genomic coordinations of a set of genes, a phylogeny for the genomes need to be firstly constructed. We calculate jaccard similarity based on the ratio of overlapping genes among genomes and correspondingly determine genome distance. BioNJ algorithm was applied to construct the tree (Figure 13.4).
library(dplyr)
library(ggplot2)
library(gggenes)
library(ggtree)
function(data, genome) {
get_genes <-filter(data, molecule == genome) %>% pull(gene)
}
unique(example_genes[,1])
g <- length(g)
n <- matrix(nrow = n, ncol = n)
d <-rownames(d) <- colnames(d) <- g
lapply(g, get_genes, data = example_genes)
genes <-
for (i in 1:n) {
for (j in 1:i) {
length(intersect(genes[[i]], genes[[j]])) /
jaccard_sim <- length(union(genes[[i]], genes[[j]]))
d[i, j] <- 1 - jaccard_sim
d[j, i] <-
}
}
ape::bionj(d)
tree <-
ggtree(tree, branch.length='none') +
p <- geom_tiplab() + xlim_tree(5.5) +
geom_facet(mapping = aes(xmin = start, xmax = end, fill = gene),
data = example_genes, geom = geom_motif, panel = 'Alignment',
on = 'genE', label = 'gene', align = 'left') +
scale_fill_brewer(palette = "Set3") +
scale_x_continuous(expand=c(0,0)) +
theme(strip.text=element_blank(),
panel.spacing=unit(0, 'cm'))
facet_widths(p, widths=c(1,2))
References
Chen, Zigui, Wendy C. S. Ho, Siaw Shi Boon, Priscilla T. Y. Law, Martin C. W. Chan, Rob DeSalle, Robert D. Burk, and Paul K. S. Chan. 2017. “Ancient Evolution and Dispersion of Human Papillomavirus 58 Variants.” Journal of Virology 91 (21): e01285–17. https://doi.org/10.1128/JVI.01285-17.
Larsen, Frederik T., Bertrand Bed’Hom, Bernt Guldbrandtsen, and Tina S. Dalgaard. 2019. “Identification and Tissue-Expression Profiling of Novel Chicken c-Type Lectin-Like Domain Containing Proteins as Potential Targets for Carbohydrate-Based Vaccine Strategies.” Molecular Immunology 114 (October): 216–25. https://doi.org/10.1016/j.molimm.2019.07.022.
Yu, Guangchuang, Tommy Tsan-Yuk Lam, Huachen Zhu, and Yi Guan. 2018. “Two Methods for Mapping and Visualizing Associated Data on Phylogeny Using Ggtree.” Molecular Biology and Evolution 35 (12): 3041–3. https://doi.org/10.1093/molbev/msy194.