## Plotly plot of a rectangular phylogenetic tree ##

In this Jupyter Notebook we illustrate how to generate the Plotly plot of a phylogram with rectangular layout.

In [1]:
from Bio import Phylo
import pandas as pd

In [2]:
from plotly.offline import download_plotlyjs, init_notebook_mode,  iplot
# Initialise Plotly's offline notebook mode before any figure is shown with iplot.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather than
# embedding it in the notebook -- confirm against the Plotly documentation.
init_notebook_mode(connected=True)

Read the Zika virus tree file in `newick` format, downloaded from [nextstrain](http://nextstrain.org/zika):

In [3]:
# Parse the Newick file with Bio.Phylo; the path is relative to the notebook's working directory.
tree = Phylo.read('Data/nextstrain_zika_tree.new', "newick")

The next cells contain commented lines that were typed to inspect the file structure:

In [4]:
#print(tree)

In [5]:
#dir(tree)
#tree.get_terminals()

In [6]:
#tree.count_terminals()# Counts the number of terminal (leaf) nodes within this tree.

The functions `get_x_coordinates()` and `get_y_coordinates()` are adapted from the functions of the same name
in Biopython: [https://github.com/biopython/biopython/blob/master/Bio/Phylo/_utils.py](https://github.com/biopython/biopython/blob/master/Bio/Phylo/_utils.py).

They assign cartesian coordinates to the tree nodes.

In [7]:
def get_x_coordinates(tree):
    """Map every clade of ``tree`` to its x-coordinate.

    The x-coordinate of a clade is the value reported for it by
    ``tree.depths()``. When the largest of those values is falsy (zero),
    the mapping is recomputed with ``unit_branch_lengths=True``.

    Returns:
        dict: {clade: x-coord}
    """
    depths = tree.depths()
    if max(depths.values()):
        return depths
    # No usable branch lengths: fall back to unit branch lengths.
    return tree.depths(unit_branch_lengths=True)
   
def get_y_coordinates(tree, dist=1.3):
    """Map every clade of ``tree`` to its y-coordinate.

    Leaves are visited in reverse order of ``tree.get_terminals()``; the
    i-th visited leaf is placed at ``n_leaves - i*dist``, so ``dist`` is the
    vertical gap between two consecutive leaves. Each internal clade is
    placed halfway between its first and its last child.

    Returns:
        dict: {clade: y-coord}
    """
    n_leaves = tree.count_terminals()
    leaves_reversed = reversed(tree.get_terminals())
    ycoords = {leaf: n_leaves - rank * dist for rank, leaf in enumerate(leaves_reversed)}

    def place_internal(node):
        # Children must be placed before their parent can be centred on them.
        for child in node:
            if child not in ycoords:
                place_internal(child)
        first_child = node.clades[0]
        last_child = node.clades[-1]
        ycoords[node] = (ycoords[first_child] + ycoords[last_child]) / 2

    if tree.root.clades:
        place_internal(tree.root)
    return ycoords


In [8]:
# Cartesian coordinates of all tree nodes, each as a dict keyed by clade.
x_coords = get_x_coordinates(tree)
y_coords = get_y_coordinates(tree)

In [9]:
def get_clade_lines(orientation='horizontal', y_curr=0, x_start=0, x_curr=0, y_bot=0, y_top=0,
                    line_color='rgb(25,25,25)', line_width=0.5):
    """Build the Plotly shape (type 'line') for one branch segment.

    A 'horizontal' segment runs from (x_start, y_curr) to (x_curr, y_curr);
    a 'vertical' segment runs from (x_curr, y_bot) to (x_curr, y_top).

    Returns:
        dict: the shape definition, drawn on the 'below' layer.

    Raises:
        ValueError: if ``orientation`` is neither 'horizontal' nor 'vertical'.
    """
    if orientation == 'horizontal':
        endpoints = {'x0': x_start, 'y0': y_curr, 'x1': x_curr, 'y1': y_curr}
    elif orientation == 'vertical':
        endpoints = {'x0': x_curr, 'y0': y_bot, 'x1': x_curr, 'y1': y_top}
    else:
        raise ValueError("Line type can be 'horizontal' or 'vertical'")

    return {'type': 'line',
            'layer': 'below',
            'line': {'color': line_color, 'width': line_width},
            **endpoints}
        
    

def draw_clade(clade, x_start,  line_shapes,  line_color='rgb(15,15,15)', line_width=1):
    """Recursively append the branch lines of ``clade`` and its descendants to ``line_shapes``.

    For ``clade`` a horizontal line is added from ``x_start`` to the clade's own
    x-coordinate. If the clade has children, a vertical line joining its first
    and last child is added and each child is drawn in turn, starting from this
    clade's x-coordinate.

    Coordinates are read from the notebook-level dicts ``x_coords`` and
    ``y_coords``; shapes are built by ``get_clade_lines``.

    Args:
        clade: the clade to start from.
        x_start: x-coordinate at which the clade's horizontal branch begins.
        line_shapes: list that receives the Plotly shape dicts (modified in place).
        line_color: color of every branch line drawn by this call.
        line_width: width of every branch line drawn by this call.
    """
    x_curr = x_coords[clade]
    y_curr = y_coords[clade]

    # Horizontal branch leading to this clade.
    line_shapes.append(get_clade_lines(orientation='horizontal', y_curr=y_curr,
                                       x_start=x_start, x_curr=x_curr,
                                       line_color=line_color, line_width=line_width))

    if clade.clades:
        # Vertical line connecting all children.
        y_top = y_coords[clade.clades[0]]
        y_bot = y_coords[clade.clades[-1]]
        line_shapes.append(get_clade_lines(orientation='vertical', x_curr=x_curr,
                                           y_bot=y_bot, y_top=y_top,
                                           line_color=line_color, line_width=line_width))

        # Descendants inherit the caller's line style instead of silently
        # reverting to the default color/width.
        for child in clade:
            draw_clade(child, x_curr, line_shapes,
                       line_color=line_color, line_width=line_width)
    
    

In [10]:
# Collect the Plotly line shapes of all branches, starting the recursion at the root with x_start=0.
line_shapes = [] 
draw_clade(tree.root, 0, line_shapes, line_color='rgb(25,25,25)', line_width=1)

Get the node coordinates:

In [11]:
my_tree_clades = x_coords.keys()

# Parallel lists: position k of each list describes the same clade.
X = [x_coords[cl] for cl in my_tree_clades]  # node x-coordinates
Y = [y_coords[cl] for cl in my_tree_clades]  # node y-coordinates
text = [cl.name for cl in my_tree_clades]    # text displayed on hover over nodes

Read the metadata file to record more info about nodes:

In [12]:
# Load the per-strain metadata table and display its column names.
df = pd.read_csv('Data/nextstrain_zika_metadata.csv')
df.columns

Index(['Strain', 'Accession', 'Date', 'Region', 'Country', 'Division',
       'Authors', 'Journal', 'Title', 'Url', 'Num Date', 'Db', 'Raw Date'],
      dtype='object')

In [13]:
# Number of rows in the metadata table.
len(df)

377

In [14]:
# Distinct values of the 'Region' column.
set(df['Region'])

{'China',
 'Japan Korea',
 'North America',
 'Oceania',
 'South America',
 'Southeast Asia'}

Set the intermediate node color:

In [15]:
# Color initially given to every node; nodes matched to a metadata row are recolored later.
intermediate_node_color = 'rgb(100,100,100)'

Set the node colors, grouped by region (continent) and keyed by country:

In [16]:
# North America: country -> marker color
NA_color = {'Cuba': 'rgb(252, 196, 174)',
            'Dominican Republic': 'rgb(201, 32, 32)',
            'El Salvador': 'rgb(253, 202, 181)',
            'Guadeloupe': 'rgb(253, 202, 181)',
            'Guatemala': 'rgb(252, 190, 167)',
            'Haiti': 'rgb(252, 145, 114)',
            'Honduras': 'rgb(239, 66, 49)',
            'Jamaica': 'rgb(252, 185, 161)',
            'Martinique': 'rgb(252, 190, 167)',
            'Mexico': 'rgb(247, 109, 82)',
            'Nicaragua': 'rgb(249, 121, 92)',
            'Panama': 'rgb(252, 185, 161)',
            'Puerto Rico': 'rgb(252, 174, 148)',
            'Saint Barthelemy': 'rgb(253, 202, 181)',
            'USA': 'rgb(188, 20, 26)',
            'USVI': 'rgb(206, 36, 34)'}



# South America: country -> marker color
SAmer_color = {'Brazil': 'rgb(21, 127, 59)',
               'Colombia': 'rgb(153, 213, 149)',
               'Ecuador': 'rgb(208, 237, 202)',
               'French Guiana': 'rgb(211, 238, 205)',
               'Peru': 'rgb(208, 237, 202)',
               'Suriname': 'rgb(206, 236, 200)',
               'Venezuela': 'rgb(202, 234, 196)'}

# Southeast Asia: country -> marker color (used for metadata rows whose Region is 'Southeast Asia')
SAsia_color = {'Singapore': '#0000EE', 'Vietnam': '#1E90FF'}
# NOTE(review): pl_SAsia is not referenced elsewhere in the visible notebook; the
# [position, color] pairs look like a two-step Plotly colorscale -- confirm or remove.
pl_SAsia = [[0.0, '#1E90FF'], 
            [0.5, '#1E90FF'], 
            [0.5, '#0000EE'], 
            [1.0,'#0000EE' ]]


# Oceania: country -> marker color
Oceania_color = {'American Samoa': 'rgb(209,95,238)',
                 'Fiji': 'rgb(238,130, 238)',
                 'French Polynesia': 'rgb(148,0,211)',
                 'Tonga': 'rgb(238,130, 238)'}


# China: country -> marker color.
# The color string previously lacked its closing parenthesis ('rgb(255,185,15'),
# which is not a valid CSS/Plotly color.
# NOTE(review): the color-assignment loop hard-codes the colors for the 'China'
# and 'Japan Korea' regions and does not read these two dicts.
China_color = {'China': 'rgb(255,185,15)'}

# Japan/Korea: country -> marker color.
JapanKorea_color = {'Japan': '#fcdd04'}

Assign color to nodes according to region and country:

In [17]:
# Per-strain metadata, in the row order of `df`.
country = []
region = []
# Every node starts with the intermediate-node color; nodes matched to a
# metadata row are recolored below.
color = [intermediate_node_color] * len(X)

# Rebuild the hover text from the bare clade names so that this cell can be
# re-run: annotating `text` in place would make a second run fail, because the
# strain names could no longer be found in the already-annotated strings.
text = [cl.name for cl in my_tree_clades]

# Node name -> first position in `text` (same result as text.index(name),
# without a linear scan for every strain).
node_index = {}
for idx, name in enumerate(text):
    node_index.setdefault(name, idx)

# Regions whose nodes are colored per country.
region_palette = {'North America': NA_color,
                  'South America': SAmer_color,
                  'Southeast Asia': SAsia_color,
                  'Oceania': Oceania_color}
# Regions whose nodes all share a single color.
region_flat_color = {'China': '#fecc00',
                     'Japan Korea': '#dc7928'}

for strain, ctry, reg, date, journal, authors in zip(df['Strain'], df['Country'], df['Region'],
                                                     df['Date'], df['Journal'], df['Authors']):
    i = node_index.get(strain)
    if i is None:
        raise ValueError(f"Strain {strain!r} from the metadata is not a node of the tree")

    # Adjacent f-strings instead of backslash continuations inside one literal:
    # the continuations embedded the source indentation in the hover text.
    text[i] = (f"{strain}<br>Country: {ctry}"
               f"<br>Region: {reg}"
               f"<br>Collection date: {date}"
               f"<br>Journal: {journal}"
               f"<br>Authors: {authors}")
    country.append(ctry)
    region.append(reg)

    if reg in region_palette:
        # A country missing from its region's palette raises KeyError, as before.
        color[i] = region_palette[reg][ctry]
    elif reg in region_flat_color:
        color[i] = region_flat_color[reg]
    # Any other region keeps the intermediate-node color.

Define the Plotly trace for the tree nodes:

In [18]:
# Scatter trace with one marker per tree node; hovering shows only the node's text.
nodes = {
    'type': 'scatter',
    'x': X,
    'y': Y,
    'mode': 'markers',
    'marker': {'color': color, 'size': 5},
    'opacity': 1.0,
    'text': text,
    'hoverinfo': 'text',
}

The branches are already defined and stored as Plotly shapes that are included in the plot layout below:

In [20]:
# Figure layout: fixed-size canvas, visible x axis (branch length), hidden y axis,
# and the tree branches supplied as layout shapes.
layout = {
    'title': 'Phylogeny of Zika Virus<br>377 genomes colored according to region and country',
    'font': {'family': 'Balto', 'size': 14},
    'width': 1000,
    'height': 3000,
    'autosize': False,
    'showlegend': False,
    'xaxis': {
        'showline': True,
        'zeroline': False,
        'showgrid': False,
        'ticklen': 4,
        'showticklabels': True,
        'title': 'branch length',
    },
    'yaxis': {'visible': False},
    'hovermode': 'closest',
    'plot_bgcolor': 'rgb(250,250,250)',
    'margin': {'l': 10},
    'shapes': line_shapes,  # lines for tree branches
}

In [21]:
# Assemble the figure from the node trace and the layout (which carries the branch shapes).
fig = dict(data=[nodes], layout=layout)
iplot(fig)  # render the figure inline with Plotly's offline mode

In [22]:
from IPython.core.display import HTML
def css_styling():
    """Read ./custom.css and return its contents wrapped in an ``HTML`` display object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the contents have been read.

    Raises:
        OSError: if ./custom.css cannot be opened (e.g. it does not exist).
    """
    with open("./custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()