## <center>Cloudy Mountain Plot<br>an informative RDI<i>(*)</i> categorical distribution plot<br>inspired by Violin, Bean and Pirate Plots</center>
#### <center>云山图 [yún shān tú] / 雲山図 [くもやまず/kumo yama zu]</center>
### by [Dr Giuseppe Insana](http://insana.net), August 2019 - coded in Julia and Python
#### <i>(*)</i>: RDI = Raw data + Descriptive statistics + Inferential statistics

### This notebook shows usage examples of Cloudy Mountain Plot in Python3.
### Please check the [cmplot.py github page](https://github.com/g-insana/cmplot.py) for installation instruction and options

In [1]:
import plotly.graph_objects as go
from cmplot import cmplot

# OPTIONAL requirements for this notebook
from plotly.subplots import make_subplots #for subplots, used in the examples
from pathlib import Path #for saving files to homedir

In [2]:
# To load some public test data:
from pydataset import data
iris = data('iris') # Iris dataset; Anderson, Edgar (1935) Fisher, R. A. (1936) ; http://vincentarelbundock.github.io/Rdatasets/doc/datasets/iris.html
train = data('Wages') # Individual wages, US, 1976 to 1982; Cornwell, C. and P. Rupert (1988) ; http://vincentarelbundock.github.io/Rdatasets/doc/Ecdat/Wages.html
train['expYears']=["<10" if x < 10 else "10-29" if (x >= 10 and x < 30) else ">30" for x in train['exp']] #binning on years of full-time work experience
train['eduYears']=["<9" if x < 9 else "9-13" if (x >= 9 and x < 14) else ">13" for x in train['ed']] #binning on years of education
train = train.sample(frac=1).reset_index(drop=True) #shuffle if you wish to bias when using pointsmaxdisplayed

train[0:3]

Unnamed: 0,exp,wks,bluecol,ind,south,smsa,married,sex,union,ed,black,lwage,expYears,eduYears
0,31,47,yes,1,no,no,yes,male,yes,9,no,6.99942,>30,9-13
1,33,49,yes,1,yes,no,yes,male,yes,7,no,5.99146,>30,<9
2,35,50,no,0,no,yes,yes,male,no,14,no,7.67322,>30,>13


In [3]:
## Alternatively, load your own data, e.g. via CSV:
#train = pd.read_csv('train.csv',delimiter=',', decimal='.') #dataset on loans

## Some data cleaning:
#train['LoanAmount']=train['LoanAmount'].fillna(train['LoanAmount'].mean()) #replace missing values in LoanAmount with their mean
#train['Married']=train['Married'].mask(pd.isnull,'No') #replace missing values in Married with "No"
#train['Gender']=train['Gender'].fillna('Male') #replace missing values in Gender with "Male"

## 1 min Quickstart: just pass as arguments a dataframe and the column label for the categorical data

In [4]:
go.Figure(*cmplot(iris,xcol="Species")) #using splat operator

## A little evolutionary history, via box plots and violins:

In [5]:
boxbygender=go.Box(y=train.sex, x=train.lwage, name="Gender",legendgroup="gender",orientation="h",marker_color="green")
boxbymarried=go.Box(y=train.married, x=train.lwage, name="Married",legendgroup="married",orientation="h",marker_color="blue")
violbygender=go.Violin(y=train.sex, x=train.lwage, name="Gender",legendgroup="gender",box_visible=True,
    orientation="h",marker_color="green")
violbymarried=go.Violin(y=train.married, x=train.lwage, name="Married",legendgroup="married",box_visible=True,
    orientation="h",marker_color="blue")

fig = make_subplots(rows=1, cols=2,subplot_titles=("BoxPlot","ViolinPlot"),shared_yaxes=True)
fig.add_trace(boxbygender,row=1,col=1)
fig.add_trace(boxbymarried,row=1,col=1)

fig.add_trace(violbygender,row=1,col=2)
fig.add_trace(violbymarried,row=1,col=2)
fig.update_layout(xaxis_title="Wage (log)",yaxis_title="Married+Gender",
        xaxis_showgrid=True, yaxis_showgrid=True,
        margin_t=20
        )

### And this would be same data shown using bean plots and pirate plots:

In [6]:
(bt1,layout)=cmplot(train, xcol="sex", ycol="lwage", orientation="h",inf="none",ycolorgroups=False,
    side="both",colorshift=2,colorrange=4,pointsopacity=1,showpoints=True,showboxplot=False,pointshapes=["line-ns"],
    markoutliers=False,pointsmaxdisplayed=200)
(bt2,layout)=cmplot(train, xcol="married", ycol="lwage", orientation="h",inf="none",ycolorgroups=False,
    side="both",colorshift=0,colorrange=4,pointsopacity=1,showpoints=True,showboxplot=False,pointshapes=["line-ns"],
    markoutliers=False,pointsmaxdisplayed=200)

(pt1,layout)=cmplot(train, xcol="sex", ycol="lwage", orientation="h",inf="hdi",ycolorgroups=False,
    side="both",colorshift=2,colorrange=4,pointsopacity=0.3,showpoints=True,showboxplot=False,pointshapes=["circle"],
    markoutliers=False,pointsmaxdisplayed=200)
(pt2,layout)=cmplot(train, xcol="married", ycol="lwage", orientation="h",inf="hdi",ycolorgroups=False,
    side="both",colorshift=0,colorrange=4,pointsopacity=0.3,showpoints=True,showboxplot=False,pointshapes=["circle"],
    markoutliers=False,pointsmaxdisplayed=200)

fig = make_subplots(rows=1, cols=2,subplot_titles=("BeanPlot","PiratePlot"),shared_yaxes=True)
for i in range(0,4):
    fig.add_trace(bt1[i],row=1,col=1)
for i in range(0,4):
    fig.add_trace(bt2[i],row=1,col=1)
for i in range(0,6):
    fig.add_trace(pt1[i],row=1,col=2)
    fig.add_trace(pt2[i],row=1,col=2)

fig.update_layout(xaxis_title="Wage (log)",yaxis_title="Married+Gender",
        yaxis_showgrid=True,
        margin_t=20
        )

### Until we arrive to the cloudy mountain plots:

In [7]:
# union of two separate Xcolumns (Gender + Married)
(traces1,layout)=cmplot(train, xcol="sex", ycol="lwage", xsuperimposed=False,orientation="h",
    colorshift=2,colorrange=4,ycolorgroups=False,side="pos",inf="hdi",conf_level=0.95,altsidesflip=False,
    pointsoverdens=True,showpoints=True,pointshapes=["triangle-down","triangle-up"],pointsopacity=0.2,
    pointsdistance=1,pointsmaxdisplayed=400)
(traces2,layout)=cmplot(train, xcol="married", ycol="lwage", xsuperimposed=False,
    orientation="h",colorshift=0,colorrange=4,ycolorgroups=False,side="pos",inf="hdi",conf_level=0.95,
    altsidesflip=False,pointsoverdens=True,showpoints=True,pointshapes=["triangle-right","triangle-left"],
    pointsopacity=0.3,pointsdistance=1,pointsmaxdisplayed=400,title="CloudyMountainPlot")
layout["legend_tracegroupgap"]=0
layout["xaxis_title"]="Wage (log)"
layout["yaxis_title"]="Married and Gender"
layout["margin_l"]=60
go.Figure(traces1+traces2,layout)

### ... which is particularly powerful when overimposed for same X:

In [8]:
# Superimposed rdi plots for union of two separate Xcolumns (Gender + Married)
(traces1,layout)=cmplot(train, xcol="sex", ycol="lwage", xlabel="M/F", xsuperimposed=True,orientation="h",
    colorshift=2,colorrange=4,ycolorgroups=False,side="alt",inf="hdi",conf_level=0.95,altsidesflip=False,
    pointsoverdens=True,showpoints=True,pointshapes=["triangle-down","triangle-up"],
    pointsdistance=0.6,pointsmaxdisplayed=400)
(traces2,layout)=cmplot(train, xcol="married", ycol="lwage", xlabel="married?", xsuperimposed=True,
    orientation="h",colorshift=0,colorrange=4,ycolorgroups=False,side="alt",inf="hdi",conf_level=0.95,
    altsidesflip=False,pointsoverdens=True,showpoints=True,pointshapes=["triangle-right","triangle-left"],
    pointsdistance=0.6,pointsmaxdisplayed=400,title="Married + Gender ~ Wage")
layout["legend_tracegroupgap"]=0
layout["yaxis_title"]="Married and Gender"
layout["xaxis_title"]="Wage (log)"
layout["margin_l"]=60
layout["yaxis_range"]=[-0.51,1.51]
go.Figure(traces1+traces2,layout)
#savefig(p1::Union{Plot,PlotlyJS.SyncPlot}, joinpath(homedir(),"married_plus_gender-loanamount_overimposed_rdiplot.pdf"))

NOTE: label female -> M/F
NOTE: label male -> M/F
NOTE: label no -> married?
NOTE: label yes -> married?


## Now some illustrative usage examples
### First: two Ycol side by side
#### (notice the raw point "clouds" on opposite side of the kernel density "mountains", for better clarity)

In [9]:
#IRIS dataset, plotting two Ycol side by side
traces,layout=cmplot(iris,xcol="Species",ycol=["Sepal.Length","Petal.Length"],colorrange=3,
      pointshapes=["star-triangle-up","star-diamond","star-square"])
go.Figure(traces,layout)

### It works well also with three Ycol:

In [10]:
#IRIS dataset, plotting three Ycol side by side
traces,layout=cmplot(iris,xcol="Species",ycol=["Sepal.Length","Petal.Length","Sepal.Width",],
    pointshapes=["star-triangle-up","star-diamond","star-square"])
go.Figure(traces,layout)

## Excursus: not only we can show different Xcol together, but we can intersect them, grouping the data by two or more X
### For example, combining two X a different picture is revealed:

In [11]:
#intersection of two different Xcolumns:
traces,layout=cmplot(train, xcol=["married","sex"], ycol="lwage",ycolorgroups=False,
    side="pos",pointshapes=["star-diamond"],pointsmaxdisplayed=500)
layout["legend_tracegroupgap"]=0
layout["title_text"]="Married & Gender ~ Wage"
layout["yaxis_title"]="Married & Gender"
layout["xaxis_title"]="Wage (log)"
go.Figure(traces,layout)


### Or with three:

In [12]:
#intersection of three different Xcolumns:
traces,layout=cmplot(train, xcol=["sex","married","smsa"], ycol="lwage", ycolorgroups=False, side="both",
    pointsmaxdisplayed=300)
layout["legend_tracegroupgap"]=0
layout["margin_l"]=180
layout["margin_r"]=0
layout["title_text"]="Gender & Married & LivesInCity ~ Wage"
layout["yaxis_title"]="Gender & Married & LivesInCity"
layout["xaxis_title"]="Wage (log)"
go.Figure(traces,layout)

## Super imposition of distributions: one of the best features of cloudy mountain plots

In [13]:
#Superimposed plots for a single Xcolumn
(traces1,layout)=cmplot(train, xcol="south", ycol="lwage", xsuperimposed=True, pointsoverdens=True,
    ycolorgroups=False,altsidesflip=False, colorshift=2, colorrange=4, pointshapes=["star","pentagon"],
    pointsmaxdisplayed=400)
layout["legend_tracegroupgap"]=0
layout["margin_b"]=50
layout["title_text"]="South ~ Wage"
layout["yaxis_title"]="Resides in the south?"
layout["xaxis_title"]="Wage (log)"
layout["yaxis_range"]=[-0.51,0.51]
go.Figure(traces1,layout)

In [14]:
#superimposed plot for combination of two X variables
traces,layout=cmplot(train,xcol=["sex","bluecol"],xsuperimposed=True,ycol="lwage",
    ycolorgroups=False, pointsoverdens=True,markoutliers=False,pointshapes=["hexagon"],pointsmaxdisplayed=500)
layout["legend_tracegroupgap"]=0
layout["title_text"]="Gender & BlueCollar ~ Wage"
layout["xaxis_title"]="Wage (log)"
layout["yaxis_title"]="Gender & BlueCollar"
go.Figure(traces,layout)

In [15]:
#superimposed plot for combination of two X variables, one of which with 3 bins
traces,layout=cmplot(train,xcol=["eduYears","union"],xsuperimposed=True,ycol="lwage",
    ycolorgroups=False, altsidesflip=True, pointsoverdens=True,markoutliers=False,
    pointshapes=["hexagon"],pointsmaxdisplayed=100)
layout["legend_tracegroupgap"]=0
layout["title_text"]="EducationYears & UnionContract ~ Wage"
layout["xaxis_title"]="Wage (log)"
layout["yaxis_title"]="EducationYears & UnionContract"
go.Figure(traces,layout)

## To save publication ready vector graphic files (e.g. svg or pdf):

In [16]:
#using orca to export images (see https://plot.ly/python/getting-started/#static-image-export-support)
#(to install: conda install -c plotly plotly-orca psutil requests)

###
#fig.write_image('figure.svg')
#fig.write_image('figure.pdf')

## To save a plot as standalone html:

In [17]:
jsonplot1=fig.to_json()
template = """<html>
<head>
    <script src='https://cdn.plot.ly/plotly-latest.min.js'></script>
</head>
<body>
    <div id='divPlotly'></div>
    <script>
        var plotly_data = {}
        Plotly.react('divPlotly', plotly_data.data, plotly_data.layout);
    </script>
</body>

</html>""".format(jsonplot1)
homedir = str(Path.home())+'/'
outputfilename="plot_name.html"

###
#with open(homedir+outputfilename, 'w') as f:
#    f.write(template)