{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Y-Dotplot Geometry"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparation"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from lets_plot import *\n",
"from lets_plot.mapping import as_discrete\n",
"LetsPlot.setup_html()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def plot_matrix(plots=[], width=400, height=300, columns=2):\n",
" bunch = GGBunch()\n",
" for i in range(len(plots)):\n",
" row = int(i / columns)\n",
" column = i % columns\n",
" bunch.add_plot(plots[i], column * width, row * height, width, height)\n",
" return bunch.show()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"DEF_BIN_COUNT = 20\n",
"DEF_BINWIDTH_RATIO = 1/DEF_BIN_COUNT\n",
"\n",
"def get_binwidth(df, column, binwidth_ratio=DEF_BINWIDTH_RATIO):\n",
" return binwidth_ratio * (df[column].max() - df[column].min())\n",
"\n",
"def get_bincount(df, column, binwidth):\n",
" return int(round((df[column].max() - df[column].min()) / binwidth))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" manufacturer | \n",
" model | \n",
" displ | \n",
" year | \n",
" cyl | \n",
" trans | \n",
" drv | \n",
" cty | \n",
" hwy | \n",
" fl | \n",
" class | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" dodge | \n",
" ram 1500 pickup 4wd | \n",
" 4.7 | \n",
" 2008 | \n",
" 8 | \n",
" manual(m6) | \n",
" 4 | \n",
" 9 | \n",
" 12 | \n",
" e | \n",
" pickup | \n",
"
\n",
" \n",
" | 1 | \n",
" toyota | \n",
" toyota tacoma 4wd | \n",
" 4.0 | \n",
" 2008 | \n",
" 6 | \n",
" auto(l5) | \n",
" 4 | \n",
" 16 | \n",
" 20 | \n",
" r | \n",
" pickup | \n",
"
\n",
" \n",
" | 2 | \n",
" toyota | \n",
" camry | \n",
" 2.2 | \n",
" 1999 | \n",
" 4 | \n",
" auto(l4) | \n",
" f | \n",
" 21 | \n",
" 27 | \n",
" r | \n",
" midsize | \n",
"
\n",
" \n",
" | 3 | \n",
" audi | \n",
" a4 quattro | \n",
" 2.0 | \n",
" 2008 | \n",
" 4 | \n",
" manual(m6) | \n",
" 4 | \n",
" 20 | \n",
" 28 | \n",
" p | \n",
" compact | \n",
"
\n",
" \n",
" | 4 | \n",
" jeep | \n",
" grand cherokee 4wd | \n",
" 4.7 | \n",
" 2008 | \n",
" 8 | \n",
" auto(l5) | \n",
" 4 | \n",
" 14 | \n",
" 19 | \n",
" r | \n",
" suv | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" manufacturer model displ year cyl trans drv cty \\\n",
"0 dodge ram 1500 pickup 4wd 4.7 2008 8 manual(m6) 4 9 \n",
"1 toyota toyota tacoma 4wd 4.0 2008 6 auto(l5) 4 16 \n",
"2 toyota camry 2.2 1999 4 auto(l4) f 21 \n",
"3 audi a4 quattro 2.0 2008 4 manual(m6) 4 20 \n",
"4 jeep grand cherokee 4wd 4.7 2008 8 auto(l5) 4 14 \n",
"\n",
" hwy fl class \n",
"0 12 e pickup \n",
"1 20 r pickup \n",
"2 27 r midsize \n",
"3 28 p compact \n",
"4 19 r suv "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv\")\n",
"df = df.drop(columns=[\"Unnamed: 0\"])\n",
"df = df.sample(n=100, random_state=42, ignore_index=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.6"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"binwidth = get_binwidth(df, \"hwy\")\n",
"binwidth"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Minimalistic example"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(df, aes(y=\"hwy\")) + geom_ydotplot() + ggtitle(\"Simplest example\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Comparison of geoms"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"PACIFIC_BLUE = '#118ed8'"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(df, aes(x=\"drv\", y=\"hwy\")) + \\\n",
" geom_violin(fill=PACIFIC_BLUE, size=0) + \\\n",
" geom_ydotplot(binwidth=binwidth, stackratio=.5, \\\n",
" color=PACIFIC_BLUE, fill='white') + \\\n",
" ggtitle(\"violin + ydotplot (method='dotdensity')\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(df, aes(x=\"drv\", y=\"hwy\")) + \\\n",
" geom_violin(fill=PACIFIC_BLUE, size=0) + \\\n",
" geom_ydotplot(method='histodot', binwidth=binwidth, stackratio=.5, \\\n",
" color=PACIFIC_BLUE, fill='white') + \\\n",
" ggtitle(\"violin + ydotplot (method='histodot')\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parameters"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `stackdir`"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
"p1 = p + geom_ydotplot(binwidth=binwidth, stackdir='left') + \\\n",
" ggtitle(\"stackdir='left'\")\n",
"p2 = p + geom_ydotplot(binwidth=binwidth, stackdir='right') + \\\n",
" ggtitle(\"stackdir='right'\")\n",
"p3 = p + geom_ydotplot(binwidth=binwidth, stackdir='center') + \\\n",
" ggtitle(\"stackdir='center' (default)\")\n",
"p4 = p + geom_ydotplot(binwidth=binwidth, stackdir='centerwhole') + \\\n",
" ggtitle(\"stackdir='centerwhole'\")\n",
"\n",
"plot_matrix([p1, p2, p3, p4])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `stackratio`"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
"p1 = p + geom_ydotplot(binwidth=binwidth, stackratio=1.0) + ggtitle(\"stackratio=1.0 (default)\")\n",
"p2 = p + geom_ydotplot(binwidth=binwidth, stackratio=0.5) + ggtitle(\"stackratio=0.5\")\n",
"p3 = p + geom_ydotplot(binwidth=binwidth, stackratio=1.5) + ggtitle(\"stackratio=1.5\")\n",
"\n",
"plot_matrix([p1, p2, p3])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `dotsize`"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
"p1 = p + geom_ydotplot(binwidth=binwidth) + ggtitle(\"dotsize=1.0 (default)\")\n",
"p2 = p + geom_ydotplot(binwidth=binwidth, dotsize=0.5) + ggtitle(\"dotsize=0.5\")\n",
"p3 = p + geom_ydotplot(binwidth=binwidth, dotsize=1.5) + ggtitle(\"dotsize=1.5\")\n",
"\n",
"plot_matrix([p1, p2, p3])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `center`"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
"p1 = p + geom_ydotplot(binwidth=binwidth, method='histodot') + ggtitle(\"Default\")\n",
"p2 = p + geom_ydotplot(binwidth=binwidth, method='histodot', center=11.0) + ggtitle(\"center=11.0\")\n",
"\n",
"plot_matrix([p1, p2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `boundary`"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
"p1 = p + geom_ydotplot(binwidth=binwidth, method='histodot') + ggtitle(\"Default\")\n",
"p2 = p + geom_ydotplot(binwidth=binwidth, method='histodot', boundary=11.0) + ggtitle(\"boundary=11.0\")\n",
"\n",
"plot_matrix([p1, p2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `bins`"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
"p1 = p + geom_ydotplot(method='histodot') + ggtitle(\"Default\")\n",
"p2 = p + geom_ydotplot(method='histodot', bins=25) + ggtitle(\"bins=25\")\n",
"\n",
"plot_matrix([p1, p2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Grouping"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
"p1 = p + geom_ydotplot(aes(fill=as_discrete(\"year\")), \\\n",
" binwidth=binwidth, stackratio=.75, color='black') + \\\n",
" ggtitle(\"method='dotdensity', stackgroups=False (default)\")\n",
"p2 = p + geom_ydotplot(aes(fill=as_discrete(\"year\")), \\\n",
" binwidth=binwidth, stackratio=.75, \\\n",
" stackgroups=True, color='black') + \\\n",
" ggtitle(\"method='dotdensity', stackgroups=True\")\n",
"\n",
"plot_matrix([p1, p2], width=800, height=400, columns=1)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
"p1 = p + geom_ydotplot(aes(fill=as_discrete(\"year\")), \\\n",
" binwidth=binwidth, stackratio=.75, \\\n",
" method='histodot', color='black') + \\\n",
" ggtitle(\"method='histodot', stackgroups=False (default)\")\n",
"p2 = p + geom_ydotplot(aes(fill=as_discrete(\"year\")), \\\n",
" binwidth=binwidth, stackratio=.75, \\\n",
" method='histodot', stackgroups=True, color='black') + \\\n",
" ggtitle(\"method='histodot', stackgroups=True\")\n",
"\n",
"plot_matrix([p1, p2], width=800, height=400, columns=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tooltips"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(df, aes(x=\"drv\", y=\"hwy\")) + \\\n",
" geom_ydotplot(binwidth=binwidth, stackratio=.75, \\\n",
" tooltips=layer_tooltips().line(\"^x\")\\\n",
" .line(\"Stack center|^y\")\\\n",
" .line(\"Number of dots in stack|@..count..\")\\\n",
" .line(\"Width of the bin|@..binwidth..\")) + \\\n",
" ggtitle(\"With tooltips\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Facetting"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(df, aes(x=\"drv\", y=\"hwy\")) + \\\n",
" geom_ydotplot(aes(fill=\"drv\"), \\\n",
" binwidth=binwidth, color='black') + \\\n",
" facet_grid(x=\"year\") + \\\n",
" ggtitle(\"facet_grid()\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Flip coordinates"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(df, aes(x=\"drv\", y=\"hwy\")) + \\\n",
" geom_ydotplot(aes(fill=\"drv\"), binwidth=binwidth, \\\n",
" stackratio=.5, dotsize=.5, color='black') + \\\n",
" coord_flip() + \\\n",
" ggtitle(\"Flip coordinates\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## \"identity\" statistic"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" hwy | \n",
" drv | \n",
" binwidth | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 3 | \n",
" 12.0 | \n",
" 4 | \n",
" 1.6 | \n",
"
\n",
" \n",
" | 1 | \n",
" 5 | \n",
" 13.6 | \n",
" 4 | \n",
" 1.6 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 15.2 | \n",
" 4 | \n",
" 1.6 | \n",
"
\n",
" \n",
" | 3 | \n",
" 13 | \n",
" 16.8 | \n",
" 4 | \n",
" 1.6 | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" 18.4 | \n",
" 4 | \n",
" 1.6 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count hwy drv binwidth\n",
"0 3 12.0 4 1.6\n",
"1 5 13.6 4 1.6\n",
"2 2 15.2 4 1.6\n",
"3 13 16.8 4 1.6\n",
"4 5 18.4 4 1.6"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"identity_df = pd.concat([\n",
" pd.DataFrame(\n",
" list(zip(*np.histogram(df[df.drv == drv].hwy, bins=get_bincount(df[df.drv == drv], \"hwy\", binwidth)))),\n",
" columns=[\"count\", \"hwy\"]\n",
" ).assign(\n",
" drv = [drv] * get_bincount(df[df.drv == drv], \"hwy\", binwidth),\n",
" binwidth = [binwidth] * get_bincount(df[df.drv == drv], \"hwy\", binwidth),\n",
" )\n",
" for drv in df.drv.unique()\n",
"])\n",
"identity_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(identity_df, aes(x=\"drv\", y=\"hwy\", stacksize=\"count\", binwidth=\"binwidth\")) + \\\n",
" geom_ydotplot(aes(fill=\"drv\"), stat='identity', stackratio=.75, color=\"black\") + \\\n",
" ggtitle(\"stat='identity'\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Additional layers"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(df, aes(x=\"drv\", y=\"hwy\")) + \\\n",
" geom_ydotplot(aes(fill=as_discrete(\"drv\")), method='histodot', \\\n",
" bins=15, stackdir='center', stackratio=.75, \\\n",
" color='black', alpha=.5, size=.2) + \\\n",
" scale_fill_brewer(type='qual', palette='Set1') + \\\n",
" theme_grey() + \\\n",
" ggtitle(\"Some additional aesthetics, parameters and layers\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}