{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Y-Dotplot Geometry" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparation" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "import numpy as np\n",
 "import pandas as pd\n",
 "\n",
 "from lets_plot import *\n",
 "from lets_plot.mapping import as_discrete\n",
 "LetsPlot.setup_html()"
 ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
 "def plot_matrix(plots=None, width=400, height=300, columns=2):\n",
 "    \"\"\"Place the plots on a grid, filling it row by row, and show the result.\n",
 "\n",
 "    plots   -- sequence of plots to show; None (the default) means no plots\n",
 "    width   -- width of one grid cell, passed to GGBunch.add_plot\n",
 "    height  -- height of one grid cell, passed to GGBunch.add_plot\n",
 "    columns -- number of plots per row\n",
 "\n",
 "    Returns whatever GGBunch.show() returns.\n",
 "    \"\"\"\n",
 "    bunch = GGBunch()\n",
 "    for index, plot in enumerate(plots if plots is not None else []):\n",
 "        row, column = divmod(index, columns)\n",
 "        bunch.add_plot(plot, column * width, row * height, width, height)\n",
 "    return bunch.show()"
 ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
 "DEF_BIN_COUNT = 20\n",
 "DEF_BINWIDTH_RATIO = 1 / DEF_BIN_COUNT\n",
 "\n",
 "\n",
 "def get_binwidth(df, column, binwidth_ratio=DEF_BINWIDTH_RATIO):\n",
 "    \"\"\"Return `binwidth_ratio` times the value range (max - min) of `df[column]`.\"\"\"\n",
 "    value_range = df[column].max() - df[column].min()\n",
 "    return binwidth_ratio * value_range\n",
 "\n",
 "\n",
 "def get_bincount(df, column, binwidth):\n",
 "    \"\"\"Return how many bins of width `binwidth` span the value range of `df[column]`.\n",
 "\n",
 "    The result is never less than 1, so it is always usable as a bin count,\n",
 "    even when the value range is much narrower than `binwidth`.\n",
 "    Raises ValueError if `binwidth` is not positive.\n",
 "    \"\"\"\n",
 "    if binwidth <= 0:\n",
 "        raise ValueError(\"binwidth must be positive\")\n",
 "    value_range = df[column].max() - df[column].min()\n",
 "    return max(1, int(round(value_range / binwidth)))"
 ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
manufacturermodeldisplyearcyltransdrvctyhwyflclass
0dodgeram 1500 pickup 4wd4.720088manual(m6)4912epickup
1toyotatoyota tacoma 4wd4.020086auto(l5)41620rpickup
2toyotacamry2.219994auto(l4)f2127rmidsize
3audia4 quattro2.020084manual(m6)42028pcompact
4jeepgrand cherokee 4wd4.720088auto(l5)41419rsuv
\n", "
" ], "text/plain": [ " manufacturer model displ year cyl trans drv cty \\\n", "0 dodge ram 1500 pickup 4wd 4.7 2008 8 manual(m6) 4 9 \n", "1 toyota toyota tacoma 4wd 4.0 2008 6 auto(l5) 4 16 \n", "2 toyota camry 2.2 1999 4 auto(l4) f 21 \n", "3 audi a4 quattro 2.0 2008 4 manual(m6) 4 20 \n", "4 jeep grand cherokee 4wd 4.7 2008 8 auto(l5) 4 14 \n", "\n", " hwy fl class \n", "0 12 e pickup \n", "1 20 r pickup \n", "2 27 r midsize \n", "3 28 p compact \n", "4 19 r suv " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Load the mpg dataset, drop the unnamed index column and keep a\n",
 "# fixed-seed sample of 100 rows.\n",
 "df = (\n",
 "    pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv\")\n",
 "    .drop(columns=[\"Unnamed: 0\"])\n",
 "    .sample(n=100, random_state=42, ignore_index=True)\n",
 ")\n",
 "df.head()"
 ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.6" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Bin width shared by most of the plots below.\n",
 "binwidth = get_binwidth(df, column=\"hwy\")\n",
 "binwidth"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Minimalistic example" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "(\n",
 "    ggplot(df, aes(y=\"hwy\"))\n",
 "    + geom_ydotplot()\n",
 "    + ggtitle(\"Simplest example\")\n",
 ")"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Comparison of geoms" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
 "# Shared by the violin fill and the dot color in the comparison plots below.\n",
 "PACIFIC_BLUE = '#118ed8'"
 ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Violin layer first, dot layer drawn on top of it.\n",
 "(\n",
 "    ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "    + geom_violin(fill=PACIFIC_BLUE, size=0)\n",
 "    + geom_ydotplot(\n",
 "        binwidth=binwidth,\n",
 "        stackratio=.5,\n",
 "        color=PACIFIC_BLUE,\n",
 "        fill='white',\n",
 "    )\n",
 "    + ggtitle(\"violin + ydotplot (method='dotdensity')\")\n",
 ")"
 ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Same layers as the previous plot, with the binning method set explicitly.\n",
 "(\n",
 "    ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "    + geom_violin(fill=PACIFIC_BLUE, size=0)\n",
 "    + geom_ydotplot(\n",
 "        method='histodot',\n",
 "        binwidth=binwidth,\n",
 "        stackratio=.5,\n",
 "        color=PACIFIC_BLUE,\n",
 "        fill='white',\n",
 "    )\n",
 "    + ggtitle(\"violin + ydotplot (method='histodot')\")\n",
 ")"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Parameters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `stackdir`" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# One plot per stackdir value, shown side by side.\n",
 "p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "p1 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, stackdir='left')\n",
 "    + ggtitle(\"stackdir='left'\")\n",
 ")\n",
 "p2 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, stackdir='right')\n",
 "    + ggtitle(\"stackdir='right'\")\n",
 ")\n",
 "p3 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, stackdir='center')\n",
 "    + ggtitle(\"stackdir='center' (default)\")\n",
 ")\n",
 "p4 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, stackdir='centerwhole')\n",
 "    + ggtitle(\"stackdir='centerwhole'\")\n",
 ")\n",
 "\n",
 "plot_matrix([p1, p2, p3, p4])"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `stackratio`" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# One plot per stackratio value.\n",
 "p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "p1 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, stackratio=1.0)\n",
 "    + ggtitle(\"stackratio=1.0 (default)\")\n",
 ")\n",
 "p2 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, stackratio=0.5)\n",
 "    + ggtitle(\"stackratio=0.5\")\n",
 ")\n",
 "p3 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, stackratio=1.5)\n",
 "    + ggtitle(\"stackratio=1.5\")\n",
 ")\n",
 "\n",
 "plot_matrix([p1, p2, p3])"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `dotsize`" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# The first plot leaves dotsize unset; the other two set it explicitly.\n",
 "p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "p1 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth)\n",
 "    + ggtitle(\"dotsize=1.0 (default)\")\n",
 ")\n",
 "p2 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, dotsize=0.5)\n",
 "    + ggtitle(\"dotsize=0.5\")\n",
 ")\n",
 "p3 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, dotsize=1.5)\n",
 "    + ggtitle(\"dotsize=1.5\")\n",
 ")\n",
 "\n",
 "plot_matrix([p1, p2, p3])"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `center`" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Same histodot layer without and with an explicit center.\n",
 "p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "p1 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, method='histodot')\n",
 "    + ggtitle(\"Default\")\n",
 ")\n",
 "p2 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, method='histodot', center=11.0)\n",
 "    + ggtitle(\"center=11.0\")\n",
 ")\n",
 "\n",
 "plot_matrix([p1, p2])"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `boundary`" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Same histodot layer without and with an explicit boundary.\n",
 "p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "p1 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, method='histodot')\n",
 "    + ggtitle(\"Default\")\n",
 ")\n",
 "p2 = (\n",
 "    p\n",
 "    + geom_ydotplot(binwidth=binwidth, method='histodot', boundary=11.0)\n",
 "    + ggtitle(\"boundary=11.0\")\n",
 ")\n",
 "\n",
 "plot_matrix([p1, p2])"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `bins`" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# No binwidth here: the second plot sets the number of bins instead.\n",
 "p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "p1 = (\n",
 "    p\n",
 "    + geom_ydotplot(method='histodot')\n",
 "    + ggtitle(\"Default\")\n",
 ")\n",
 "p2 = (\n",
 "    p\n",
 "    + geom_ydotplot(method='histodot', bins=25)\n",
 "    + ggtitle(\"bins=25\")\n",
 ")\n",
 "\n",
 "plot_matrix([p1, p2])"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grouping" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Dots filled by year; the second plot additionally sets stackgroups=True.\n",
 "p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "p1 = (\n",
 "    p\n",
 "    + geom_ydotplot(\n",
 "        aes(fill=as_discrete(\"year\")),\n",
 "        binwidth=binwidth,\n",
 "        stackratio=.75,\n",
 "        color='black',\n",
 "    )\n",
 "    + ggtitle(\"method='dotdensity', stackgroups=False (default)\")\n",
 ")\n",
 "p2 = (\n",
 "    p\n",
 "    + geom_ydotplot(\n",
 "        aes(fill=as_discrete(\"year\")),\n",
 "        binwidth=binwidth,\n",
 "        stackratio=.75,\n",
 "        stackgroups=True,\n",
 "        color='black',\n",
 "    )\n",
 "    + ggtitle(\"method='dotdensity', stackgroups=True\")\n",
 ")\n",
 "\n",
 "plot_matrix([p1, p2], width=800, height=400, columns=1)"
 ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Same comparison as the previous cell, with method='histodot'.\n",
 "p = ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "p1 = (\n",
 "    p\n",
 "    + geom_ydotplot(\n",
 "        aes(fill=as_discrete(\"year\")),\n",
 "        binwidth=binwidth,\n",
 "        stackratio=.75,\n",
 "        method='histodot',\n",
 "        color='black',\n",
 "    )\n",
 "    + ggtitle(\"method='histodot', stackgroups=False (default)\")\n",
 ")\n",
 "p2 = (\n",
 "    p\n",
 "    + geom_ydotplot(\n",
 "        aes(fill=as_discrete(\"year\")),\n",
 "        binwidth=binwidth,\n",
 "        stackratio=.75,\n",
 "        method='histodot',\n",
 "        stackgroups=True,\n",
 "        color='black',\n",
 "    )\n",
 "    + ggtitle(\"method='histodot', stackgroups=True\")\n",
 ")\n",
 "\n",
 "plot_matrix([p1, p2], width=800, height=400, columns=1)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tooltips" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Custom tooltip: one line each for x, y, ..count.. and ..binwidth..\n",
 "(\n",
 "    ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "    + geom_ydotplot(\n",
 "        binwidth=binwidth,\n",
 "        stackratio=.75,\n",
 "        tooltips=(\n",
 "            layer_tooltips()\n",
 "            .line(\"^x\")\n",
 "            .line(\"Stack center|^y\")\n",
 "            .line(\"Number of dots in stack|@..count..\")\n",
 "            .line(\"Width of the bin|@..binwidth..\")\n",
 "        ),\n",
 "    )\n",
 "    + ggtitle(\"With tooltips\")\n",
 ")"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Facetting" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Facet the dot plot by year.\n",
 "(\n",
 "    ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "    + geom_ydotplot(\n",
 "        aes(fill=\"drv\"),\n",
 "        binwidth=binwidth,\n",
 "        color='black',\n",
 "    )\n",
 "    + facet_grid(x=\"year\")\n",
 "    + ggtitle(\"facet_grid()\")\n",
 ")"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Flip coordinates" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Same kind of plot with coord_flip() applied.\n",
 "(\n",
 "    ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "    + geom_ydotplot(\n",
 "        aes(fill=\"drv\"),\n",
 "        binwidth=binwidth,\n",
 "        stackratio=.5,\n",
 "        dotsize=.5,\n",
 "        color='black',\n",
 "    )\n",
 "    + coord_flip()\n",
 "    + ggtitle(\"Flip coordinates\")\n",
 ")"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## \"identity\" statistic" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
counthwydrvbinwidth
0312.041.6
1513.641.6
2215.241.6
31316.841.6
4518.441.6
\n", "
" ], "text/plain": [ " count hwy drv binwidth\n", "0 3 12.0 4 1.6\n", "1 5 13.6 4 1.6\n", "2 2 15.2 4 1.6\n", "3 13 16.8 4 1.6\n", "4 5 18.4 4 1.6" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "def binned_stacks(group, drv, binwidth):\n",
 "    \"\"\"Bin the `hwy` values of one `drv` group with np.histogram.\n",
 "\n",
 "    Returns a DataFrame with one row per bin and the columns\n",
 "    \"count\" (values in the bin), \"hwy\" (lower edge of the bin),\n",
 "    \"drv\" and \"binwidth\" (the two arguments, repeated on every row).\n",
 "    \"\"\"\n",
 "    counts, edges = np.histogram(group.hwy, bins=get_bincount(group, \"hwy\", binwidth))\n",
 "    # np.histogram returns one more edge than counts; the last edge is dropped.\n",
 "    # NOTE(review): \"hwy\" is the lower bin edge, not the bin midpoint, and\n",
 "    # \"binwidth\" is the nominal width, which may differ from the actual\n",
 "    # histogram bin width after rounding the bin count -- confirm both are intended.\n",
 "    return pd.DataFrame({\n",
 "        \"count\": counts,\n",
 "        \"hwy\": edges[:-1],\n",
 "        \"drv\": drv,\n",
 "        \"binwidth\": binwidth,\n",
 "    })\n",
 "\n",
 "\n",
 "# One block of rows per drv value, in order of first appearance in df.\n",
 "identity_df = pd.concat([\n",
 "    binned_stacks(df[df.drv == drv], drv, binwidth)\n",
 "    for drv in df.drv.unique()\n",
 "])\n",
 "identity_df.head()"
 ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Plot the precomputed stacks: stat='identity' with stacksize and binwidth\n",
 "# mapped to columns of identity_df.\n",
 "(\n",
 "    ggplot(\n",
 "        identity_df,\n",
 "        aes(x=\"drv\", y=\"hwy\", stacksize=\"count\", binwidth=\"binwidth\"),\n",
 "    )\n",
 "    + geom_ydotplot(\n",
 "        aes(fill=\"drv\"),\n",
 "        stat='identity',\n",
 "        stackratio=.75,\n",
 "        color=\"black\",\n",
 "    )\n",
 "    + ggtitle(\"stat='identity'\")\n",
 ")"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Additional layers" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Dot layer combined with a fill scale and a theme.\n",
 "(\n",
 "    ggplot(df, aes(x=\"drv\", y=\"hwy\"))\n",
 "    + geom_ydotplot(\n",
 "        aes(fill=as_discrete(\"drv\")),\n",
 "        method='histodot',\n",
 "        bins=15,\n",
 "        stackdir='center',\n",
 "        stackratio=.75,\n",
 "        color='black',\n",
 "        alpha=.5,\n",
 "        size=.2,\n",
 "    )\n",
 "    + scale_fill_brewer(type='qual', palette='Set1')\n",
 "    + theme_grey()\n",
 "    + ggtitle(\"Some additional aesthetics, parameters and layers\")\n",
 ")"
 ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 }