{ "cells": [ { "cell_type": "markdown", "id": "e9dfd456-0a10-41bd-962d-36671423c1d9", "metadata": {}, "source": [ "## Statistical Annotations: `geom_bracket()` and `geom_bracket_dodge()`\n", "\n", "`geom_bracket()` is a general-purpose layer for annotating ranges. It is particularly effective for adding significance bars (p-values) to categorical plots.\n", "\n", "`geom_bracket_dodge()` is a specialized layer designed for annotating ranges between dodged positions (usually dodged groups within a category).\n", "\n", "> **Note:** `geom_bracket` does not compute statistics internally. It is a visualization tool that renders the results of your analysis. In this demo, we use `scipy.stats` to calculate p-values before passing them to the plot." ] }, { "cell_type": "code", "execution_count": 1, "id": "497fa5e0-e256-47b8-be94-dfb9674e26c1", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from scipy.stats import mannwhitneyu\n", "\n", "from lets_plot import *" ] }, { "cell_type": "code", "execution_count": 2, "id": "4ea27973-8d48-4094-87d7-68e2480273d9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 3, "id": "a4e05860-58b9-4a77-be8a-a0a81c70abad", "metadata": {}, "outputs": [], "source": [ "# In the examples below, we will use the Mann–Whitney U test to calculate p-values\n", "def get_p_value(df, cat_col, val_col, g1, g2):\n", " x = df.loc[df[cat_col] == g1, val_col]\n", " y = df.loc[df[cat_col] == g2, val_col]\n", " return mannwhitneyu(x, y, alternative=\"two-sided\").pvalue\n", "\n", "def get_p_values_data(df, *, cat_col, val_col, base_dy, step):\n", " from itertools import combinations\n", " categories = df[cat_col].unique().tolist()\n", " rows = []\n", " for i, (xmin, xmax) in enumerate(combinations(categories, 2)):\n", " y = base_dy + i * step\n", " p = get_p_value(df, cat_col, val_col, xmin, xmax)\n", " rows.append({\"y\": y, \"p\": p,\n", " \"xmin\": xmin,\n", " \"xmax\": xmax})\n", " return pd.DataFrame(rows)" ] }, { "cell_type": "code", "execution_count": 4, "id": "53f4e5e3-0bc5-47ac-931c-44e1757a7706", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(234, 12)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0manufacturermodeldisplyearcyltransdrvctyhwyflclass
01audia41.819994auto(l5)f1829pcompact
12audia41.819994manual(m5)f2129pcompact
23audia42.020084manual(m6)f2031pcompact
34audia42.020084auto(av)f2130pcompact
45audia42.819996auto(l5)f1626pcompact
\n", "
" ], "text/plain": [ " Unnamed: 0 manufacturer model displ year cyl trans drv cty hwy \\\n", "0 1 audi a4 1.8 1999 4 auto(l5) f 18 29 \n", "1 2 audi a4 1.8 1999 4 manual(m5) f 21 29 \n", "2 3 audi a4 2.0 2008 4 manual(m6) f 20 31 \n", "3 4 audi a4 2.0 2008 4 auto(av) f 21 30 \n", "4 5 audi a4 2.8 1999 6 auto(l5) f 16 26 \n", "\n", " fl class \n", "0 p compact \n", "1 p compact \n", "2 p compact \n", "3 p compact \n", "4 p compact " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/refs/heads/master/data/mpg.csv\")\n", "print(df.shape)\n", "df.head()" ] }, { "cell_type": "markdown", "id": "a847d81f-1879-48b3-91a6-fdc6d8607e4c", "metadata": {}, "source": [ "## Adding Significance Bars (p-values) to Categorical Plots" ] }, { "cell_type": "markdown", "id": "d45fc66e-966c-4d8c-8c4e-53fbd14a2448", "metadata": {}, "source": [ "#### Compute Significance Data" ] }, { "cell_type": "code", "execution_count": 5, "id": "9a1f4372-afff-4eab-82e8-63d0b30aa2cb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(3, 4)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ypxminxmax
0479.041090e-28f4
1515.955333e-11fr
2554.104577e-024r
\n", "
" ], "text/plain": [ " y p xmin xmax\n", "0 47 9.041090e-28 f 4\n", "1 51 5.955333e-11 f r\n", "2 55 4.104577e-02 4 r" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p_values_df = get_p_values_data(df, cat_col=\"drv\", val_col=\"hwy\", base_dy=47, step=4)\n", "print(p_values_df.shape)\n", "p_values_df" ] }, { "cell_type": "code", "execution_count": 6, "id": "836a3838-0579-4426-af16-2625de23c1c9", "metadata": {}, "outputs": [], "source": [ "p = ggplot(df, aes(\"drv\", \"hwy\", fill=\"drv\")) + \\\n", " geom_boxplot(alpha=.25) + \\\n", " geom_jitter(aes(color=\"drv\"), height=0, shape=1, alpha=.25, show_legend=False, seed=42)" ] }, { "cell_type": "markdown", "id": "b86cbee0-d7e8-405c-bbe8-d6815c4bf796", "metadata": {}, "source": [ "#### Basic p-value Annotations" ] }, { "cell_type": "code", "execution_count": 7, "id": "5800e6e7-eb1a-4008-926e-e7d94a1ffeda", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p + geom_bracket(aes(xmin=\"xmin\", xmax=\"xmax\", y=\"y\", label=\"p\"), \n", " data=p_values_df # <-- Pass p-values data to the brackets layer\n", " )" ] }, { "cell_type": "markdown", "id": "a8e73c0b-485c-4a54-834c-34cf744787e1", "metadata": {}, "source": [ "#### Format Labels with `label_format`" ] }, { "cell_type": "code", "execution_count": 8, "id": "3ffc450c-d4ab-44e0-b3ae-e09848224b8a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p + geom_bracket(aes(xmin=\"xmin\", xmax=\"xmax\", y=\"y\", label=\"p\"), data=p_values_df, label_format=\".2~g\")" ] }, { "cell_type": "markdown", "id": "af72dbca-98ec-40fb-bbb3-1c6a41e70a59", "metadata": {}, "source": [ "#### Format Labels with `label_pvalue()` from `mizani`\n", "\n", "Mizani is a scales package for graphics: https://pypi.org/project/mizani/ " ] }, { "cell_type": "code", "execution_count": 9, "id": "7cda0a3d-1c97-4118-bf79-ba9e5c08799c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from mizani.labels import label_pvalue\n", "\n", "formatter = label_pvalue(add_p=True)\n", "p_values_df = p_values_df.assign(label=lambda d: formatter(d[\"p\"]))\n", "\n", "p + geom_bracket(aes(xmin=\"xmin\", xmax=\"xmax\", y=\"y\", label=\"label\"), data=p_values_df)" ] }, { "cell_type": "markdown", "id": "a537e348-6518-45b3-8511-c7491efee2ff", "metadata": {}, "source": [ "#### Apply Custom Formatting Logic (Significance Stars)\n", "\n", "Map p-values to custom strings (like *** or ns) using your own function." ] }, { "cell_type": "code", "execution_count": 10, "id": "becbdfcd-2944-4e00-a206-96c2133b45fa", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def stars_formatter(value):\n", " if value <= 0.001:\n", " return \"***\"\n", " if value <= 0.01:\n", " return \"**\"\n", " if value <= 0.05:\n", " return \"*\"\n", " return \"ns\"\n", "\n", "p_values_df = p_values_df.assign(star=lambda d: d[\"p\"].map(stars_formatter))\n", "\n", "p + geom_bracket(aes(xmin=\"xmin\", xmax=\"xmax\", y=\"y\", label=\"star\"), data=p_values_df)" ] }, { "cell_type": "markdown", "id": "c6d1ca97-1c2c-4a64-8976-0c720b449771", "metadata": {}, "source": [ "## Annotating Dodged Groups\n", "\n", "To draw brackets between dodged groups, use `geom_bracket_dodge()`. Instead of using data coordinates, this layer uses group indices:\n", "\n", "- x: The main category on the x-axis.\n", "\n", "- istart, iend: The zero-based indices of the dodged groups to compare.\n", "\n" ] }, { "cell_type": "code", "execution_count": 11, "id": "08d9d3c5-02d3-4aa6-aeef-d2f4811363a2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ypxminxmaxxstarstartend
047.00.0000254.06.0compact***01
151.00.9103674.05.0compactns03
255.00.0301586.05.0compact*13
347.00.0301946.08.0midsize*12
451.00.0000096.04.0midsize***10
\n", "
" ], "text/plain": [ " y p xmin xmax x star start end\n", "0 47.0 0.000025 4.0 6.0 compact *** 0 1\n", "1 51.0 0.910367 4.0 5.0 compact ns 0 3\n", "2 55.0 0.030158 6.0 5.0 compact * 1 3\n", "3 47.0 0.030194 6.0 8.0 midsize * 1 2\n", "4 51.0 0.000009 6.0 4.0 midsize *** 1 0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Group indexes must be specified in the same order as they appear on the plot for other layers\n", "group_indices = {4.0: 0, 6.0: 1, 8.0: 2, 5.0: 3}\n", "p_values_grouped_df = pd.concat([\n", " get_p_values_data(df[df[\"class\"] == name], cat_col=\"cyl\", val_col=\"hwy\", base_dy=47, step=4).assign(\n", " x=name,\n", " star=lambda d: d[\"p\"].map(stars_formatter) if d.shape[0] > 0 else None,\n", " start=lambda d: d[\"xmin\"].map(group_indices) if d.shape[0] > 0 else None,\n", " end=lambda d: d[\"xmax\"].map(group_indices) if d.shape[0] > 0 else None,\n", " )\n", " for name in df[\"class\"].unique()\n", "]).reset_index(drop=True)\n", "\n", "p_values_grouped_df.head()" ] }, { "cell_type": "code", "execution_count": 12, "id": "7a119429-97f0-4548-998c-9c90664617b2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(ggplot(df, aes(\"class\", \"hwy\", fill=as_discrete(\"cyl\")))\n", " + geom_boxplot(alpha=.25)\n", " + geom_point(aes(color=as_discrete(\"cyl\")),\n", " position=position_jitterdodge(jitter_width=.2, jitter_height=0),\n", " shape=1, alpha=.25, show_legend=False, seed=42)\n", " + geom_bracket_dodge(aes(\"x\", \"y\", istart=\"start\", iend=\"end\", label=\"star\"), data=p_values_grouped_df)\n", " + scale_brewer([\"color\", \"fill\"], palette=\"Accent\", format=\"d\")\n", " + ggsize(1000, 500))" ] }, { "cell_type": "markdown", "id": "a959e877-8dc0-4274-9812-6929de1010fb", "metadata": {}, "source": [ "## Customizing Bracket Geometry" ] }, { "cell_type": "markdown", "id": "3093baf9-9855-4802-b030-378cb0dacd2e", "metadata": {}, "source": [ "#### Bottom-Axis (Upside Down) Annotations\n", "\n", "To place brackets at the bottom of a plot, use negative values for `lenstart` and `lenend`. \\\n", "You may also need to adjust `vjust` to position the labels correctly outside the bracket." ] }, { "cell_type": "code", "execution_count": 13, "id": "7027408a-bb86-431e-9c37-5d29f62155ac", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(3, 5)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ypxminxmaxlabel
089.041090e-28f4p<0.001
145.955333e-11frp<0.001
204.104577e-024rp=0.041
\n", "
" ], "text/plain": [ " y p xmin xmax label\n", "0 8 9.041090e-28 f 4 p<0.001\n", "1 4 5.955333e-11 f r p<0.001\n", "2 0 4.104577e-02 4 r p=0.041" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "upside_down_p_values_df = get_p_values_data(df, cat_col=\"drv\", val_col=\"hwy\", base_dy=8, step=-4).assign(\n", " label=lambda d: formatter(d[\"p\"])\n", ")\n", "print(upside_down_p_values_df.shape)\n", "upside_down_p_values_df" ] }, { "cell_type": "code", "execution_count": 14, "id": "3540e177-97d7-41a5-b7ae-b9d1f167a0ba", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(p + geom_bracket(aes(xmin=\"xmin\", xmax=\"xmax\", y=\"y\", label=\"label\"), data=upside_down_p_values_df,\n", " lenstart=-5, lenend=-5, # Negative values to reverse the direction of the brackets\n", " vjust=2)) # Specify vjust to move the labels under the brackets" ] }, { "cell_type": "markdown", "id": "13676b7e-66f9-4fe8-bc1f-5c8c2578ec9a", "metadata": {}, "source": [ "#### Precise Alignment with `tiplength_unit='identity'`\n", "\n", "By default, bracket tips use a fixed length. Setting `tiplength_unit='identity'` allows you to specify tip lengths in **data units**. \\\n", "This is essential when you want a bracket tip to reach a specific coordinate, such as the exact top of a boxplot or a particular data point." ] }, { "cell_type": "code", "execution_count": 15, "id": "dd243510-74ab-4cff-882f-3fb5bb997f1b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Upper limits of boxes for each category:\n", "{'4': 28, 'f': 44, 'r': 26}\n", "\n", "Dataset with p-values:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ypxminxmaxlabelstar
0479.041090e-28f4p<0.001***
1515.955333e-11frp<0.001***
2554.104577e-024rp=0.041*
\n", "
" ], "text/plain": [ " y p xmin xmax label star\n", "0 47 9.041090e-28 f 4 p<0.001 ***\n", "1 51 5.955333e-11 f r p<0.001 ***\n", "2 55 4.104577e-02 4 r p=0.041 *" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's recall the y-coordinates from the datasets\n", "print(f\"Upper limits of boxes for each category:\\n{df.groupby('drv')['hwy'].max().to_dict()}\\n\")\n", "print(\"Dataset with p-values:\")\n", "p_values_df" ] }, { "cell_type": "code", "execution_count": 16, "id": "bde35876-fe70-4b65-8600-550d5f86b0fd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The tip lengths are chosen so that each tip reaches either the previous bracket or the box\n", "# Example (first bracket):\n", "# lenstart = y - upper_box_limit_at_xmin - 1\n", "# -> 47 - 44 - 1 = 2\n", "# lenend = y - upper_box_limit_at_xmax - 1\n", "# -> 47 - 28 - 1 = 18\n", "(p + geom_bracket(aes(xmin=\"xmin\", xmax=\"xmax\", y=\"y\", label=\"label\",\n", " lenstart=[2, 1, 1],\n", " lenend=[18, 24, 1]),\n", " data=p_values_df,\n", " tiplength_unit='identity')) # identity units (data space)" ] }, { "cell_type": "markdown", "id": "b6daee5f-20ce-42ce-8b85-7539ddf90809", "metadata": {}, "source": [ "## Non-Statistical Annotations: Range Grouping\n", "\n", "Brackets can be used for more than just drawing p-values.\n", "For example, they can be used to highlight cluster boundaries in the following scatter plot:" ] }, { "cell_type": "code", "execution_count": 17, "id": "8cee04ac-5c93-47ac-8d74-9f47fb3222eb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_continuous_brackets_data(df, cat_col, primary_val_col, secondary_val_col, step_mult):\n", " secondary_min, secondary_max = df[secondary_val_col].min(), df[secondary_val_col].max()\n", " step = (secondary_max - secondary_min) * step_mult\n", " base = secondary_max + step\n", " df = df.copy()\n", " df[\"drv\"] = pd.Categorical(df[\"drv\"], categories=[\"r\", \"4\", \"f\"], ordered=True)\n", " df = df.sort_values(by=\"drv\")\n", " return pd.merge(\n", " df.groupby(cat_col, observed=False)[primary_val_col].min().to_frame(f\"min_{primary_val_col}\").reset_index(),\n", " df.groupby(cat_col, observed=False)[primary_val_col].max().to_frame(f\"max_{primary_val_col}\").reset_index(),\n", " on=\"drv\"\n", " ).reset_index().assign(**{f\"{secondary_val_col}_level\": lambda d: base + d[\"index\"] * step})\n", "\n", "(ggplot()\n", " + geom_point(aes(\"hwy\", \"cty\", color=\"drv\"), data=df, alpha=.25, show_legend=False)\n", "\n", " # Horizontal brackets for hwy ranges\n", " + geom_bracket(aes(xmin=\"min_hwy\", xmax=\"max_hwy\", y=\"cty_level\", label=\"drv\", color=\"drv\"),\n", " data=get_continuous_brackets_data(df, \"drv\", \"hwy\", \"cty\", .1))\n", " \n", " # Vertical brackets for cty ranges\n", " + geom_bracket(aes(x=\"hwy_level\", ymin=\"min_cty\", ymax=\"max_cty\", label=\"drv\", color=\"drv\"),\n", " data=get_continuous_brackets_data(df, \"drv\", \"cty\", \"hwy\", .05))\n", " + theme_minimal() + labs(x='hwy', y='cty'))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.23" } }, "nbformat": 4, "nbformat_minor": 5 }