{ "cells": [ { "cell_type": "markdown", "id": "2ac0e74d", "metadata": {}, "source": [ "# `\"boxplot_outlier\"` Statistics\n", "\n", "Computes the outlier values shown on a `\"box-plot\"` chart, but can be used in alternative visualizations as well." ] }, { "cell_type": "code", "execution_count": 1, "id": "8ab5b1ed", "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:05:52.906138Z", "iopub.status.busy": "2024-04-26T12:05:52.906138Z", "iopub.status.idle": "2024-04-26T12:05:53.913767Z", "shell.execute_reply": "2024-04-26T12:05:53.913767Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "from lets_plot import *\n", "from lets_plot.mapping import as_discrete" ] }, { "cell_type": "code", "execution_count": 2, "id": "b0b9825c", "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:05:53.913767Z", "iopub.status.busy": "2024-04-26T12:05:53.913767Z", "iopub.status.idle": "2024-04-26T12:05:53.929513Z", "shell.execute_reply": "2024-04-26T12:05:53.929513Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 3, "id": "5e6d519e", "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:05:53.929513Z", "iopub.status.busy": "2024-04-26T12:05:53.929513Z", "iopub.status.idle": "2024-04-26T12:05:54.071371Z", "shell.execute_reply": "2024-04-26T12:05:54.071371Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0manufacturermodeldisplyearcyltransdrvctyhwyflclass
01audia41.819994auto(l5)f1829pcompact
12audia41.819994manual(m5)f2129pcompact
23audia42.020084manual(m6)f2031pcompact
\n", "
" ], "text/plain": [ " Unnamed: 0 manufacturer model displ year cyl trans drv cty hwy \\\n", "0 1 audi a4 1.8 1999 4 auto(l5) f 18 29 \n", "1 2 audi a4 1.8 1999 4 manual(m5) f 21 29 \n", "2 3 audi a4 2.0 2008 4 manual(m6) f 20 31 \n", "\n", " fl class \n", "0 p compact \n", "1 p compact \n", "2 p compact " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpg = pd.read_csv('https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv')\n", "mpg.head(3)" ] }, { "cell_type": "code", "execution_count": 4, "id": "34ba78e2-e63a-402b-95dc-4ee2b38c373c", "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:05:54.071371Z", "iopub.status.busy": "2024-04-26T12:05:54.071371Z", "iopub.status.idle": "2024-04-26T12:05:54.087027Z", "shell.execute_reply": "2024-04-26T12:05:54.087027Z" } }, "outputs": [], "source": [ "p = (ggplot(mpg, aes(y='hwy')) \n", " + scale_color_viridis(option=\"magma\", end=0.8) \n", " + ggsize(700, 400))" ] }, { "cell_type": "code", "execution_count": 5, "id": "f11eacef-aab2-4f35-a3b5-bdf0eddc4f8f", "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:05:54.087027Z", "iopub.status.busy": "2024-04-26T12:05:54.087027Z", "iopub.status.idle": "2024-04-26T12:05:54.102619Z", "shell.execute_reply": "2024-04-26T12:05:54.102619Z" } }, "outputs": [], "source": [ "# Ordering by variable \"..middle..\" when using stat \"boxplot\" or \"boxplot_outlier\".\n", "class_by_middle=as_discrete('class', order_by='..middle..', order=1)\n", "\n", "# Equivalent ordering by variable \"..y..\" when using `stat_summary()`.\n", "class_by_y=as_discrete('class', order_by='..y..', order=1)\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "70eb612b", "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:05:54.102619Z", "iopub.status.busy": "2024-04-26T12:05:54.102619Z", "iopub.status.idle": "2024-04-26T12:05:54.227824Z", "shell.execute_reply": "2024-04-26T12:05:54.227824Z" } }, 
"outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p + geom_boxplot(aes(x=class_by_middle, color='..middle..'))" ] }, { "cell_type": "markdown", "id": "05971a56-2849-4992-9bd4-948957cc2ced", "metadata": {}, "source": [ "#### 1. Show Just Outliers\n", "\n", "Use `stat=\"boxplot_outlier\"`." ] }, { "cell_type": "code", "execution_count": 7, "id": "212344e4-3809-4529-8e61-c1ead73f6e1a", "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:05:54.227824Z", "iopub.status.busy": "2024-04-26T12:05:54.227824Z", "iopub.status.idle": "2024-04-26T12:05:54.243548Z", "shell.execute_reply": "2024-04-26T12:05:54.243548Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outliers = geom_point(aes(x=class_by_middle, color='..middle..'), stat=\"boxplot_outlier\")\n", "\n", "p + outliers" ] }, { "cell_type": "markdown", "id": "b8877904", "metadata": {}, "source": [ "#### 2. Add Ribbons and Mid-points" ] }, { "cell_type": "code", "execution_count": 8, "id": "2196bc20-ae2d-4392-a024-9a862318eee1", "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:05:54.243548Z", "iopub.status.busy": "2024-04-26T12:05:54.243548Z", "iopub.status.idle": "2024-04-26T12:05:54.275164Z", "shell.execute_reply": "2024-04-26T12:05:54.275164Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ribbon1 = geom_ribbon(aes(\n", " x=class_by_middle, \n", " ymin=\"..ymin..\",\n", " ymax=\"..ymax..\"), stat=\"boxplot\")\n", "\n", "ribbon2 = geom_ribbon(aes(\n", " x=class_by_middle, \n", " ymin=\"..lower..\",\n", " ymax=\"..upper..\"), stat=\"boxplot\")\n", "\n", "mid_points = stat_summary(aes(x=class_by_y, color=\"..y..\"), \n", " fun=\"mq\", \n", " geom=\"point\", shape=15, size=6)\n", "\n", "p + ribbon1 + ribbon2 + mid_points + outliers + labs(color=\"Middle\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }