{ "cells": [ { "cell_type": "markdown", "id": "a3ecb71d", "metadata": {}, "source": [ "# Q-Q Plots" ] }, { "cell_type": "markdown", "id": "dbaa1352", "metadata": {}, "source": [ "## Preparation" ] }, { "cell_type": "code", "execution_count": 1, "id": "248047ba", "metadata": { "execution": { "iopub.execute_input": "2025-11-05T13:50:18.760307Z", "iopub.status.busy": "2025-11-05T13:50:18.760214Z", "iopub.status.idle": "2025-11-05T13:50:18.909341Z", "shell.execute_reply": "2025-11-05T13:50:18.909026Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "from scipy.stats import norm, skewnorm, laplace, uniform\n", "\n", "from lets_plot import *" ] }, { "cell_type": "code", "execution_count": 2, "id": "2fc6f680-34e3-4a1c-ad6b-576dd1c70a6f", "metadata": { "execution": { "iopub.execute_input": "2025-11-05T13:50:18.910729Z", "iopub.status.busy": "2025-11-05T13:50:18.910610Z", "iopub.status.idle": "2025-11-05T13:50:18.912505Z", "shell.execute_reply": "2025-11-05T13:50:18.912341Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 3, "id": "2b6ed83f", "metadata": { "execution": { "iopub.execute_input": "2025-11-05T13:50:18.913521Z", "iopub.status.busy": "2025-11-05T13:50:18.913451Z", "iopub.status.idle": "2025-11-05T13:50:19.123468Z", "shell.execute_reply": "2025-11-05T13:50:19.123257Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(234, 12)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0manufacturermodeldisplyearcyltransdrvctyhwyflclass
01audia41.819994auto(l5)f1829pcompact
12audia41.819994manual(m5)f2129pcompact
23audia42.020084manual(m6)f2031pcompact
34audia42.020084auto(av)f2130pcompact
45audia42.819996auto(l5)f1626pcompact
\n", "
" ], "text/plain": [ " Unnamed: 0 manufacturer model displ year cyl trans drv cty hwy \\\n", "0 1 audi a4 1.8 1999 4 auto(l5) f 18 29 \n", "1 2 audi a4 1.8 1999 4 manual(m5) f 21 29 \n", "2 3 audi a4 2.0 2008 4 manual(m6) f 20 31 \n", "3 4 audi a4 2.0 2008 4 auto(av) f 21 30 \n", "4 5 audi a4 2.8 1999 6 auto(l5) f 16 26 \n", "\n", " fl class \n", "0 p compact \n", "1 p compact \n", "2 p compact \n", "3 p compact \n", "4 p compact " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv\")\n", "print(df.shape)\n", "hwy = \"hwy\"\n", "cty = \"cty\"\n", "drv = \"drv\"\n", "df.head()" ] }, { "cell_type": "markdown", "id": "1ededd07", "metadata": {}, "source": [ "## Two types of Q-Q plots" ] }, { "cell_type": "markdown", "id": "ac22b750", "metadata": {}, "source": [ "### `geom_qq()` and `geom_qq_line()` functions." ] }, { "cell_type": "code", "execution_count": 4, "id": "a688fc76", "metadata": { "execution": { "iopub.execute_input": "2025-11-05T13:50:19.124602Z", "iopub.status.busy": "2025-11-05T13:50:19.124533Z", "iopub.status.idle": "2025-11-05T13:50:19.159704Z", "shell.execute_reply": "2025-11-05T13:50:19.159412Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(sample=hwy)) + \\\n", " geom_qq(size=5, color=\"#3d3d3d\", alpha=.3) + \\\n", " geom_qq_line(size=1) + \\\n", " ggtitle(\"Distribution of highway miles per gallon\", \\\n", " \"Comparison of sample quantiles with normal distribution quantiles\")" ] }, { "cell_type": "markdown", "id": "af42ae28", "metadata": {}, "source": [ "### `geom_qq2()` and `geom_qq2_line()` functions." ] }, { "cell_type": "code", "execution_count": 5, "id": "db1a3b31", "metadata": { "execution": { "iopub.execute_input": "2025-11-05T13:50:19.160764Z", "iopub.status.busy": "2025-11-05T13:50:19.160665Z", "iopub.status.idle": "2025-11-05T13:50:19.166538Z", "shell.execute_reply": "2025-11-05T13:50:19.166342Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(x=cty, y=hwy)) + \\\n", " geom_qq2(size=5, color=\"#3d3d3d\", alpha=.3) + \\\n", " geom_qq2_line(size=1) + \\\n", " ggtitle(\"City miles vs. highway miles (per gallon)\", \\\n", " \"Comparison of quantiles of two sample distributions\")" ] }, { "cell_type": "markdown", "id": "45f606db", "metadata": {}, "source": [ "## Quick Q-Q plot: the `qq_plot()` function\n", "\n", "The `bistro` module provides the `qq_plot()` function, which combines the points and lines of both Q-Q plot types in a single call with some convenient defaults." ] }, { "cell_type": "code", "execution_count": 6, "id": "be8fa9f7", "metadata": { "execution": { "iopub.execute_input": "2025-11-05T13:50:19.167385Z", "iopub.status.busy": "2025-11-05T13:50:19.167313Z", "iopub.status.idle": "2025-11-05T13:50:19.169218Z", "shell.execute_reply": "2025-11-05T13:50:19.169041Z" } }, "outputs": [], "source": [ "from lets_plot.bistro.qq import qq_plot" ] }, { "cell_type": "code", "execution_count": 7, "id": "22cf7ec8", "metadata": { "execution": { "iopub.execute_input": "2025-11-05T13:50:19.169992Z", "iopub.status.busy": "2025-11-05T13:50:19.169921Z", "iopub.status.idle": "2025-11-05T13:50:19.192535Z", "shell.execute_reply": "2025-11-05T13:50:19.192357Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qq_plot(df, sample=hwy) + \\\n", " ggtitle(\"Distribution of highway miles per gallon\", \\\n", " \"Comparison of sample quantiles with normal distribution quantiles\")" ] }, { "cell_type": "markdown", "id": "e000c2f7", "metadata": {}, "source": [ "## Deviations investigation" ] }, { "cell_type": "code", "execution_count": 8, "id": "3c7fdc05", "metadata": { "execution": { "iopub.execute_input": "2025-11-05T13:50:19.193303Z", "iopub.status.busy": "2025-11-05T13:50:19.193231Z", "iopub.status.idle": "2025-11-05T13:50:19.195900Z", "shell.execute_reply": "2025-11-05T13:50:19.195718Z" } }, "outputs": [], "source": [ "n = 1_000\n", "norm_df = pd.DataFrame({\"sample\": norm.rvs(size=n, random_state=42)})\n", "skewed_df = pd.DataFrame({\"sample\": skewnorm.rvs(7, size=n, random_state=42)})\n", "neg_kurtosis_df = pd.DataFrame({\"sample\": uniform.rvs(size=n, random_state=42)})\n", "pos_kurtosis_df = pd.DataFrame({\"sample\": laplace.rvs(size=n, random_state=42)})" ] }, { "cell_type": "code", "execution_count": 9, "id": "21b8586b", "metadata": { "execution": { "iopub.execute_input": "2025-11-05T13:50:19.196630Z", "iopub.status.busy": "2025-11-05T13:50:19.196561Z", "iopub.status.idle": "2025-11-05T13:50:19.231354Z", "shell.execute_reply": "2025-11-05T13:50:19.231132Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p1 = ggplot(norm_df)\n", "p11 = p1 + geom_histogram(aes(x=\"sample\")) + ggtitle(\"Normal: histogram\")\n", "p12 = p1 + geom_qq(aes(sample=\"sample\")) + geom_qq_line(aes(sample=\"sample\")) + ggtitle(\"Normal: Q-Q plot\")\n", "p2 = ggplot(skewed_df)\n", "p21 = p2 + geom_histogram(aes(x=\"sample\")) + ggtitle(\"Skewed: histogram\")\n", "p22 = p2 + geom_qq(aes(sample=\"sample\")) + geom_qq_line(aes(sample=\"sample\")) + ggtitle(\"Skewed: Q-Q plot\")\n", "p3 = ggplot(neg_kurtosis_df)\n", "p31 = p3 + geom_histogram(aes(x=\"sample\")) + ggtitle(\"-Kurtosis: histogram\")\n", "p32 = p3 + geom_qq(aes(sample=\"sample\")) + geom_qq_line(aes(sample=\"sample\")) + ggtitle(\"-Kurtosis: Q-Q plot\")\n", "p4 = ggplot(pos_kurtosis_df)\n", "p41 = p4 + geom_histogram(aes(x=\"sample\")) + ggtitle(\"+Kurtosis: histogram\")\n", "p42 = p4 + geom_qq(aes(sample=\"sample\")) + geom_qq_line(aes(sample=\"sample\")) + ggtitle(\"+Kurtosis: Q-Q plot\")\n", "\n", "gggrid([p11, p12, p21, p22, p31, p32, p41, p42], ncol=2)" ] }, { "cell_type": "markdown", "id": "076e37c9", "metadata": {}, "source": [ "## Choose a distribution\n", "\n", "Use the `distribution` parameter of the `qq_plot()` function to choose the theoretical distribution that the sample quantiles are compared against." ] }, { "cell_type": "code", "execution_count": 10, "id": "a58255a0", "metadata": { "execution": { "iopub.execute_input": "2025-11-05T13:50:19.232226Z", "iopub.status.busy": "2025-11-05T13:50:19.232149Z", "iopub.status.idle": "2025-11-05T13:50:19.325453Z", "shell.execute_reply": "2025-11-05T13:50:19.325233Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p1 = qq_plot(df, hwy, distribution=\"norm\", quantiles=[.1, .9]) + \\\n", " ggtitle(\"Normal distribution\")\n", "p2 = qq_plot(df, hwy, distribution=\"uniform\", quantiles=[.1, .9]) + \\\n", " ggtitle(\"Uniform distribution\")\n", "p3 = qq_plot(df, hwy, distribution=\"t\", quantiles=[.1, .9]) + \\\n", " ggtitle(\"Student's t-distribution\")\n", "p4 = qq_plot(df, hwy, distribution=\"exp\", quantiles=[.1, .9]) + \\\n", " ggtitle(\"Exponential distribution\")\n", "\n", "gggrid([p1, p2, p3, p4], ncol=2)" ] }, { "cell_type": "markdown", "id": "e2dced15", "metadata": {}, "source": [ "## Q-Q stats with other geometries" ] }, { "cell_type": "code", "execution_count": 11, "id": "50ae6970", "metadata": { "execution": { "iopub.execute_input": "2025-11-05T13:50:19.326293Z", "iopub.status.busy": "2025-11-05T13:50:19.326219Z", "iopub.status.idle": "2025-11-05T13:50:19.335846Z", "shell.execute_reply": "2025-11-05T13:50:19.335654Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(x=cty, y=hwy, color=drv)) + \\\n", " geom_line(stat=\"qq2\") + \\\n", " geom_point(stat=\"qq2\", shape=15) + \\\n", " geom_line(stat=\"qq2_line\", color='#636363', linetype=5) + \\\n", " facet_grid(x=drv, scales=\"free\") + \\\n", " xlab(\"cty quantiles\") + ylab(\"hwy quantiles\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }