{ "cells": [ { "cell_type": "markdown", "id": "a3ecb71d", "metadata": {}, "source": [ "# Q-Q Plots" ] }, { "cell_type": "markdown", "id": "dbaa1352", "metadata": {}, "source": [ "## Preparation" ] }, { "cell_type": "code", "execution_count": 1, "id": "248047ba", "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:36:12.028635Z", "iopub.status.busy": "2024-04-17T07:36:12.028539Z", "iopub.status.idle": "2024-04-17T07:36:12.469111Z", "shell.execute_reply": "2024-04-17T07:36:12.468671Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "from scipy.stats import norm, skewnorm, laplace, uniform\n", "\n", "from lets_plot import *\n", "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 2, "id": "5a316834", "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:36:12.470448Z", "iopub.status.busy": "2024-04-17T07:36:12.470300Z", "iopub.status.idle": "2024-04-17T07:36:12.472616Z", "shell.execute_reply": "2024-04-17T07:36:12.472418Z" } }, "outputs": [], "source": [ "def plot_matrix(plots=None, width=400, height=300, columns=2):\n", "    \"\"\"Lay out `plots` on a grid and show them as one GGBunch.\n", "\n", "    Plots fill the grid row by row; plot `i` is placed at\n", "    (column * width, row * height) with size `width` x `height`.\n", "\n", "    plots   -- iterable of plots; `None` (the default) means no plots\n", "    width   -- width of one grid cell\n", "    height  -- height of one grid cell\n", "    columns -- number of cells per row\n", "\n", "    Returns whatever `bunch.show()` returns.\n", "    \"\"\"\n", "    # `None` replaces the former mutable `[]` default, which is shared between calls.\n", "    if plots is None:\n", "        plots = []\n", "    bunch = GGBunch()\n", "    for i, plot in enumerate(plots):\n", "        # divmod keeps the row/column arithmetic in integers (no float division).\n", "        row, column = divmod(i, columns)\n", "        bunch.add_plot(plot, column * width, row * height, width, height)\n", "    return bunch.show()" ] }, { "cell_type": "code", "execution_count": 3, "id": "2b6ed83f", "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:36:12.473407Z", "iopub.status.busy": "2024-04-17T07:36:12.473335Z", "iopub.status.idle": "2024-04-17T07:36:12.831197Z", "shell.execute_reply": "2024-04-17T07:36:12.830809Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(234, 12)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0manufacturermodeldisplyearcyltransdrvctyhwyflclass
01audia41.819994auto(l5)f1829pcompact
12audia41.819994manual(m5)f2129pcompact
23audia42.020084manual(m6)f2031pcompact
34audia42.020084auto(av)f2130pcompact
45audia42.819996auto(l5)f1626pcompact
\n", "
" ], "text/plain": [ " Unnamed: 0 manufacturer model displ year cyl trans drv cty hwy \\\n", "0 1 audi a4 1.8 1999 4 auto(l5) f 18 29 \n", "1 2 audi a4 1.8 1999 4 manual(m5) f 21 29 \n", "2 3 audi a4 2.0 2008 4 manual(m6) f 20 31 \n", "3 4 audi a4 2.0 2008 4 auto(av) f 21 30 \n", "4 5 audi a4 2.8 1999 6 auto(l5) f 16 26 \n", "\n", " fl class \n", "0 p compact \n", "1 p compact \n", "2 p compact \n", "3 p compact \n", "4 p compact " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv\")\n", "print(df.shape)\n", "hwy = \"hwy\"\n", "cty = \"cty\"\n", "drv = \"drv\"\n", "df.head()" ] }, { "cell_type": "markdown", "id": "1ededd07", "metadata": {}, "source": [ "## Two types of Q-Q plots" ] }, { "cell_type": "markdown", "id": "ac22b750", "metadata": {}, "source": [ "### `geom_qq()` and `geom_qq_line()` functions." ] }, { "cell_type": "code", "execution_count": 4, "id": "a688fc76", "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:36:12.832421Z", "iopub.status.busy": "2024-04-17T07:36:12.832344Z", "iopub.status.idle": "2024-04-17T07:36:12.868304Z", "shell.execute_reply": "2024-04-17T07:36:12.868002Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(sample=hwy)) + \\\n", " geom_qq(size=5, color=\"#3d3d3d\", alpha=.3) + \\\n", " geom_qq_line(size=1) + \\\n", " ggtitle(\"Distribution of highway miles per gallon\", \\\n", " \"Comparison of sample quantiles with normal distribution quantiles\")" ] }, { "cell_type": "markdown", "id": "af42ae28", "metadata": {}, "source": [ "### `geom_qq2()` and `geom_qq2_line()` functions." ] }, { "cell_type": "code", "execution_count": 5, "id": "db1a3b31", "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:36:12.869526Z", "iopub.status.busy": "2024-04-17T07:36:12.869394Z", "iopub.status.idle": "2024-04-17T07:36:12.873950Z", "shell.execute_reply": "2024-04-17T07:36:12.873770Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(x=cty, y=hwy)) + \\\n", " geom_qq2(size=5, color=\"#3d3d3d\", alpha=.3) + \\\n", " geom_qq2_line(size=1) + \\\n", " ggtitle(\"City miles vs. highway miles (per gallon)\", \\\n", " \"Comparison of quantiles of two sample distributions\")" ] }, { "cell_type": "markdown", "id": "45f606db", "metadata": {}, "source": [ "## Quick Q-Q plot: the `qq_plot()` function\n", "\n", "The `bistro` module provides the `qq_plot()` function, which combines the points and lines of both Q-Q plot types into a single call with some convenient defaults." ] }, { "cell_type": "code", "execution_count": 6, "id": "be8fa9f7", "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:36:12.875155Z", "iopub.status.busy": "2024-04-17T07:36:12.874878Z", "iopub.status.idle": "2024-04-17T07:36:12.876905Z", "shell.execute_reply": "2024-04-17T07:36:12.876718Z" } }, "outputs": [], "source": [ "from lets_plot.bistro.qq import qq_plot" ] }, { "cell_type": "code", "execution_count": 7, "id": "22cf7ec8", "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:36:12.877701Z", "iopub.status.busy": "2024-04-17T07:36:12.877583Z", "iopub.status.idle": "2024-04-17T07:36:12.881950Z", "shell.execute_reply": "2024-04-17T07:36:12.881770Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qq_plot(df, sample=hwy) + \\\n", " ggtitle(\"Distribution of highway miles per gallon\", \\\n", " \"Comparison of sample quantiles with normal distribution quantiles\")" ] }, { "cell_type": "markdown", "id": "e000c2f7", "metadata": {}, "source": [ "## Deviations investigation" ] }, { "cell_type": "code", "execution_count": 8, "id": "3c7fdc05", "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:36:12.882804Z", "iopub.status.busy": "2024-04-17T07:36:12.882683Z", "iopub.status.idle": "2024-04-17T07:36:12.885353Z", "shell.execute_reply": "2024-04-17T07:36:12.885174Z" } }, "outputs": [], "source": [ "n = 1_000\n", "norm_df = pd.DataFrame({\"sample\": norm.rvs(size=n, random_state=42)})\n", "skewed_df = pd.DataFrame({\"sample\": skewnorm.rvs(7, size=n, random_state=42)})\n", "neg_kurtosis_df = pd.DataFrame({\"sample\": uniform.rvs(size=n, random_state=42)})\n", "pos_kurtosis_df = pd.DataFrame({\"sample\": laplace.rvs(size=n, random_state=42)})" ] }, { "cell_type": "code", "execution_count": 9, "id": "21b8586b", "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:36:12.886181Z", "iopub.status.busy": "2024-04-17T07:36:12.886040Z", "iopub.status.idle": "2024-04-17T07:36:12.917221Z", "shell.execute_reply": "2024-04-17T07:36:12.917025Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "p1 = ggplot(norm_df)\n", "p11 = p1 + geom_histogram(aes(x=\"sample\")) + ggtitle(\"Normal: histogram\")\n", "p12 = p1 + geom_qq(aes(sample=\"sample\")) + geom_qq_line(aes(sample=\"sample\")) + ggtitle(\"Normal: Q-Q plot\")\n", "p2 = ggplot(skewed_df)\n", "p21 = p2 + geom_histogram(aes(x=\"sample\")) + ggtitle(\"Skewed: histogram\")\n", "p22 = p2 + geom_qq(aes(sample=\"sample\")) + geom_qq_line(aes(sample=\"sample\")) + ggtitle(\"Skewed: Q-Q plot\")\n", "p3 = ggplot(neg_kurtosis_df)\n", "p31 = p3 + geom_histogram(aes(x=\"sample\")) + ggtitle(\"-Kurtosis: histogram\")\n", "p32 = p3 + geom_qq(aes(sample=\"sample\")) + geom_qq_line(aes(sample=\"sample\")) + ggtitle(\"-Kurtosis: Q-Q plot\")\n", "p4 = ggplot(pos_kurtosis_df)\n", "p41 = p4 + geom_histogram(aes(x=\"sample\")) + ggtitle(\"+Kurtosis: histogram\")\n", "p42 = p4 + geom_qq(aes(sample=\"sample\")) + geom_qq_line(aes(sample=\"sample\")) + ggtitle(\"+Kurtosis: Q-Q plot\")\n", "\n", "plot_matrix([p11, p12, p21, p22, p31, p32, p41, p42])" ] }, { "cell_type": "markdown", "id": "076e37c9", "metadata": {}, "source": [ "## Choose a distribution\n", "\n", "The `distribution` parameter of the `qq_plot()` function." ] }, { "cell_type": "code", "execution_count": 10, "id": "a58255a0", "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:36:12.918325Z", "iopub.status.busy": "2024-04-17T07:36:12.918250Z", "iopub.status.idle": "2024-04-17T07:36:12.931847Z", "shell.execute_reply": "2024-04-17T07:36:12.931663Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "p1 = qq_plot(df, hwy, distribution=\"norm\", quantiles=[.1, .9]) + \\\n", " ggtitle(\"Normal distribution\")\n", "p2 = qq_plot(df, hwy, distribution=\"uniform\", quantiles=[.1, .9]) + \\\n", " ggtitle(\"Uniform distribution\")\n", "p3 = qq_plot(df, hwy, distribution=\"t\", quantiles=[.1, .9]) + \\\n", " ggtitle(\"Student's t-distribution\")\n", "p4 = qq_plot(df, hwy, distribution=\"exp\", quantiles=[.1, .9]) + \\\n", " ggtitle(\"Exponential distribution\")\n", "\n", "plot_matrix([p1, p2, p3, p4])" ] }, { "cell_type": "markdown", "id": "e2dced15", "metadata": {}, "source": [ "## Q-Q stats with other geometries" ] }, { "cell_type": "code", "execution_count": 11, "id": "50ae6970", "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:36:12.932708Z", "iopub.status.busy": "2024-04-17T07:36:12.932636Z", "iopub.status.idle": "2024-04-17T07:36:12.939797Z", "shell.execute_reply": "2024-04-17T07:36:12.939626Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(x=cty, y=hwy, color=drv)) + \\\n", " geom_line(stat=\"qq2\") + \\\n", " geom_point(stat=\"qq2\", shape=15) + \\\n", " geom_line(stat=\"qq2_line\", color='#636363', linetype=5) + \\\n", " facet_grid(x=drv, scales=\"free\") + \\\n", " xlab(\"cty quantiles\") + ylab(\"hwy quantiles\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }