{ "cells": [ { "cell_type": "markdown", "id": "8f4391f1", "metadata": {}, "source": [ "# Removed Data Points Messages\n", "\n", "Small examples for checking computation messages from stats." ] }, { "cell_type": "code", "execution_count": 1, "id": "7c632d60", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "from lets_plot import *\n", "\n", "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 2, "id": "b13c0ddb", "metadata": {}, "outputs": [], "source": [ "df_x = pd.DataFrame({\n", " \"x\": [1, 2, np.nan, 4, 5, np.nan]\n", "})\n", "\n", "df_xy = pd.DataFrame({\n", " \"x\": [1, 2, np.nan, 4, 5, np.nan],\n", " \"y\": [2, np.nan, 3, 4, 5, np.nan],\n", " \"w\": [1, 1, 1, np.nan, 1, 1],\n", " \"g\": [\"A\", \"A\", \"A\", \"B\", \"B\", \"B\"]\n", "})\n", "\n", "df_qq2 = pd.DataFrame({\n", " \"x\": [4, np.nan, 1, 9, 6, 2, 10, np.nan, 7, 5],\n", " \"y\": [7, 1, 9, 10, 4, np.nan, 3, np.nan, 6, 5]\n", "})\n", "\n", "df_smooth = pd.DataFrame({\n", " \"x\": [1, 2, 3, 4, 5, 6, 7, 8],\n", " \"y\": [1, np.nan, 3, 4, 5, np.nan, 7, 8]\n", "})\n", "\n", "df_ridges = pd.DataFrame({\n", " \"x\": [1, 2, np.nan, 4, 1, 2, 3, np.nan],\n", " \"y\": [0, 0, 0, 0, 1, 1, 1, 1]\n", "})\n", "\n", "df_sampling = pd.DataFrame({\n", " \"x\": list(range(10)),\n", " \"y\": list(range(10))\n", "})" ] }, { "cell_type": "markdown", "id": "afd7ee1b", "metadata": {}, "source": [ "## Count / Count2d" ] }, { "cell_type": "code", "execution_count": 3, "id": "1d94db0f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_x, aes(\"x\")) + geom_bar()" ] }, { "cell_type": "code", "execution_count": 4, "id": "c8393d62", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"x\", \"y\")) + geom_pie()" ] }, { "cell_type": "markdown", "id": "22e52109", "metadata": {}, "source": [ "## Bin / Bin2d / Binhex" ] }, { "cell_type": "code", "execution_count": 5, "id": "c179d80d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_x, aes(\"x\")) + geom_histogram(bins=4)" ] }, { "cell_type": "code", "execution_count": 6, "id": "9559e8d5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"x\", \"y\")) + geom_bin2d(bins=[3, 3])" ] }, { "cell_type": "code", "execution_count": 7, "id": "6d5b490c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"x\", \"y\")) + geom_hex(bins=[3, 3])" ] }, { "cell_type": "markdown", "id": "751f9475", "metadata": {}, "source": [ "## Dotplot / YDotplot" ] }, { "cell_type": "code", "execution_count": 8, "id": "9b4cc90a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_x, aes(\"x\")) + geom_dotplot(binwidth=1.0)" ] }, { "cell_type": "code", "execution_count": 9, "id": "f28119b0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"x\", \"y\")) + geom_ydotplot(binwidth=1.0)" ] }, { "cell_type": "markdown", "id": "dd28977c", "metadata": {}, "source": [ "## Summary / Summary Bin" ] }, { "cell_type": "code", "execution_count": 10, "id": "318379c4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"g\", \"y\")) + stat_summary()" ] }, { "cell_type": "code", "execution_count": 11, "id": "c9b57274", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"x\", \"y\")) + stat_summary_bin(bins=3)" ] }, { "cell_type": "markdown", "id": "399f8ef8", "metadata": {}, "source": [ "## Boxplot\n", "\n", "`geom_boxplot()` exercises boxplot-related stats, including outlier handling." ] }, { "cell_type": "code", "execution_count": 12, "id": "7af092f7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"g\", \"y\")) + geom_boxplot()" ] }, { "cell_type": "markdown", "id": "eed10133", "metadata": {}, "source": [ "## Density Family" ] }, { "cell_type": "code", "execution_count": 13, "id": "815a78f2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_x, aes(\"x\")) + geom_density()" ] }, { "cell_type": "code", "execution_count": 14, "id": "7893b527", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"x\", \"y\")) + geom_density2d()" ] }, { "cell_type": "code", "execution_count": 15, "id": "37c1d481", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"x\", \"y\")) + geom_density2df()" ] }, { "cell_type": "code", "execution_count": 16, "id": "cb8287d8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"g\", \"y\")) + geom_violin()" ] }, { "cell_type": "code", "execution_count": 17, "id": "b5424476", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"g\", \"y\")) + geom_sina()" ] }, { "cell_type": "code", "execution_count": 18, "id": "a767c52e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_ridges, aes(\"x\", \"y\")) + geom_area_ridges()" ] }, { "cell_type": "markdown", "id": "ec7ffcdd", "metadata": {}, "source": [ "## ECDF" ] }, { "cell_type": "code", "execution_count": 19, "id": "6c71675c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_x, aes(\"x\")) + stat_ecdf()" ] }, { "cell_type": "markdown", "id": "b2709d6d", "metadata": {}, "source": [ "## QQ / QQ Line" ] }, { "cell_type": "code", "execution_count": 20, "id": "f5d328fd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_x, aes(sample=\"x\")) + geom_qq()" ] }, { "cell_type": "code", "execution_count": 21, "id": "b81bc629", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_x, aes(sample=\"x\")) + geom_qq_line()" ] }, { "cell_type": "markdown", "id": "59b69fe7", "metadata": {}, "source": [ "## QQ2 / QQ2 Line\n", "\n", "For `qq2`/`qq2line`, a row is counted as removed only when both `x` and `y` are non-finite. In `df_qq2`, only the eighth row (index 7) has both values missing." ] }, { "cell_type": "code", "execution_count": 22, "id": "fa4147e1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_qq2, aes(\"x\", \"y\")) + geom_qq2()" ] }, { "cell_type": "code", "execution_count": 23, "id": "bfd03c63", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_qq2, aes(\"x\", \"y\")) + geom_qq2_line()" ] }, { "cell_type": "markdown", "id": "56868c17", "metadata": {}, "source": [ "## Smooth / Smooth Summary\n", "\n", "This example should show non-finite removal. It also uses LOESS sampling (`max_n=6`) to check the sampling message inside stat." ] }, { "cell_type": "code", "execution_count": 24, "id": "da144e56", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_smooth, aes(\"x\", \"y\")) + \\\n", " geom_smooth(method=\"loess\", max_n=6, seed=42, labels=smooth_labels())" ] }, { "cell_type": "markdown", "id": "55927ce7", "metadata": {}, "source": [ "## Point Density" ] }, { "cell_type": "code", "execution_count": 25, "id": "21a83cad", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df_xy, aes(\"x\", \"y\")) + geom_pointdensity()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.20" } }, "nbformat": 4, "nbformat_minor": 5 }