{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Group-aware Sampling\n", "In large dataset with groups, choice of the sampling method may depend on the number of groups and the group size. \n", "\n", "In this example we consider line plot where each line correspons to a group." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:26.908842Z", "iopub.status.busy": "2024-04-17T07:30:26.908680Z", "iopub.status.idle": "2024-04-17T07:30:27.222882Z", "shell.execute_reply": "2024-04-17T07:30:27.222431Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import numpy as np\n", "from lets_plot import *\n", "\n", "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.236218Z", "iopub.status.busy": "2024-04-17T07:30:27.236058Z", "iopub.status.idle": "2024-04-17T07:30:27.238782Z", "shell.execute_reply": "2024-04-17T07:30:27.238489Z" } }, "outputs": [], "source": [ "def data(n_per_line, n_groups):\n", " x_step = 2 * np.pi / (n_per_line - 1)\n", " little_delta = x_step / 100\n", " x_stops =np.arange(-np.pi, np.pi + little_delta, x_step)\n", "\n", " y_min, y_max = 1, 10\n", " y_step = (y_max - y_min) / (n_groups - 1)\n", " little_delta = y_step / 100\n", " y_multiplier = np.arange(y_min, y_max + little_delta, y_step)\n", "\n", " x = []\n", " y = []\n", " c = []\n", " for i in range(n_groups):\n", " x.extend(x_stops)\n", " y.extend([np.sin(x) * y_multiplier[i] for x in x_stops])\n", " c.extend([str(i) for _ in x_stops])\n", "\n", " return dict(x = x, y = y, cond = c)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.239851Z", "iopub.status.busy": "2024-04-17T07:30:27.239715Z", "iopub.status.idle": "2024-04-17T07:30:27.241398Z", "shell.execute_reply": "2024-04-17T07:30:27.241055Z" } }, "outputs": [], "source": [ "p = ggplot(mapping=aes('x','y',color='cond'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1. Data with large number of points per group but small number of groups." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.242830Z", "iopub.status.busy": "2024-04-17T07:30:27.242704Z", "iopub.status.idle": "2024-04-17T07:30:27.248449Z", "shell.execute_reply": "2024-04-17T07:30:27.248259Z" } }, "outputs": [], "source": [ "n_per_line, n_groups = 1000, 10\n", "dat = data(n_per_line, n_groups)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.249755Z", "iopub.status.busy": "2024-04-17T07:30:27.249617Z", "iopub.status.idle": "2024-04-17T07:30:27.328808Z", "shell.execute_reply": "2024-04-17T07:30:27.328600Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The default 'systematic' sampling is fine in this case.\n", "p + geom_line(data=dat)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. Data with small number of points per group but large number of groups." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.332514Z", "iopub.status.busy": "2024-04-17T07:30:27.332423Z", "iopub.status.idle": "2024-04-17T07:30:27.348942Z", "shell.execute_reply": "2024-04-17T07:30:27.348748Z" } }, "outputs": [], "source": [ "n_per_line, n_groups = 30, 1000\n", "dat = data(n_per_line, n_groups)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.350478Z", "iopub.status.busy": "2024-04-17T07:30:27.350395Z", "iopub.status.idle": "2024-04-17T07:30:27.509492Z", "shell.execute_reply": "2024-04-17T07:30:27.509264Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The default systematic line sampling doesn't work very well in this case.\n", "p + geom_line(data=dat)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.517420Z", "iopub.status.busy": "2024-04-17T07:30:27.517337Z", "iopub.status.idle": "2024-04-17T07:30:27.586763Z", "shell.execute_reply": "2024-04-17T07:30:27.586560Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Random group sampling works better.\n", "p + geom_line(data=dat, sampling=sampling_group_random(10))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.587914Z", "iopub.status.busy": "2024-04-17T07:30:27.587828Z", "iopub.status.idle": "2024-04-17T07:30:27.655528Z", "shell.execute_reply": "2024-04-17T07:30:27.655330Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Systematic group sampling works even better.\n", "p + geom_line(data=dat, sampling=sampling_group_systematic(10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3. Data with large number of points per group and large number of groups." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.656704Z", "iopub.status.busy": "2024-04-17T07:30:27.656588Z", "iopub.status.idle": "2024-04-17T07:30:27.743363Z", "shell.execute_reply": "2024-04-17T07:30:27.743053Z" } }, "outputs": [], "source": [ "n_per_line, n_groups = 1000, 200\n", "dat = data(n_per_line, n_groups)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.744513Z", "iopub.status.busy": "2024-04-17T07:30:27.744435Z", "iopub.status.idle": "2024-04-17T07:30:28.290179Z", "shell.execute_reply": "2024-04-17T07:30:28.289961Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p + geom_line(data=dat)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:28.299529Z", "iopub.status.busy": "2024-04-17T07:30:28.299441Z", "iopub.status.idle": "2024-04-17T07:30:28.753506Z", "shell.execute_reply": "2024-04-17T07:30:28.753303Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# A combination of points and group sampling works.\n", "p + geom_line(data=dat, sampling=sampling_group_systematic(10)+sampling_systematic(200))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 1 }