{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Group-aware Sampling\n", "In a large dataset with groups, the choice of sampling method may depend on the number of groups and the group size. \n", "\n", "In this example we consider a line plot where each line corresponds to a group." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:26.908842Z", "iopub.status.busy": "2024-04-17T07:30:26.908680Z", "iopub.status.idle": "2024-04-17T07:30:27.222882Z", "shell.execute_reply": "2024-04-17T07:30:27.222431Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import numpy as np\n", "from lets_plot import *\n", "\n", "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.236218Z", "iopub.status.busy": "2024-04-17T07:30:27.236058Z", "iopub.status.idle": "2024-04-17T07:30:27.238782Z", "shell.execute_reply": "2024-04-17T07:30:27.238489Z" } }, "outputs": [], "source": [ "def data(n_per_line, n_groups):\n", "    \"\"\"Generate sine-wave data for `n_groups` lines of `n_per_line` points each.\n", "\n", "    All groups share one x grid, evenly spaced over [-pi, pi] (endpoints included).\n", "    Group i holds sin(x) scaled by the i-th of `n_groups` amplitudes that are\n", "    evenly spaced over [1, 10].\n", "\n", "    Returns a dict of equal-length lists: 'x', 'y' and 'cond' (the group index\n", "    as a string).\n", "    \"\"\"\n", "    # linspace includes both endpoints and works for a count of 1, which a\n", "    # float-step arange does not guarantee.\n", "    x_stops = np.linspace(-np.pi, np.pi, n_per_line)\n", "\n", "    y_min, y_max = 1, 10\n", "    y_multiplier = np.linspace(y_min, y_max, n_groups)\n", "\n", "    x = []\n", "    y = []\n", "    c = []\n", "    for i in range(n_groups):\n", "        x.extend(x_stops)\n", "        y.extend(np.sin(x_stops) * y_multiplier[i])\n", "        c.extend([str(i)] * len(x_stops))\n", "\n", "    return dict(x=x, y=y, cond=c)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.239851Z", "iopub.status.busy": "2024-04-17T07:30:27.239715Z", "iopub.status.idle": "2024-04-17T07:30:27.241398Z", "shell.execute_reply": "2024-04-17T07:30:27.241055Z" } }, "outputs": [], "source": [ "p = ggplot(mapping=aes('x','y',color='cond'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1. Data with large number of points per group but small number of groups."
] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.242830Z", "iopub.status.busy": "2024-04-17T07:30:27.242704Z", "iopub.status.idle": "2024-04-17T07:30:27.248449Z", "shell.execute_reply": "2024-04-17T07:30:27.248259Z" } }, "outputs": [], "source": [ "n_per_line, n_groups = 1000, 10\n", "dat = data(n_per_line, n_groups)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:30:27.249755Z", "iopub.status.busy": "2024-04-17T07:30:27.249617Z", "iopub.status.idle": "2024-04-17T07:30:27.328808Z", "shell.execute_reply": "2024-04-17T07:30:27.328600Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " " ], "text/plain": [ "