{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:04.757121Z", "iopub.status.busy": "2024-08-23T10:36:04.757038Z", "iopub.status.idle": "2024-08-23T10:36:05.083704Z", "shell.execute_reply": "2024-08-23T10:36:05.083396Z" } }, "outputs": [], "source": [ "from lets_plot import *" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.085338Z", "iopub.status.busy": "2024-08-23T10:36:05.085188Z", "iopub.status.idle": "2024-08-23T10:36:05.087313Z", "shell.execute_reply": "2024-08-23T10:36:05.087138Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.100804Z", "iopub.status.busy": "2024-08-23T10:36:05.100692Z", "iopub.status.idle": "2024-08-23T10:36:05.106168Z", "shell.execute_reply": "2024-08-23T10:36:05.105980Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(400, 2)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
condrating
0A-1.085631
1A0.997345
2A0.282978
3A-1.506295
4A-0.578600
\n", "
" ], "text/plain": [ " cond rating\n", "0 A -1.085631\n", "1 A 0.997345\n", "2 A 0.282978\n", "3 A -1.506295\n", "4 A -0.578600" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# This example was found at: www.cookbook-r.com/Graphs/Plotting_distributions_(ggplot2)\n", "def get_data():\n", " import numpy as np\n", " import pandas as pd\n", "\n", " np.random.seed(123)\n", "\n", " return pd.DataFrame(dict(\n", " cond=np.repeat([\"A\", \"B\"], 200),\n", " rating=np.concatenate((np.random.normal(0, 1, 200), np.random.normal(.8, 1, 200)))\n", " ))\n", "\n", "df = get_data()\n", "print(df.shape)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.107232Z", "iopub.status.busy": "2024-08-23T10:36:05.107159Z", "iopub.status.idle": "2024-08-23T10:36:05.139276Z", "shell.execute_reply": "2024-08-23T10:36:05.138934Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Basic histogram of \"rating\"\n", "p = ggplot(df, aes(x=\"rating\")) + ggsize(500, 250)\n", "p + geom_histogram(binwidth=.5)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.140652Z", "iopub.status.busy": "2024-08-23T10:36:05.140571Z", "iopub.status.idle": "2024-08-23T10:36:05.154259Z", "shell.execute_reply": "2024-08-23T10:36:05.154082Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Histogram overlaid with kernel density curve\n", "# - histogram with density instead of count on y-axis\n", "# - overlay with transparent density plot\n", "p + \\\n", " geom_histogram(aes(y='..density..'), binwidth=.5, colour=\"black\", fill=\"white\") + \\\n", " geom_density(alpha=.2, color=\"#de2d26\", fill=\"#ff6666\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.155425Z", "iopub.status.busy": "2024-08-23T10:36:05.155352Z", "iopub.status.idle": "2024-08-23T10:36:05.158410Z", "shell.execute_reply": "2024-08-23T10:36:05.158241Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p + \\\n", " geom_histogram(binwidth=.5, colour=\"black\", fill=\"white\") + \\\n", " geom_vline(xintercept=df[\"rating\"].mean(), \\\n", " color=\"red\", linetype='dashed', size=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Histogram and density plots with multiple groups" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.159416Z", "iopub.status.busy": "2024-08-23T10:36:05.159344Z", "iopub.status.idle": "2024-08-23T10:36:05.162549Z", "shell.execute_reply": "2024-08-23T10:36:05.162382Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p1 = ggplot(df, aes(x=\"rating\", fill=\"cond\")) + ggsize(500, 250)\n", "\n", "# Default histogram (stacked)\n", "p1 + geom_histogram(binwidth=.5)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.163475Z", "iopub.status.busy": "2024-08-23T10:36:05.163401Z", "iopub.status.idle": "2024-08-23T10:36:05.166334Z", "shell.execute_reply": "2024-08-23T10:36:05.166171Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Overlaid histograms\n", "p1 + geom_histogram(binwidth=.5, alpha=.7, position=\"identity\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.167233Z", "iopub.status.busy": "2024-08-23T10:36:05.167162Z", "iopub.status.idle": "2024-08-23T10:36:05.170368Z", "shell.execute_reply": "2024-08-23T10:36:05.170194Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Interleaved histograms\n", "p1 + geom_histogram(binwidth=.5, position=\"dodge\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.171305Z", "iopub.status.busy": "2024-08-23T10:36:05.171232Z", "iopub.status.idle": "2024-08-23T10:36:05.188019Z", "shell.execute_reply": "2024-08-23T10:36:05.187675Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Density plot\n", "p2 = ggplot(df, aes(x=\"rating\", color=\"cond\")) + ggsize(500, 250)\n", "p2 + geom_density()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.189001Z", "iopub.status.busy": "2024-08-23T10:36:05.188916Z", "iopub.status.idle": "2024-08-23T10:36:05.206855Z", "shell.execute_reply": "2024-08-23T10:36:05.206683Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Density plot with semi-transparent fill\n", "p2 + geom_density(aes(fill=\"cond\"), alpha=.7)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.208172Z", "iopub.status.busy": "2024-08-23T10:36:05.208099Z", "iopub.status.idle": "2024-08-23T10:36:05.211716Z", "shell.execute_reply": "2024-08-23T10:36:05.211552Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
condrating
0A0.003787
1B0.685638
\n", "
" ], "text/plain": [ " cond rating\n", "0 A 0.003787\n", "1 B 0.685638" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Find the mean of each group\n", "cdf = df.groupby([\"cond\"], as_index=False).mean()\n", "cdf.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.212907Z", "iopub.status.busy": "2024-08-23T10:36:05.212830Z", "iopub.status.idle": "2024-08-23T10:36:05.216599Z", "shell.execute_reply": "2024-08-23T10:36:05.216434Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Overlaid histograms with means\n", "p2 + \\\n", " geom_histogram(aes(fill=\"cond\"), alpha=.5, position=\"identity\", size=0) + \\\n", " geom_vline(data=cdf, \\\n", " mapping=aes(xintercept=\"rating\", color=\"cond\"), \\\n", " linetype='dashed', size=1)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.217764Z", "iopub.status.busy": "2024-08-23T10:36:05.217688Z", "iopub.status.idle": "2024-08-23T10:36:05.221277Z", "shell.execute_reply": "2024-08-23T10:36:05.220990Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use freqpoly instead of histogram\n", "p2 + \\\n", " geom_freqpoly(aes(fill=\"cond\")) + \\\n", " geom_vline(data=cdf, \\\n", " mapping=aes(xintercept=\"rating\", color=\"cond\"), \\\n", " linetype='dashed', size=1)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.222436Z", "iopub.status.busy": "2024-08-23T10:36:05.222357Z", "iopub.status.idle": "2024-08-23T10:36:05.239908Z", "shell.execute_reply": "2024-08-23T10:36:05.239590Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Density plots with means\n", "p2 + \\\n", " geom_density() + \\\n", " geom_vline(data=cdf, \\\n", " mapping=aes(xintercept=\"rating\", color=\"cond\"), \\\n", " linetype='dashed', size=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using facets" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.240825Z", "iopub.status.busy": "2024-08-23T10:36:05.240750Z", "iopub.status.idle": "2024-08-23T10:36:05.244494Z", "shell.execute_reply": "2024-08-23T10:36:05.244234Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(x=\"rating\")) + \\\n", " geom_histogram(binwidth=.5, colour=\"black\", fill=\"white\") + \\\n", " facet_grid(\"cond\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.245339Z", "iopub.status.busy": "2024-08-23T10:36:05.245258Z", "iopub.status.idle": "2024-08-23T10:36:05.249062Z", "shell.execute_reply": "2024-08-23T10:36:05.248824Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# With mean lines, using 'cdf' computed earlier.\n", "ggplot(df, aes(x=\"rating\")) + \\\n", " geom_histogram(binwidth=.5, colour=\"black\", fill=\"white\") + \\\n", " geom_vline(data=cdf, \\\n", " mapping=aes(xintercept=\"rating\"), \\\n", " linetype='dashed', size=1, colour=\"red\") + \\\n", " facet_grid(None, \"cond\") + \\\n", " ggsize(500, 250)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Box plots" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.249986Z", "iopub.status.busy": "2024-08-23T10:36:05.249912Z", "iopub.status.idle": "2024-08-23T10:36:05.253347Z", "shell.execute_reply": "2024-08-23T10:36:05.253115Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# A basic box plot\n", "p3 = ggplot(df, aes(x=\"cond\", y=\"rating\")) + ggsize(400, 300)\n", "p3 + geom_boxplot()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.254265Z", "iopub.status.busy": "2024-08-23T10:36:05.254186Z", "iopub.status.idle": "2024-08-23T10:36:05.257967Z", "shell.execute_reply": "2024-08-23T10:36:05.257738Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# A basic box plot with the conditions colored\n", "p3 + geom_boxplot(aes(fill=\"cond\"))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:36:05.258840Z", "iopub.status.busy": "2024-08-23T10:36:05.258768Z", "iopub.status.idle": "2024-08-23T10:36:05.262998Z", "shell.execute_reply": "2024-08-23T10:36:05.262787Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Style outliers\n", "p3 + geom_boxplot(outlier_color=\"red\", outlier_shape=8, outlier_size=1.5)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 4 }