{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:04.757121Z",
"iopub.status.busy": "2024-08-23T10:36:04.757038Z",
"iopub.status.idle": "2024-08-23T10:36:05.083704Z",
"shell.execute_reply": "2024-08-23T10:36:05.083396Z"
}
},
"outputs": [],
"source": [
"from lets_plot import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.085338Z",
"iopub.status.busy": "2024-08-23T10:36:05.085188Z",
"iopub.status.idle": "2024-08-23T10:36:05.087313Z",
"shell.execute_reply": "2024-08-23T10:36:05.087138Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"LetsPlot.setup_html()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.100804Z",
"iopub.status.busy": "2024-08-23T10:36:05.100692Z",
"iopub.status.idle": "2024-08-23T10:36:05.106168Z",
"shell.execute_reply": "2024-08-23T10:36:05.105980Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(400, 2)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" cond | \n",
" rating | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A | \n",
" -1.085631 | \n",
"
\n",
" \n",
" 1 | \n",
" A | \n",
" 0.997345 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 0.282978 | \n",
"
\n",
" \n",
" 3 | \n",
" A | \n",
" -1.506295 | \n",
"
\n",
" \n",
" 4 | \n",
" A | \n",
" -0.578600 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" cond rating\n",
"0 A -1.085631\n",
"1 A 0.997345\n",
"2 A 0.282978\n",
"3 A -1.506295\n",
"4 A -0.578600"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# This example was found at: www.cookbook-r.com/Graphs/Plotting_distributions_(ggplot2)\n",
"def get_data():\n",
" import numpy as np\n",
" import pandas as pd\n",
"\n",
" np.random.seed(123)\n",
"\n",
" return pd.DataFrame(dict(\n",
" cond=np.repeat([\"A\", \"B\"], 200),\n",
" rating=np.concatenate((np.random.normal(0, 1, 200), np.random.normal(.8, 1, 200)))\n",
" ))\n",
"\n",
"df = get_data()\n",
"print(df.shape)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.107232Z",
"iopub.status.busy": "2024-08-23T10:36:05.107159Z",
"iopub.status.idle": "2024-08-23T10:36:05.139276Z",
"shell.execute_reply": "2024-08-23T10:36:05.138934Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Basic histogram of \"rating\"\n",
"p = ggplot(df, aes(x=\"rating\")) + ggsize(500, 250)\n",
"p + geom_histogram(binwidth=.5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.140652Z",
"iopub.status.busy": "2024-08-23T10:36:05.140571Z",
"iopub.status.idle": "2024-08-23T10:36:05.154259Z",
"shell.execute_reply": "2024-08-23T10:36:05.154082Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Histogram overlaid with kernel density curve\n",
"# - histogram with density instead of count on y-axis\n",
"# - overlay with transparent density plot\n",
"p + \\\n",
" geom_histogram(aes(y='..density..'), binwidth=.5, colour=\"black\", fill=\"white\") + \\\n",
" geom_density(alpha=.2, color=\"#de2d26\", fill=\"#ff6666\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.155425Z",
"iopub.status.busy": "2024-08-23T10:36:05.155352Z",
"iopub.status.idle": "2024-08-23T10:36:05.158410Z",
"shell.execute_reply": "2024-08-23T10:36:05.158241Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"p + \\\n",
" geom_histogram(binwidth=.5, colour=\"black\", fill=\"white\") + \\\n",
" geom_vline(xintercept=df[\"rating\"].mean(), \\\n",
" color=\"red\", linetype='dashed', size=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Histogram and density plots with multiple groups"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.159416Z",
"iopub.status.busy": "2024-08-23T10:36:05.159344Z",
"iopub.status.idle": "2024-08-23T10:36:05.162549Z",
"shell.execute_reply": "2024-08-23T10:36:05.162382Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"p1 = ggplot(df, aes(x=\"rating\", fill=\"cond\")) + ggsize(500, 250)\n",
"\n",
"# Default histogram (stacked)\n",
"p1 + geom_histogram(binwidth=.5)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.163475Z",
"iopub.status.busy": "2024-08-23T10:36:05.163401Z",
"iopub.status.idle": "2024-08-23T10:36:05.166334Z",
"shell.execute_reply": "2024-08-23T10:36:05.166171Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Overlaid histograms\n",
"p1 + geom_histogram(binwidth=.5, alpha=.7, position=\"identity\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.167233Z",
"iopub.status.busy": "2024-08-23T10:36:05.167162Z",
"iopub.status.idle": "2024-08-23T10:36:05.170368Z",
"shell.execute_reply": "2024-08-23T10:36:05.170194Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Interleaved histograms\n",
"p1 + geom_histogram(binwidth=.5, position=\"dodge\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.171305Z",
"iopub.status.busy": "2024-08-23T10:36:05.171232Z",
"iopub.status.idle": "2024-08-23T10:36:05.188019Z",
"shell.execute_reply": "2024-08-23T10:36:05.187675Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Density plot\n",
"p2 = ggplot(df, aes(x=\"rating\", color=\"cond\")) + ggsize(500, 250)\n",
"p2 + geom_density()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.189001Z",
"iopub.status.busy": "2024-08-23T10:36:05.188916Z",
"iopub.status.idle": "2024-08-23T10:36:05.206855Z",
"shell.execute_reply": "2024-08-23T10:36:05.206683Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Density plot with semi-transparent fill\n",
"p2 + geom_density(aes(fill=\"cond\"), alpha=.7)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.208172Z",
"iopub.status.busy": "2024-08-23T10:36:05.208099Z",
"iopub.status.idle": "2024-08-23T10:36:05.211716Z",
"shell.execute_reply": "2024-08-23T10:36:05.211552Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" cond | \n",
" rating | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A | \n",
" 0.003787 | \n",
"
\n",
" \n",
" 1 | \n",
" B | \n",
" 0.685638 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" cond rating\n",
"0 A 0.003787\n",
"1 B 0.685638"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Find the mean of each group\n",
"cdf = df.groupby([\"cond\"], as_index=False).mean()\n",
"cdf.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.212907Z",
"iopub.status.busy": "2024-08-23T10:36:05.212830Z",
"iopub.status.idle": "2024-08-23T10:36:05.216599Z",
"shell.execute_reply": "2024-08-23T10:36:05.216434Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Overlaid histograms with means\n",
"p2 + \\\n",
" geom_histogram(aes(fill=\"cond\"), alpha=.5, position=\"identity\", size=0) + \\\n",
" geom_vline(data=cdf, \\\n",
" mapping=aes(xintercept=\"rating\", color=\"cond\"), \\\n",
" linetype='dashed', size=1)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.217764Z",
"iopub.status.busy": "2024-08-23T10:36:05.217688Z",
"iopub.status.idle": "2024-08-23T10:36:05.221277Z",
"shell.execute_reply": "2024-08-23T10:36:05.220990Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use freqpoly instead of histogram\n",
"p2 + \\\n",
" geom_freqpoly(aes(fill=\"cond\")) + \\\n",
" geom_vline(data=cdf, \\\n",
" mapping=aes(xintercept=\"rating\", color=\"cond\"), \\\n",
" linetype='dashed', size=1)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.222436Z",
"iopub.status.busy": "2024-08-23T10:36:05.222357Z",
"iopub.status.idle": "2024-08-23T10:36:05.239908Z",
"shell.execute_reply": "2024-08-23T10:36:05.239590Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Density plots with means\n",
"p2 + \\\n",
" geom_density() + \\\n",
" geom_vline(data=cdf, \\\n",
" mapping=aes(xintercept=\"rating\", color=\"cond\"), \\\n",
" linetype='dashed', size=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using facets"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.240825Z",
"iopub.status.busy": "2024-08-23T10:36:05.240750Z",
"iopub.status.idle": "2024-08-23T10:36:05.244494Z",
"shell.execute_reply": "2024-08-23T10:36:05.244234Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(df, aes(x=\"rating\")) + \\\n",
" geom_histogram(binwidth=.5, colour=\"black\", fill=\"white\") + \\\n",
" facet_grid(\"cond\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.245339Z",
"iopub.status.busy": "2024-08-23T10:36:05.245258Z",
"iopub.status.idle": "2024-08-23T10:36:05.249062Z",
"shell.execute_reply": "2024-08-23T10:36:05.248824Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# With mean lines, using 'cdf' computed earlier.\n",
"ggplot(df, aes(x=\"rating\")) + \\\n",
" geom_histogram(binwidth=.5, colour=\"black\", fill=\"white\") + \\\n",
" geom_vline(data=cdf, \\\n",
" mapping=aes(xintercept=\"rating\"), \\\n",
" linetype='dashed', size=1, colour=\"red\") + \\\n",
" facet_grid(None, \"cond\") + \\\n",
" ggsize(500, 250)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Box plots"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.249986Z",
"iopub.status.busy": "2024-08-23T10:36:05.249912Z",
"iopub.status.idle": "2024-08-23T10:36:05.253347Z",
"shell.execute_reply": "2024-08-23T10:36:05.253115Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A basic box plot\n",
"p3 = ggplot(df, aes(x=\"cond\", y=\"rating\")) + ggsize(400, 300)\n",
"p3 + geom_boxplot()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.254265Z",
"iopub.status.busy": "2024-08-23T10:36:05.254186Z",
"iopub.status.idle": "2024-08-23T10:36:05.257967Z",
"shell.execute_reply": "2024-08-23T10:36:05.257738Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A basic box plot with the conditions colored\n",
"p3 + geom_boxplot(aes(fill=\"cond\"))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:36:05.258840Z",
"iopub.status.busy": "2024-08-23T10:36:05.258768Z",
"iopub.status.idle": "2024-08-23T10:36:05.262998Z",
"shell.execute_reply": "2024-08-23T10:36:05.262787Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Style outliers\n",
"p3 + geom_boxplot(outlier_color=\"red\", outlier_shape=8, outlier_size=1.5)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}