{
"cells": [
{
"cell_type": "markdown",
"id": "4cdbc2ec-9ada-4c7e-bc19-6dfa27b29b02",
"metadata": {},
"source": [
"# Custom Histogram Bins Using the `breaks` Parameter"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "69a974d4-6a4a-432a-95fc-15aa88854461",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from lets_plot import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "bcf65a95-e080-4a75-886f-ab5bc4d94b4b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"LetsPlot.setup_html()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9aa2130b-dbe6-4f4d-bca4-c6ff57a5d329",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(53940, 10)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" carat | \n",
" cut | \n",
" color | \n",
" clarity | \n",
" depth | \n",
" table | \n",
" price | \n",
" x | \n",
" y | \n",
" z | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.23 | \n",
" Ideal | \n",
" E | \n",
" SI2 | \n",
" 61.5 | \n",
" 55.0 | \n",
" 326 | \n",
" 3.95 | \n",
" 3.98 | \n",
" 2.43 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0.21 | \n",
" Premium | \n",
" E | \n",
" SI1 | \n",
" 59.8 | \n",
" 61.0 | \n",
" 326 | \n",
" 3.89 | \n",
" 3.84 | \n",
" 2.31 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.23 | \n",
" Good | \n",
" E | \n",
" VS1 | \n",
" 56.9 | \n",
" 65.0 | \n",
" 327 | \n",
" 4.05 | \n",
" 4.07 | \n",
" 2.31 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0.29 | \n",
" Premium | \n",
" I | \n",
" VS2 | \n",
" 62.4 | \n",
" 58.0 | \n",
" 334 | \n",
" 4.20 | \n",
" 4.23 | \n",
" 2.63 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0.31 | \n",
" Good | \n",
" J | \n",
" SI2 | \n",
" 63.3 | \n",
" 58.0 | \n",
" 335 | \n",
" 4.34 | \n",
" 4.35 | \n",
" 2.75 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" carat cut color clarity depth table price x y z\n",
"0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n",
"1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31\n",
"2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31\n",
"3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63\n",
"4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Diamonds dataset, hosted in the JetBrains/lets-plot-docs repository.\n",
"diamonds_url = \"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/refs/heads/master/data/diamonds.csv\"\n",
"df = pd.read_csv(diamonds_url)\n",
"print(df.shape)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "0730e10d-23ca-45ad-a9ad-6582b95d314e",
"metadata": {},
"source": [
"## Default Bins"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e36cc653-252d-4f83-9d2f-61955bd4cf35",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Baseline: histogram of price without an explicit `breaks` argument.\n",
"(\n",
"    ggplot(df, aes(\"price\"))\n",
"    + geom_histogram(color=\"black\", fill=\"gray80\")\n",
")"
]
},
{
"cell_type": "markdown",
"id": "5fe3cbbe-39fb-475f-a8b9-9ce5d7fd9bde",
"metadata": {},
"source": [
"## Equi-probable Bins"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "54ff27a9-aee8-4d07-abae-c98c9a62e47a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Decile boundaries of price (0%, 10%, ..., 100%), so each bin holds roughly the same number of rows.\n",
"# `breaks` is documented as a list of floats, so convert the resulting Series to a plain list.\n",
"price_bins = df[\"price\"].quantile(q=[i / 10 for i in range(11)], interpolation=\"linear\").tolist()\n",
"\n",
"ggplot(df, aes(\"price\")) + geom_histogram(breaks=price_bins, color=\"black\", fill=\"gray80\")"
]
},
{
"cell_type": "markdown",
"id": "1edc91c5-18d6-4353-94f6-2b85bb7938bd",
"metadata": {},
"source": [
"## Identity Stat"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d9e5e106-dbae-4f26-bb14-c6a92e644c9f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bin_x | \n",
" count | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 384.0 | \n",
" 2004 | \n",
"
\n",
" \n",
" | 1 | \n",
" 768.0 | \n",
" 13041 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1536.0 | \n",
" 9471 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3072.0 | \n",
" 10474 | \n",
"
\n",
" \n",
" | 4 | \n",
" 6144.0 | \n",
" 11605 | \n",
"
\n",
" \n",
" | 5 | \n",
" 12288.0 | \n",
" 6367 | \n",
"
\n",
" \n",
" | 6 | \n",
" 24576.0 | \n",
" 978 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bin_x count\n",
"0 384.0 2004\n",
"1 768.0 13041\n",
"2 1536.0 9471\n",
"3 3072.0 10474\n",
"4 6144.0 11605\n",
"5 12288.0 6367\n",
"6 24576.0 978"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bin edges are successive powers of two: 1, 2, 4, ..., 2**15.\n",
"breaks = [2**d for d in range(16)]\n",
"# Centre of each bin = midpoint of two neighbouring edges.\n",
"bin_x = [(left + right) / 2.0 for left, right in zip(breaks[:-1], breaks[1:])]\n",
"\n",
"# Label every price with the centre of the bin it falls into, then count the rows per bin.\n",
"agg_df = (\n",
"    df.assign(bin_x=pd.cut(df[\"price\"], bins=breaks, labels=bin_x))\n",
"    .groupby(\"bin_x\", observed=True)[\"price\"]\n",
"    .count()\n",
"    .to_frame(\"count\")\n",
"    .reset_index()\n",
"    .astype({\"bin_x\": float})  # the cut labels are categorical; convert them to plain floats\n",
")\n",
"agg_df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1e2e4ff2-5c4c-47e5-94f8-84ae84a1f388",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# agg_df already holds one row per bin (bin centre -> count), hence stat='identity'.\n",
"(\n",
"    ggplot(agg_df)\n",
"    + geom_histogram(\n",
"        aes(\"bin_x\", \"count\"),\n",
"        stat='identity',\n",
"        breaks=breaks,\n",
"        color=\"black\",\n",
"        fill=\"gray80\",\n",
"    )\n",
"    + xlab(\"price\")\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.23"
}
},
"nbformat": 4,
"nbformat_minor": 5
}