{ "cells": [ { "cell_type": "markdown", "id": "4cdbc2ec-9ada-4c7e-bc19-6dfa27b29b02", "metadata": {}, "source": [ "# Custom Histogram Bins Using the `breaks` Parameter" ] }, { "cell_type": "code", "execution_count": 1, "id": "69a974d4-6a4a-432a-95fc-15aa88854461", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from lets_plot import *" ] }, { "cell_type": "code", "execution_count": 2, "id": "bcf65a95-e080-4a75-886f-ab5bc4d94b4b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 3, "id": "9aa2130b-dbe6-4f4d-bca4-c6ff57a5d329", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(53940, 10)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
20.23GoodEVS156.965.03274.054.072.31
30.29PremiumIVS262.458.03344.204.232.63
40.31GoodJSI263.358.03354.344.352.75
\n", "
" ], "text/plain": [ " carat cut color clarity depth table price x y z\n", "0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n", "1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31\n", "2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31\n", "3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63\n", "4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
    "# Load the diamonds dataset (CSV hosted in the lets-plot-docs repository).\n",
    "df = pd.read_csv(\n",
    "    \"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/refs/heads/master/data/diamonds.csv\"\n",
    ")\n",
    "print(df.shape)  # (rows, columns)\n",
    "df.head()"
] }, { "cell_type": "markdown", "id": "0730e10d-23ca-45ad-a9ad-6582b95d314e", "metadata": {}, "source": [ "## Default Bins" ] }, { "cell_type": "code", "execution_count": 4, "id": "e36cc653-252d-4f83-9d2f-61955bd4cf35", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
    "# Baseline histogram of `price`: no `breaks` are passed to the geom.\n",
    "(\n",
    "    ggplot(df, aes(\"price\"))\n",
    "    + geom_histogram(color=\"black\", fill=\"gray80\")\n",
    ")"
] }, { "cell_type": "markdown", "id": "5fe3cbbe-39fb-475f-a8b9-9ce5d7fd9bde", "metadata": {}, "source": [ "## Equi-probable Bins" ] }, { "cell_type": "code", "execution_count": 5, "id": "54ff27a9-aee8-4d07-abae-c98c9a62e47a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
    "# Bin edges are the quantiles of `price` at levels 0.0, 0.1, ..., 1.0,\n",
    "# so every bin covers roughly the same share of the rows.\n",
    "price_bins = df[\"price\"].quantile(\n",
    "    q=[i/10 for i in range(11)],\n",
    "    interpolation=\"linear\",\n",
    ")\n",
    "(\n",
    "    ggplot(df, aes(\"price\"))\n",
    "    + geom_histogram(breaks=price_bins, color=\"black\", fill=\"gray80\")\n",
    ")"
] }, { "cell_type": "markdown", "id": "1edc91c5-18d6-4353-94f6-2b85bb7938bd", "metadata": {}, "source": [ "## Identity Stat" ] }, { "cell_type": "code", "execution_count": 6, "id": "d9e5e106-dbae-4f26-bb14-c6a92e644c9f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bin_xcount
0384.02004
1768.013041
21536.09471
33072.010474
46144.011605
512288.06367
624576.0978
\n", "
" ], "text/plain": [ " bin_x count\n", "0 384.0 2004\n", "1 768.0 13041\n", "2 1536.0 9471\n", "3 3072.0 10474\n", "4 6144.0 11605\n", "5 12288.0 6367\n", "6 24576.0 978" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
    "# Bin edges are consecutive powers of two: 2**0, 2**1, ..., 2**15.\n",
    "breaks = [2**d for d in range(16)]\n",
    "# Bin centers: the midpoint of every pair of neighboring edges.\n",
    "bin_x = [(left + right) / 2.0 for left, right in zip(breaks[:-1], breaks[1:])]\n",
    "# Aggregated dataframe: bin center -> size of bin.\n",
    "agg_df = (\n",
    "    df.assign(bin_x=pd.cut(df[\"price\"], bins=breaks, labels=bin_x))  # bin center of each price\n",
    "    .groupby(\"bin_x\", observed=True)[\"price\"]\n",
    "    .count()\n",
    "    .to_frame(\"count\")\n",
    "    .reset_index()\n",
    "    .astype({\"bin_x\": float})  # `pd.cut` labels are categorical; turn them back into numbers\n",
    ")\n",
    "agg_df"
] }, { "cell_type": "code", "execution_count": 7, "id": "1e2e4ff2-5c4c-47e5-94f8-84ae84a1f388", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
    "# Draw the pre-aggregated counts: x is the bin center, y is the bin size.\n",
    "(\n",
    "    ggplot(agg_df)\n",
    "    + geom_histogram(\n",
    "        aes(\"bin_x\", \"count\"),\n",
    "        stat=\"identity\",\n",
    "        breaks=breaks,\n",
    "        color=\"black\",\n",
    "        fill=\"gray80\",\n",
    "    )\n",
    "    + xlab(\"price\")\n",
    ")"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.23" } }, "nbformat": 4, "nbformat_minor": 5 }