{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Finding Natural Breaks in Data with the Fisher-Jenks Algorithm\n", "\n", "Notebook accompanying article on [PB Python](https://pbpython.com/natural-breaks.html)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import jenkspy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create an example dataframe" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "sales = {\n", " 'account': [\n", " 'Jones Inc', 'Alpha Co', 'Blue Inc', 'Super Star Inc', 'Wamo',\n", " 'Next Gen', 'Giga Co', 'IniTech', 'Beta LLC'\n", " ],\n", " 'Total': [1500, 2100, 50, 20, 75, 1100, 950, 1300, 1400]\n", "}\n", "df = pd.DataFrame(sales)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accountTotal
3Super Star Inc20
2Blue Inc50
4Wamo75
6Giga Co950
5Next Gen1100
7IniTech1300
8Beta LLC1400
0Jones Inc1500
1Alpha Co2100
\n", "
" ], "text/plain": [ " account Total\n", "3 Super Star Inc 20\n", "2 Blue Inc 50\n", "4 Wamo 75\n", "6 Giga Co 950\n", "5 Next Gen 1100\n", "7 IniTech 1300\n", "8 Beta LLC 1400\n", "0 Jones Inc 1500\n", "1 Alpha Co 2100" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values(by='Total')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Try cutting the data using qcut" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df['quantile'] = pd.qcut(df['Total'], q=2, labels=['bucket_1', 'bucket_2'])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accountTotalquantile
3Super Star Inc20bucket_1
2Blue Inc50bucket_1
4Wamo75bucket_1
6Giga Co950bucket_1
5Next Gen1100bucket_1
7IniTech1300bucket_2
8Beta LLC1400bucket_2
0Jones Inc1500bucket_2
1Alpha Co2100bucket_2
\n", "
" ], "text/plain": [ " account Total quantile\n", "3 Super Star Inc 20 bucket_1\n", "2 Blue Inc 50 bucket_1\n", "4 Wamo 75 bucket_1\n", "6 Giga Co 950 bucket_1\n", "5 Next Gen 1100 bucket_1\n", "7 IniTech 1300 bucket_2\n", "8 Beta LLC 1400 bucket_2\n", "0 Jones Inc 1500 bucket_2\n", "1 Alpha Co 2100 bucket_2" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values(by='Total')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compare with using cut" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "df['cut_bins'] = pd.cut(df['Total'],\n", " bins=2,\n", " labels=['bucket_1', 'bucket_2'])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accountTotalquantilecut_bins
3Super Star Inc20bucket_1bucket_1
2Blue Inc50bucket_1bucket_1
4Wamo75bucket_1bucket_1
6Giga Co950bucket_1bucket_1
5Next Gen1100bucket_1bucket_2
7IniTech1300bucket_2bucket_2
8Beta LLC1400bucket_2bucket_2
0Jones Inc1500bucket_2bucket_2
1Alpha Co2100bucket_2bucket_2
\n", "
" ], "text/plain": [ " account Total quantile cut_bins\n", "3 Super Star Inc 20 bucket_1 bucket_1\n", "2 Blue Inc 50 bucket_1 bucket_1\n", "4 Wamo 75 bucket_1 bucket_1\n", "6 Giga Co 950 bucket_1 bucket_1\n", "5 Next Gen 1100 bucket_1 bucket_2\n", "7 IniTech 1300 bucket_2 bucket_2\n", "8 Beta LLC 1400 bucket_2 bucket_2\n", "0 Jones Inc 1500 bucket_2 bucket_2\n", "1 Alpha Co 2100 bucket_2 bucket_2" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values(by='Total')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Show how jenkspy works" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[20.0, 75.0, 2100.0]\n" ] } ], "source": [ "breaks = jenkspy.jenks_breaks(df['Total'], nb_class=2)\n", "print(breaks)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accountTotalquantilecut_binscut_jenks
3Super Star Inc20bucket_1bucket_1NaN
2Blue Inc50bucket_1bucket_1bucket_1
4Wamo75bucket_1bucket_1bucket_1
6Giga Co950bucket_1bucket_1bucket_2
5Next Gen1100bucket_1bucket_2bucket_2
7IniTech1300bucket_2bucket_2bucket_2
8Beta LLC1400bucket_2bucket_2bucket_2
0Jones Inc1500bucket_2bucket_2bucket_2
1Alpha Co2100bucket_2bucket_2bucket_2
\n", "
" ], "text/plain": [ " account Total quantile cut_bins cut_jenks\n", "3 Super Star Inc 20 bucket_1 bucket_1 NaN\n", "2 Blue Inc 50 bucket_1 bucket_1 bucket_1\n", "4 Wamo 75 bucket_1 bucket_1 bucket_1\n", "6 Giga Co 950 bucket_1 bucket_1 bucket_2\n", "5 Next Gen 1100 bucket_1 bucket_2 bucket_2\n", "7 IniTech 1300 bucket_2 bucket_2 bucket_2\n", "8 Beta LLC 1400 bucket_2 bucket_2 bucket_2\n", "0 Jones Inc 1500 bucket_2 bucket_2 bucket_2\n", "1 Alpha Co 2100 bucket_2 bucket_2 bucket_2" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['cut_jenks'] = pd.cut(df['Total'],\n", " bins=breaks,\n", " labels=['bucket_1', 'bucket_2'])\n", "df.sort_values(by='Total')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fix the NaN by using include_lowest" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accountTotalquantilecut_binscut_jenkscut_jenksv2
3Super Star Inc20bucket_1bucket_1NaNbucket_1
2Blue Inc50bucket_1bucket_1bucket_1bucket_1
4Wamo75bucket_1bucket_1bucket_1bucket_1
6Giga Co950bucket_1bucket_1bucket_2bucket_2
5Next Gen1100bucket_1bucket_2bucket_2bucket_2
7IniTech1300bucket_2bucket_2bucket_2bucket_2
8Beta LLC1400bucket_2bucket_2bucket_2bucket_2
0Jones Inc1500bucket_2bucket_2bucket_2bucket_2
1Alpha Co2100bucket_2bucket_2bucket_2bucket_2
\n", "
" ], "text/plain": [ " account Total quantile cut_bins cut_jenks cut_jenksv2\n", "3 Super Star Inc 20 bucket_1 bucket_1 NaN bucket_1\n", "2 Blue Inc 50 bucket_1 bucket_1 bucket_1 bucket_1\n", "4 Wamo 75 bucket_1 bucket_1 bucket_1 bucket_1\n", "6 Giga Co 950 bucket_1 bucket_1 bucket_2 bucket_2\n", "5 Next Gen 1100 bucket_1 bucket_2 bucket_2 bucket_2\n", "7 IniTech 1300 bucket_2 bucket_2 bucket_2 bucket_2\n", "8 Beta LLC 1400 bucket_2 bucket_2 bucket_2 bucket_2\n", "0 Jones Inc 1500 bucket_2 bucket_2 bucket_2 bucket_2\n", "1 Alpha Co 2100 bucket_2 bucket_2 bucket_2 bucket_2" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['cut_jenksv2'] = pd.cut(df['Total'],\n", " bins=breaks,\n", " labels=['bucket_1', 'bucket_2'],\n", " include_lowest=True)\n", "df.sort_values(by='Total')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Try some other examples" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accountTotalquantilecut_binscut_jenkscut_jenksv2quantilev2cut_jenksv3
3Super Star Inc20bucket_1bucket_1NaNbucket_1bucket_1bucket_1
2Blue Inc50bucket_1bucket_1bucket_1bucket_1bucket_1bucket_1
4Wamo75bucket_1bucket_1bucket_1bucket_1bucket_1bucket_1
6Giga Co950bucket_1bucket_1bucket_2bucket_2bucket_2bucket_2
5Next Gen1100bucket_1bucket_2bucket_2bucket_2bucket_2bucket_2
7IniTech1300bucket_2bucket_2bucket_2bucket_2bucket_3bucket_3
8Beta LLC1400bucket_2bucket_2bucket_2bucket_2bucket_3bucket_3
0Jones Inc1500bucket_2bucket_2bucket_2bucket_2bucket_4bucket_3
1Alpha Co2100bucket_2bucket_2bucket_2bucket_2bucket_4bucket_4
\n", "
" ], "text/plain": [ " account Total quantile cut_bins cut_jenks cut_jenksv2 quantilev2 \\\n", "3 Super Star Inc 20 bucket_1 bucket_1 NaN bucket_1 bucket_1 \n", "2 Blue Inc 50 bucket_1 bucket_1 bucket_1 bucket_1 bucket_1 \n", "4 Wamo 75 bucket_1 bucket_1 bucket_1 bucket_1 bucket_1 \n", "6 Giga Co 950 bucket_1 bucket_1 bucket_2 bucket_2 bucket_2 \n", "5 Next Gen 1100 bucket_1 bucket_2 bucket_2 bucket_2 bucket_2 \n", "7 IniTech 1300 bucket_2 bucket_2 bucket_2 bucket_2 bucket_3 \n", "8 Beta LLC 1400 bucket_2 bucket_2 bucket_2 bucket_2 bucket_3 \n", "0 Jones Inc 1500 bucket_2 bucket_2 bucket_2 bucket_2 bucket_4 \n", "1 Alpha Co 2100 bucket_2 bucket_2 bucket_2 bucket_2 bucket_4 \n", "\n", " cut_jenksv3 \n", "3 bucket_1 \n", "2 bucket_1 \n", "4 bucket_1 \n", "6 bucket_2 \n", "5 bucket_2 \n", "7 bucket_3 \n", "8 bucket_3 \n", "0 bucket_3 \n", "1 bucket_4 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['quantilev2'] = pd.qcut(\n", " df['Total'], q=4, labels=['bucket_1', 'bucket_2', 'bucket_3', 'bucket_4'])\n", "\n", "df['cut_jenksv3'] = pd.cut(\n", " df['Total'],\n", " bins=jenkspy.jenks_breaks(df['Total'], nb_class=4),\n", " labels=['bucket_1', 'bucket_2', 'bucket_3', 'bucket_4'],\n", " include_lowest=True)\n", "\n", "\n", "df.sort_values(by='Total')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" } }, "nbformat": 4, "nbformat_minor": 2 }