{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Finding Natural Breaks in Data with the Fisher-Jenks Algorithm\n",
"\n",
"Notebook accompanying article on [PB Python](https://pbpython.com/natural-breaks.html)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import jenkspy"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create an example dataframe"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"sales = {\n",
" 'account': [\n",
" 'Jones Inc', 'Alpha Co', 'Blue Inc', 'Super Star Inc', 'Wamo',\n",
" 'Next Gen', 'Giga Co', 'IniTech', 'Beta LLC'\n",
" ],\n",
" 'Total': [1500, 2100, 50, 20, 75, 1100, 950, 1300, 1400]\n",
"}\n",
"df = pd.DataFrame(sales)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account | \n",
" Total | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" Super Star Inc | \n",
" 20 | \n",
"
\n",
" \n",
" 2 | \n",
" Blue Inc | \n",
" 50 | \n",
"
\n",
" \n",
" 4 | \n",
" Wamo | \n",
" 75 | \n",
"
\n",
" \n",
" 6 | \n",
" Giga Co | \n",
" 950 | \n",
"
\n",
" \n",
" 5 | \n",
" Next Gen | \n",
" 1100 | \n",
"
\n",
" \n",
" 7 | \n",
" IniTech | \n",
" 1300 | \n",
"
\n",
" \n",
" 8 | \n",
" Beta LLC | \n",
" 1400 | \n",
"
\n",
" \n",
" 0 | \n",
" Jones Inc | \n",
" 1500 | \n",
"
\n",
" \n",
" 1 | \n",
" Alpha Co | \n",
" 2100 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account Total\n",
"3 Super Star Inc 20\n",
"2 Blue Inc 50\n",
"4 Wamo 75\n",
"6 Giga Co 950\n",
"5 Next Gen 1100\n",
"7 IniTech 1300\n",
"8 Beta LLC 1400\n",
"0 Jones Inc 1500\n",
"1 Alpha Co 2100"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values(by='Total')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Try cutting the data using qcut"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df['quantile'] = pd.qcut(df['Total'], q=2, labels=['bucket_1', 'bucket_2'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account | \n",
" Total | \n",
" quantile | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" Super Star Inc | \n",
" 20 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 2 | \n",
" Blue Inc | \n",
" 50 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 4 | \n",
" Wamo | \n",
" 75 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 6 | \n",
" Giga Co | \n",
" 950 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 5 | \n",
" Next Gen | \n",
" 1100 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 7 | \n",
" IniTech | \n",
" 1300 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 8 | \n",
" Beta LLC | \n",
" 1400 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 0 | \n",
" Jones Inc | \n",
" 1500 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 1 | \n",
" Alpha Co | \n",
" 2100 | \n",
" bucket_2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account Total quantile\n",
"3 Super Star Inc 20 bucket_1\n",
"2 Blue Inc 50 bucket_1\n",
"4 Wamo 75 bucket_1\n",
"6 Giga Co 950 bucket_1\n",
"5 Next Gen 1100 bucket_1\n",
"7 IniTech 1300 bucket_2\n",
"8 Beta LLC 1400 bucket_2\n",
"0 Jones Inc 1500 bucket_2\n",
"1 Alpha Co 2100 bucket_2"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values(by='Total')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Compare with using cut"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"df['cut_bins'] = pd.cut(df['Total'],\n",
" bins=2,\n",
" labels=['bucket_1', 'bucket_2'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account | \n",
" Total | \n",
" quantile | \n",
" cut_bins | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" Super Star Inc | \n",
" 20 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 2 | \n",
" Blue Inc | \n",
" 50 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 4 | \n",
" Wamo | \n",
" 75 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 6 | \n",
" Giga Co | \n",
" 950 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 5 | \n",
" Next Gen | \n",
" 1100 | \n",
" bucket_1 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 7 | \n",
" IniTech | \n",
" 1300 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 8 | \n",
" Beta LLC | \n",
" 1400 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 0 | \n",
" Jones Inc | \n",
" 1500 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 1 | \n",
" Alpha Co | \n",
" 2100 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account Total quantile cut_bins\n",
"3 Super Star Inc 20 bucket_1 bucket_1\n",
"2 Blue Inc 50 bucket_1 bucket_1\n",
"4 Wamo 75 bucket_1 bucket_1\n",
"6 Giga Co 950 bucket_1 bucket_1\n",
"5 Next Gen 1100 bucket_1 bucket_2\n",
"7 IniTech 1300 bucket_2 bucket_2\n",
"8 Beta LLC 1400 bucket_2 bucket_2\n",
"0 Jones Inc 1500 bucket_2 bucket_2\n",
"1 Alpha Co 2100 bucket_2 bucket_2"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values(by='Total')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Show how jenkspy works"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[20.0, 75.0, 2100.0]\n"
]
}
],
"source": [
"breaks = jenkspy.jenks_breaks(df['Total'], nb_class=2)\n",
"print(breaks)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account | \n",
" Total | \n",
" quantile | \n",
" cut_bins | \n",
" cut_jenks | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" Super Star Inc | \n",
" 20 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" Blue Inc | \n",
" 50 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 4 | \n",
" Wamo | \n",
" 75 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 6 | \n",
" Giga Co | \n",
" 950 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 5 | \n",
" Next Gen | \n",
" 1100 | \n",
" bucket_1 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 7 | \n",
" IniTech | \n",
" 1300 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 8 | \n",
" Beta LLC | \n",
" 1400 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 0 | \n",
" Jones Inc | \n",
" 1500 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 1 | \n",
" Alpha Co | \n",
" 2100 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account Total quantile cut_bins cut_jenks\n",
"3 Super Star Inc 20 bucket_1 bucket_1 NaN\n",
"2 Blue Inc 50 bucket_1 bucket_1 bucket_1\n",
"4 Wamo 75 bucket_1 bucket_1 bucket_1\n",
"6 Giga Co 950 bucket_1 bucket_1 bucket_2\n",
"5 Next Gen 1100 bucket_1 bucket_2 bucket_2\n",
"7 IniTech 1300 bucket_2 bucket_2 bucket_2\n",
"8 Beta LLC 1400 bucket_2 bucket_2 bucket_2\n",
"0 Jones Inc 1500 bucket_2 bucket_2 bucket_2\n",
"1 Alpha Co 2100 bucket_2 bucket_2 bucket_2"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['cut_jenks'] = pd.cut(df['Total'],\n",
" bins=breaks,\n",
" labels=['bucket_1', 'bucket_2'])\n",
"df.sort_values(by='Total')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Fix the NaN by using include_lowest"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account | \n",
" Total | \n",
" quantile | \n",
" cut_bins | \n",
" cut_jenks | \n",
" cut_jenksv2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" Super Star Inc | \n",
" 20 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" NaN | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 2 | \n",
" Blue Inc | \n",
" 50 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 4 | \n",
" Wamo | \n",
" 75 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 6 | \n",
" Giga Co | \n",
" 950 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 5 | \n",
" Next Gen | \n",
" 1100 | \n",
" bucket_1 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 7 | \n",
" IniTech | \n",
" 1300 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 8 | \n",
" Beta LLC | \n",
" 1400 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 0 | \n",
" Jones Inc | \n",
" 1500 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 1 | \n",
" Alpha Co | \n",
" 2100 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account Total quantile cut_bins cut_jenks cut_jenksv2\n",
"3 Super Star Inc 20 bucket_1 bucket_1 NaN bucket_1\n",
"2 Blue Inc 50 bucket_1 bucket_1 bucket_1 bucket_1\n",
"4 Wamo 75 bucket_1 bucket_1 bucket_1 bucket_1\n",
"6 Giga Co 950 bucket_1 bucket_1 bucket_2 bucket_2\n",
"5 Next Gen 1100 bucket_1 bucket_2 bucket_2 bucket_2\n",
"7 IniTech 1300 bucket_2 bucket_2 bucket_2 bucket_2\n",
"8 Beta LLC 1400 bucket_2 bucket_2 bucket_2 bucket_2\n",
"0 Jones Inc 1500 bucket_2 bucket_2 bucket_2 bucket_2\n",
"1 Alpha Co 2100 bucket_2 bucket_2 bucket_2 bucket_2"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['cut_jenksv2'] = pd.cut(df['Total'],\n",
" bins=breaks,\n",
" labels=['bucket_1', 'bucket_2'],\n",
" include_lowest=True)\n",
"df.sort_values(by='Total')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Try some other examples"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account | \n",
" Total | \n",
" quantile | \n",
" cut_bins | \n",
" cut_jenks | \n",
" cut_jenksv2 | \n",
" quantilev2 | \n",
" cut_jenksv3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" Super Star Inc | \n",
" 20 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" NaN | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 2 | \n",
" Blue Inc | \n",
" 50 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 4 | \n",
" Wamo | \n",
" 75 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
"
\n",
" \n",
" 6 | \n",
" Giga Co | \n",
" 950 | \n",
" bucket_1 | \n",
" bucket_1 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 5 | \n",
" Next Gen | \n",
" 1100 | \n",
" bucket_1 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
"
\n",
" \n",
" 7 | \n",
" IniTech | \n",
" 1300 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_3 | \n",
" bucket_3 | \n",
"
\n",
" \n",
" 8 | \n",
" Beta LLC | \n",
" 1400 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_3 | \n",
" bucket_3 | \n",
"
\n",
" \n",
" 0 | \n",
" Jones Inc | \n",
" 1500 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_4 | \n",
" bucket_3 | \n",
"
\n",
" \n",
" 1 | \n",
" Alpha Co | \n",
" 2100 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_2 | \n",
" bucket_4 | \n",
" bucket_4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account Total quantile cut_bins cut_jenks cut_jenksv2 quantilev2 \\\n",
"3 Super Star Inc 20 bucket_1 bucket_1 NaN bucket_1 bucket_1 \n",
"2 Blue Inc 50 bucket_1 bucket_1 bucket_1 bucket_1 bucket_1 \n",
"4 Wamo 75 bucket_1 bucket_1 bucket_1 bucket_1 bucket_1 \n",
"6 Giga Co 950 bucket_1 bucket_1 bucket_2 bucket_2 bucket_2 \n",
"5 Next Gen 1100 bucket_1 bucket_2 bucket_2 bucket_2 bucket_2 \n",
"7 IniTech 1300 bucket_2 bucket_2 bucket_2 bucket_2 bucket_3 \n",
"8 Beta LLC 1400 bucket_2 bucket_2 bucket_2 bucket_2 bucket_3 \n",
"0 Jones Inc 1500 bucket_2 bucket_2 bucket_2 bucket_2 bucket_4 \n",
"1 Alpha Co 2100 bucket_2 bucket_2 bucket_2 bucket_2 bucket_4 \n",
"\n",
" cut_jenksv3 \n",
"3 bucket_1 \n",
"2 bucket_1 \n",
"4 bucket_1 \n",
"6 bucket_2 \n",
"5 bucket_2 \n",
"7 bucket_3 \n",
"8 bucket_3 \n",
"0 bucket_3 \n",
"1 bucket_4 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['quantilev2'] = pd.qcut(\n",
" df['Total'], q=4, labels=['bucket_1', 'bucket_2', 'bucket_3', 'bucket_4'])\n",
"\n",
"df['cut_jenksv3'] = pd.cut(\n",
" df['Total'],\n",
" bins=jenkspy.jenks_breaks(df['Total'], nb_class=4),\n",
" labels=['bucket_1', 'bucket_2', 'bucket_3', 'bucket_4'],\n",
" include_lowest=True)\n",
"\n",
"\n",
"df.sort_values(by='Total')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}