{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Binning Data with Pandas cut and qcut\n",
"\n",
"This notebook accompanies the article posted on [pbpython.com](http://pbpython.com/pandas-qcut-cut.html)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"sns.set_style('whitegrid')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"raw_df = pd.read_excel('https://github.com/chris1610/pbpython/blob/master/data/2018_Sales_Total_v2.xlsx?raw=true')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = raw_df.groupby(['account number', 'name'])['ext price'].sum().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account number | \n",
" name | \n",
" ext price | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 141962 | \n",
" Herman LLC | \n",
" 63626.03 | \n",
"
\n",
" \n",
" 1 | \n",
" 146832 | \n",
" Kiehn-Spinka | \n",
" 99608.77 | \n",
"
\n",
" \n",
" 2 | \n",
" 163416 | \n",
" Purdy-Kunde | \n",
" 77898.21 | \n",
"
\n",
" \n",
" 3 | \n",
" 218895 | \n",
" Kulas Inc | \n",
" 137351.96 | \n",
"
\n",
" \n",
" 4 | \n",
" 239344 | \n",
" Stokes LLC | \n",
" 91535.92 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account number name ext price\n",
"0 141962 Herman LLC 63626.03\n",
"1 146832 Kiehn-Spinka 99608.77\n",
"2 163416 Purdy-Kunde 77898.21\n",
"3 218895 Kulas Inc 137351.96\n",
"4 239344 Stokes LLC 91535.92"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A histogram is an example of binning data and showing the visual representation of the data distribution"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introducing qcut"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXYAAAD3CAYAAAAJxX+sAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAUbUlEQVR4nO3df0xV9/3H8deVHxcFlLJJZytUWEfH0jj8MTcTWq1t6ppaYw0oYFlNW+OsVWc1oojMpSI0HW7+SG1t1pig0dpu2exYuml1q2xqjGtt1Nu6UpVVGPXXphC44L2f7x9GvuqAXq/3cLmfPR+JyeV67nm/OPfy8njuPQeXMcYIAGCNfuEOAAAILYodACxDsQOAZSh2ALAMxQ4AlokOdwBJ+uijj+R2ux2d4fV6HZ/hFLKHTyTnJ3t49GZ2r9er7Ozs/7q/TxS72+1WVlaWozM8Ho/jM5xC9vCJ5PxkD4/ezO7xeLq8n0MxAGAZih0ALEOxA4BlKHYAsAzFDgCWodgBwDKOfdzx9ddf1549e9TR0aGCggLl5eU5NQoAcB1Hiv3gwYP68MMPtW3bNrW2turNN990YgwAoAuOFHttba0yMzM1d+5cNTc3a8mSJU6MAQB0weXEL9ooLS1VQ0ODXnvtNX3xxReaM2eO3nvvPblcri6X741LCrS1tSkuLs7RGU7pK9nThmUovn94TvNuafWq/tTnvT63r2z7YJA9PHo7e1dnuTqyx56UlKSMjAzFxsYqIyNDbrdbFy5c0Ne+9rUul+eSAj3rS9mHLa0Jy9xTlY+HZRv0pW1/q8geHtZeUmDUqFHat2+fjDFqampSa2urkpKSnBgFALiJI3vsDz30kA4dOqTc3FwZY1RWVqaoqCgnRgEAbuLYxx15wxQAwoMTlADAMhQ7AFiGYgcAy1DsAGAZih0ALEOxA4BlKHYAsAzFDgCWodgBwDIUOwBYhmIHAMtQ7ABgGYodACxDsQOAZSh2ALAMxQ4AlqHYAcAyFDsAWIZiBwDLUOwAYBmKHQAsQ7EDgGUodgCwDMUOAJah2AHAMhQ7AFiGYgcAy0Q7teIpU6YoMTFRkjR06FBVVFQ4NQoAcB1Hit3r9UqSqqurnVg9AKAHjhT7J598otbWVj3zzDO6cuWKXnzxRWVnZ3e7vNfrlcfjcSJKp7a2NsdnOOXm7GnDMhTf3x3GROERjufPptdNJCH77XGk2OPi4vTss88qLy9Pp06d0qxZs/Tee+8pOrrrcW63W1lZWU5E6eTxeByf4ZSusg9bWtPrOU5VPt7rM68XjufPttdNpCB74LO64kixp6en65577pHL5VJ6erqSkpJ09uxZDRkyxIlxAIDrOPKpmHfeeUeVlZWSpKamJjU3N2vw4MFOjAIA3MSRPfbc3FwtW7ZMBQUFcrlcWr16dbeHYQAAoeVI28bGxqqqqsqJVQMAvgInKAGAZSh2ALAMxQ4AlqHYAcAyFDsAWIZiBwDLUOwAYBmKHQAsQ7EDgGUodgCwDMUOAJah2AHAMhQ7AFiGYgcAy1DsAGAZih0ALEOxA4BlKHYAsAzFDgCWodgBwDIUOwBYhmIHAMtQ7ABgGYodACxDsQOAZSh2ALCMY8V+/vx5jRs3TnV1dU6NAAB0wZFi7+joUFlZmeLi4pxYPQCgB44U+8svv6z8/HylpKQ4sXoAQA+iQ73C3/zmN0pOTtYDDzygTZs2BfQYr9crj8cT6ig3aGtrc3yGU27OnpWVFcY04ROO58+m100kIfvtCXmx//rXv5bL5dL+/fvl8XhUXFysjRs3avDgwd0+xu12O15WHo8nYgsxkrOHUji2QSRve7KHR29m7+4fkJAX+9atWztvFxUVaeXKlT2WOgAgtPi4IwBYJuR77Nerrq52cvUAgC6wxw4AlqHYAcAyFDsAWCagYj937pzTOQAAIRLQm6fz5s1TcnKycnNzNW7cOPXrx44+APRVARX7tm3bVFdXp3feeUcbN27U2LFjlZubq9TUVKfzAQBuUcC73ikpKUpNTVVcXJxOnDih8vJyrV271slsAIAgBLTHvmDBAv3jH//Q5MmT9corr+jOO++UJE2dOlULFixwNCAA4NYEVOzTpk1Tdna24uPj9eWXX3bev23bNseCAQCCE9ChmA8//FDr16+XJK1atarzqo1ut9u5ZACAoARU7Hv27NHSpUslSevWrdOePXscDQUACF5Axe5yudTe3i7p6m9HMsY4GgoAELyAjrHn5+friSeeUGZmpj7//HM999xzTucCAAQpoGLPy8vTww8/rH/+859KTU1VcnKy07kAAEEKqNg9Ho/eeusteb3ezvsqKiocCwUACF5Axb506VI99dRT+sY3vuF0HgDAbQqo2L/+9a8rLy/P6SwAgBAIqNjvvvtubdq0SVlZWXK5XJKknJwcR4MBAIITULF3dHTo5MmTOnnyZOd9FDsA9E0BFXtFRYVOnjyp+vp63XfffUpJSXE6FwAgSAEV+5YtW7Rr1y795z//0ZNPPqnTp0+rrKzM6WwAgCAEdOZpTU2NNm/erMTERD399NM6cuSI07kAAEEKqNivXULg2hunsbGxziUCANyWgA7FTJo0STNmzFBDQ4NmzZqlRx55xOlcAIAgBVTsTz31lMaOHasTJ04oPT1d3/72t53OBQAIUkDFvmHDhs7bdXV12r17t1544QXHQgEAghfwmafS1WPtx48fl9/vdzQUACB4AV+293pfddlen8+n0tJSnTx5UlFRUaqoqFBaWlrwKQEAAQuo2K8/4/Ts2bNqbGzscfm9e/dKkrZv366DBw+qoqJCGzduvI2YAIBABVTs15+M5Ha7tWTJkh6Xf+SRRzR+/HhJUkNDQ+ehnO54vV55PJ5AogStra3N8RlOuTl7VlZWGNOER1uHT3ExUb0+N3VYhjWvm0hC9tsTULFXV1ff+oqjo1VcXKxdu3Zp3bp1PS7rdrsdLyuPxxOxhRjJ2UMlLiZKw5bW9PrcU5WPR+y2j+TXDdkDn9WVgIp98uTJamlpkdvt7vxlG8YYuVwuvf/++90+7uWXX9bixYs1bdo01dTUaMCAAUFEBwDcioCKfcSIEZoyZYpGjBihTz/9VL/61a+0atWqbpf/7W9/q6amJs2ePVv9+/eXy+VSVFTv/zcaAP4XBVTsdXV1GjFihCTpvvvuU2NjY4+XFXj00Ue1bNkyzZgxQ1euXFFJSYncbndoEgMAehRQsScmJuqXv/ylhg8frsOHD+uuu+7qcfkBAwZo7dq1IQkIALg1AV0ErKqqSgkJCdq3b59SU1NVXl7udC4AQJACKna3261BgwbpjjvuUHp6ui5duuR0LgBAkAIq9rKyMjU0NOivf/2rWlpaVFxc7HQuAECQAir2+vp6LViwQLGxsZowYYIuX77sdC4AQJACKnafz6cLFy7I5XKpublZ/foF9DAAQBgE9KmYhQsXqqCgQGfPntX06dO1fPlyp3MBAIIUULE3Njbqj3/8oy5cuKA77rij81fkAQD6noCOqezYsUOSlJycTKkDQB8X0B57e3u7pkyZovT09M7j61VVVY4GAwAEp8dif/XVV/X8889r8eLFampq0p133tlbuQAAQerxUMyBAwckSWPGjNHbb7+tMWPGdP4BAPRNPRa7MabL2wCAvqvHYr/+jVLeNAWAyNDjMfZjx44pPz9fxhh99tlnnbddLpe2b9/eWxkBALegx2LfuXNnb+UAAIRIj8V+991391YOAECIcNEXALAMxQ4AlqHYAcAyFDsAWIZiBwDLUOwAYBmKHQAsQ7EDgGUodgCwDMUOAJYJ6Dco3YqOjg6VlJTozJkzam9v15w5c/Twww+HegwAoBshL/adO3cqKSlJr7zyii5evKgnn3ySYgeAXhTyYv/hD3+oiRMndn4dFRUV6hEAgB6EvNjj4+MlSc3NzZo/f75+8pOffOVjvF6vPB5PUPPShmUovr/7K5fLysoKav3daWn1qv7U5yFdZ3fa2tpu2D6h/l7Qs2Bfm+F28+smkpD99oS82CWpsbFRc+fOVWFhoZ544omvXN7tdt9WWQ1bWhP0Y4N1qvLxXitYj8dDmYdRpG77SH7dkD3wWV0JebGfO3dOzzzzjMrKyjR27NhQrx4A8BVC/nHH1157TZcuXdKrr76qoqIiFRUVqa2tLdRjAADdCPkee2lpqUpLS0O9WgBAgDhBCQAsQ7EDgGUodgCwDMUOAJah2AHAMhQ7AFiGYgcAy1DsAGAZih0ALEOxA4BlKHYAsAzFDgCWodgBwDIUOwBYhmIHAMtQ7ABgGYodACxDsQOAZSh2ALAMxQ4AlqHYAcAyFDsAWIZiBwDLUOwAYBmKHQAsQ7EDgGUodgCwjGPFfuTIERUVFTm1egBAN6KdWOkbb7yhnTt3qn///k6sHgDQA0eKPS0tTevXr9eSJUsCWt7r9crj8QQ1KysrK6jH3a62Dp/iYqJ6ZVa4vkf07vN8s9b2K+ofG/yPaLCvm5ZWr+pPfR703FBoa2sLuhOCkTYsQ/H93SFZ161sd6e2tSPFPnHiRH3xxRcBL+92uyOuvOJiojRsaU1YZp+qfDwsc/8Xhft5DsfsU5WPh/3n0ePx9HqGSNzW3f3jx5unAGAZih0ALEOxA4BlHCv2oUOHaseOHU6tHgDQDfbYAcAyFDsAWIZiBwDLUOwAYBmKHQAsQ7EDgGUodgCwDMUOAJah2AHAMhQ7AFiGYgcAy1DsAGAZih0ALEOxA4BlKHYAsAzFDgCWodgBwDIUOwBYhmIHAMtQ7ABgGYodACxDsQOAZSh2ALAMxQ4AlqHYAcAyFDsAWCbaiZX6/X6tXLlSn376qWJjY7Vq1Srdc889TowCANzEkT323bt3q729XW+99ZYWLVqkyspKJ8YAALrgSLEfPnxYDzzwgCQpOztbR48edWIMAKALLmOMCfVKly9frkcffVTjxo2TJI0fP167d+9WdHTXR34++ugjud3uUMcAAKt5vV5lZ2f/1/2OHGNPSEhQS0tL59d+v7/bUpfUZTAAQHAcORQzcuRIffDBB5Ku7o1nZmY6MQYA0AVHDsVc+1TMiRMnZIzR6tWr9c1vfjPUYwAAXXCk2AEA4cMJSgBgGYodACxDsQOAZSKy2F9//XVNnz5dU6dO1dtvv63Tp0+roKBAhYWF+ulPfyq/3y9J2rFjh6ZOnapp06Zp7969kqS2tjbNmzdPhYWFmjVrli5cuCDp6qd38vLylJ+frw0bNjiWvaOjQ4sWLVJ+fr4KCwtVV1cXEfmPHDmioqIiSXI074YNG5Sbm6v8/Hx9/PHHIc/u8XhUWFiooqIiPfvsszp37lzEZL/m3Xff1fTp0zu/joTs58+f15w5czRjxgzl5+ervr4+YrJ7PB5NmzZNBQUFWrZsWZ9/vUuSTIQ5cOCAmT17tvH5fKa5udmsW7fOzJ492xw4cMAYY8yKFSvMn/70J/Pll1+aSZMmGa/Xay5dutR5+8033zTr1q0zxhjz+9//3rz00kvGGGMmT55sTp8+bfx+v3nuuefM0aNHHcm/a9cuM3/+fGOMMbW1teaFF17o8/k3bdpkJk2aZPLy8owxxrG8R48eNUVFRcbv95szZ86YqVOnhjz7jBkzzPHjx40xxmzbts2sXr06YrIbY8zx48fNj370o877IiV7cXGxqampMcYYs3//frN3796Iyf7888+bP//5z8YYY1588UXz/vvv99ns10TcHnttba0yMzM1d+5c/fjHP9b48eN17NgxjRkzRpL04IMP6m9/+5s+/vhjjRgxQrGxsUpMTFRaWpo++eSTGy538OCDD2r//v1qbm5We3u70tLS5HK5lJOTo/379zuSPz09XT6fT36/X83NzYqOju7z+dPS0rR+/frOr53Ke/jwYeXk5Mjlcumuu+6Sz+fr3OMJVfY1a9YoKytLkuTz+eR2uyMm+8WLF/Xzn/9cJSUlnfdFSva///3vampq0syZM/Xuu+9qzJgxEZM9KytL//73v2WMUUtLi6Kjo/ts9msirtgvXryoo0ePau3atfrZz36mxYsXyxgjl8slSYqPj9fly5fV3NysxMTEzsfFx8erubn5hvuvXzYhIeGGZS9fvuxI/gEDBujMmTN67LHHtGLFChUVFfX5/BMnTrzhzGGn8jrxfdycPSUlRdLVotmyZYtmzpwZEdl9Pp+WL1+ukpISxcfHdy4TCdkl6cyZMxo4cKA2b96sIUOG6I033oiY7MOGDVN5ebkee+wxnT9/Xt///vf7bPZrHLmkgJOSkpKUkZGh2NhYZWRkyO1261//+lfn37e0tGjgwIH/dVmDlpYWJSYm3nB/T8sOHDjQkfybN29WTk6OFi1apMbGRj399NPq6OiImPyS1K/f/+8PhDJvTExMl+sItT/84Q/auHGjNm3apOTk5IjIfuzYMZ0+fVorV66U1+vVZ599pvLycv3gBz/o89mlqz+3EyZMkCRNmDBBv/jFL3T//fdHRPby8nJt3bpV3/rWt7R161ZVVlYqJyenT2ePuD32UaNGad++fTLGqKmpSa2trRo7dqwOHjwoSfrggw80evRoDR8+XIcPH5bX69Xly5dVV1enzMxMjRw5Un/5y186lx01apQSEhIUExOj+vp6GWNUW1ur0aNHO5J/4MCBnU/eoEGDdOXKFX3nO9+JmPySHMs7cuRI1dbWyu/3q6GhQX6/X8nJySHN/rvf/U5btmxRdXW1UlNTJSkisg8fPlw1NTWqrq7WmjVrdO+992r58uURkV26+nN7Lc+hQ4d07733Rkz2QYMGde5Zp6Sk6NKlS30+e8TtsT/00EM6dOiQcnNzZYxRWVmZhg4dqhUrVmjNmjXKyMjQxIkTFRUVpaKiIhUWFsoYo4ULF8rtdqugoEDFxcUqKChQTEyMqqqqJKnzsI7P51NOTo6++93vOpJ/5syZKikpUWFhoTo6OrRw4ULdf//9EZNfkoqLix3LO3r0aE2fPl1+v19lZWUhze3z+VReXq4hQ4Zo3rx5kqTvfe97mj9/fp/P3p3BgwdHRPbi4mKVlpZq+/btSkhIUFVVlQYNGhQR2VetWqWFCxcqOjpaMTExeumll/r8dueSAgBgmYg7FAMA6BnFDgCWodgBwDIUOwBYhmIHAMtQ7ABgGYodACzzfxcvBS+/c6VNAAAAAElFTkSuQmCC\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df['ext price'].plot(kind='hist')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Describe shows how data can be cut by percentiles"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 20.000000\n",
"mean 101711.287500\n",
"std 27037.449673\n",
"min 55733.050000\n",
"25% 89137.707500\n",
"50% 100271.535000\n",
"75% 110132.552500\n",
"max 184793.700000\n",
"Name: ext price, dtype: float64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['ext price'].describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here is an example of using [qcut](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 (55733.049000000006, 89137.708]\n",
"1 (89137.708, 100271.535]\n",
"2 (55733.049000000006, 89137.708]\n",
"3 (110132.552, 184793.7]\n",
"4 (89137.708, 100271.535]\n",
"5 (89137.708, 100271.535]\n",
"6 (55733.049000000006, 89137.708]\n",
"7 (100271.535, 110132.552]\n",
"8 (110132.552, 184793.7]\n",
"9 (110132.552, 184793.7]\n",
"10 (89137.708, 100271.535]\n",
"11 (55733.049000000006, 89137.708]\n",
"12 (55733.049000000006, 89137.708]\n",
"13 (89137.708, 100271.535]\n",
"14 (100271.535, 110132.552]\n",
"15 (110132.552, 184793.7]\n",
"16 (100271.535, 110132.552]\n",
"17 (110132.552, 184793.7]\n",
"18 (100271.535, 110132.552]\n",
"19 (100271.535, 110132.552]\n",
"Name: ext price, dtype: category\n",
"Categories (4, interval[float64]): [(55733.049000000006, 89137.708] < (89137.708, 100271.535] < (100271.535, 110132.552] < (110132.552, 184793.7]]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.qcut(df['ext price'], q=4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Assign the results of the values back to the original dataframe"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"df['quantile_ex_1'] = pd.qcut(df['ext price'], q=4)\n",
"df['quantile_ex_2'] = pd.qcut(df['ext price'], q=10, precision=0)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account number | \n",
" name | \n",
" ext price | \n",
" quantile_ex_1 | \n",
" quantile_ex_2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 141962 | \n",
" Herman LLC | \n",
" 63626.03 | \n",
" (55733.049000000006, 89137.708] | \n",
" (55732.0, 76471.0] | \n",
"
\n",
" \n",
" 1 | \n",
" 146832 | \n",
" Kiehn-Spinka | \n",
" 99608.77 | \n",
" (89137.708, 100271.535] | \n",
" (95908.0, 100272.0] | \n",
"
\n",
" \n",
" 2 | \n",
" 163416 | \n",
" Purdy-Kunde | \n",
" 77898.21 | \n",
" (55733.049000000006, 89137.708] | \n",
" (76471.0, 87168.0] | \n",
"
\n",
" \n",
" 3 | \n",
" 218895 | \n",
" Kulas Inc | \n",
" 137351.96 | \n",
" (110132.552, 184793.7] | \n",
" (124778.0, 184794.0] | \n",
"
\n",
" \n",
" 4 | \n",
" 239344 | \n",
" Stokes LLC | \n",
" 91535.92 | \n",
" (89137.708, 100271.535] | \n",
" (90686.0, 95908.0] | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account number name ext price quantile_ex_1 \\\n",
"0 141962 Herman LLC 63626.03 (55733.049000000006, 89137.708] \n",
"1 146832 Kiehn-Spinka 99608.77 (89137.708, 100271.535] \n",
"2 163416 Purdy-Kunde 77898.21 (55733.049000000006, 89137.708] \n",
"3 218895 Kulas Inc 137351.96 (110132.552, 184793.7] \n",
"4 239344 Stokes LLC 91535.92 (89137.708, 100271.535] \n",
"\n",
" quantile_ex_2 \n",
"0 (55732.0, 76471.0] \n",
"1 (95908.0, 100272.0] \n",
"2 (76471.0, 87168.0] \n",
"3 (124778.0, 184794.0] \n",
"4 (90686.0, 95908.0] "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Look at the distribution"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(110132.552, 184793.7] 5\n",
"(100271.535, 110132.552] 5\n",
"(89137.708, 100271.535] 5\n",
"(55733.049000000006, 89137.708] 5\n",
"Name: quantile_ex_1, dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['quantile_ex_1'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(124778.0, 184794.0] 2\n",
"(112290.0, 124778.0] 2\n",
"(105938.0, 112290.0] 2\n",
"(103606.0, 105938.0] 2\n",
"(100272.0, 103606.0] 2\n",
"(95908.0, 100272.0] 2\n",
"(90686.0, 95908.0] 2\n",
"(87168.0, 90686.0] 2\n",
"(76471.0, 87168.0] 2\n",
"(55732.0, 76471.0] 2\n",
"Name: quantile_ex_2, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['quantile_ex_2'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account number | \n",
" name | \n",
" ext price | \n",
" quantile_ex_1 | \n",
" quantile_ex_2 | \n",
" quantile_ex_3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 141962 | \n",
" Herman LLC | \n",
" 63626.03 | \n",
" (55733.049000000006, 89137.708] | \n",
" (55732.0, 76471.0] | \n",
" Bronze | \n",
"
\n",
" \n",
" 1 | \n",
" 146832 | \n",
" Kiehn-Spinka | \n",
" 99608.77 | \n",
" (89137.708, 100271.535] | \n",
" (95908.0, 100272.0] | \n",
" Gold | \n",
"
\n",
" \n",
" 2 | \n",
" 163416 | \n",
" Purdy-Kunde | \n",
" 77898.21 | \n",
" (55733.049000000006, 89137.708] | \n",
" (76471.0, 87168.0] | \n",
" Bronze | \n",
"
\n",
" \n",
" 3 | \n",
" 218895 | \n",
" Kulas Inc | \n",
" 137351.96 | \n",
" (110132.552, 184793.7] | \n",
" (124778.0, 184794.0] | \n",
" Diamond | \n",
"
\n",
" \n",
" 4 | \n",
" 239344 | \n",
" Stokes LLC | \n",
" 91535.92 | \n",
" (89137.708, 100271.535] | \n",
" (90686.0, 95908.0] | \n",
" Silver | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account number name ext price quantile_ex_1 \\\n",
"0 141962 Herman LLC 63626.03 (55733.049000000006, 89137.708] \n",
"1 146832 Kiehn-Spinka 99608.77 (89137.708, 100271.535] \n",
"2 163416 Purdy-Kunde 77898.21 (55733.049000000006, 89137.708] \n",
"3 218895 Kulas Inc 137351.96 (110132.552, 184793.7] \n",
"4 239344 Stokes LLC 91535.92 (89137.708, 100271.535] \n",
"\n",
" quantile_ex_2 quantile_ex_3 \n",
"0 (55732.0, 76471.0] Bronze \n",
"1 (95908.0, 100272.0] Gold \n",
"2 (76471.0, 87168.0] Bronze \n",
"3 (124778.0, 184794.0] Diamond \n",
"4 (90686.0, 95908.0] Silver "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bin_labels_5 = ['Bronze', 'Silver', 'Gold', 'Platinum', 'Diamond']\n",
"df['quantile_ex_3'] = pd.qcut(df['ext price'],\n",
" q=[0, .2, .4, .6, .8, 1],\n",
" labels=bin_labels_5)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Diamond 4\n",
"Platinum 4\n",
"Gold 4\n",
"Silver 4\n",
"Bronze 4\n",
"Name: quantile_ex_3, dtype: int64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['quantile_ex_3'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"results, bin_edges = pd.qcut(df['ext price'],\n",
" q=[0, .2, .4, .6, .8, 1],\n",
" labels=bin_labels_5,\n",
" retbins=True)\n",
"\n",
"results_table = pd.DataFrame(zip(bin_edges, bin_labels_5),\n",
" columns=['Threshold', 'Tier'])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Threshold | \n",
" Tier | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 55733.050 | \n",
" Bronze | \n",
"
\n",
" \n",
" 1 | \n",
" 87167.958 | \n",
" Silver | \n",
"
\n",
" \n",
" 2 | \n",
" 95908.156 | \n",
" Gold | \n",
"
\n",
" \n",
" 3 | \n",
" 103605.970 | \n",
" Platinum | \n",
"
\n",
" \n",
" 4 | \n",
" 112290.054 | \n",
" Diamond | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Threshold Tier\n",
"0 55733.050 Bronze\n",
"1 87167.958 Silver\n",
"2 95908.156 Gold\n",
"3 103605.970 Platinum\n",
"4 112290.054 Diamond"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results_table"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" quantile_ex_1 | \n",
" quantile_ex_2 | \n",
" quantile_ex_3 | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 20 | \n",
" 20 | \n",
" 20 | \n",
"
\n",
" \n",
" unique | \n",
" 4 | \n",
" 10 | \n",
" 5 | \n",
"
\n",
" \n",
" top | \n",
" (110132.552, 184793.7] | \n",
" (124778.0, 184794.0] | \n",
" Diamond | \n",
"
\n",
" \n",
" freq | \n",
" 5 | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" quantile_ex_1 quantile_ex_2 quantile_ex_3\n",
"count 20 20 20\n",
"unique 4 10 5\n",
"top (110132.552, 184793.7] (124778.0, 184794.0] Diamond\n",
"freq 5 2 4"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe(include='category')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can pass the percentiles to use to describe"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account number | \n",
" ext price | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 20.000000 | \n",
" 20.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 476998.750000 | \n",
" 101711.287500 | \n",
"
\n",
" \n",
" std | \n",
" 231499.208970 | \n",
" 27037.449673 | \n",
"
\n",
" \n",
" min | \n",
" 141962.000000 | \n",
" 55733.050000 | \n",
"
\n",
" \n",
" 0% | \n",
" 141962.000000 | \n",
" 55733.050000 | \n",
"
\n",
" \n",
" 33.3% | \n",
" 332759.333333 | \n",
" 91241.493333 | \n",
"
\n",
" \n",
" 50% | \n",
" 476006.500000 | \n",
" 100271.535000 | \n",
"
\n",
" \n",
" 66.7% | \n",
" 662511.000000 | \n",
" 104178.580000 | \n",
"
\n",
" \n",
" 100% | \n",
" 786968.000000 | \n",
" 184793.700000 | \n",
"
\n",
" \n",
" max | \n",
" 786968.000000 | \n",
" 184793.700000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account number ext price\n",
"count 20.000000 20.000000\n",
"mean 476998.750000 101711.287500\n",
"std 231499.208970 27037.449673\n",
"min 141962.000000 55733.050000\n",
"0% 141962.000000 55733.050000\n",
"33.3% 332759.333333 91241.493333\n",
"50% 476006.500000 100271.535000\n",
"66.7% 662511.000000 104178.580000\n",
"100% 786968.000000 184793.700000\n",
"max 786968.000000 184793.700000"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe(percentiles=[0, 1/3, 2/3, 1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"labels=False will return integers for each bin"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account number | \n",
" name | \n",
" ext price | \n",
" quantile_ex_1 | \n",
" quantile_ex_2 | \n",
" quantile_ex_3 | \n",
" quantile_ex_4 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 141962 | \n",
" Herman LLC | \n",
" 63626.03 | \n",
" (55733.049000000006, 89137.708] | \n",
" (55732.0, 76471.0] | \n",
" Bronze | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 146832 | \n",
" Kiehn-Spinka | \n",
" 99608.77 | \n",
" (89137.708, 100271.535] | \n",
" (95908.0, 100272.0] | \n",
" Gold | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" 163416 | \n",
" Purdy-Kunde | \n",
" 77898.21 | \n",
" (55733.049000000006, 89137.708] | \n",
" (76471.0, 87168.0] | \n",
" Bronze | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 218895 | \n",
" Kulas Inc | \n",
" 137351.96 | \n",
" (110132.552, 184793.7] | \n",
" (124778.0, 184794.0] | \n",
" Diamond | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" 239344 | \n",
" Stokes LLC | \n",
" 91535.92 | \n",
" (89137.708, 100271.535] | \n",
" (90686.0, 95908.0] | \n",
" Silver | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account number name ext price quantile_ex_1 \\\n",
"0 141962 Herman LLC 63626.03 (55733.049000000006, 89137.708] \n",
"1 146832 Kiehn-Spinka 99608.77 (89137.708, 100271.535] \n",
"2 163416 Purdy-Kunde 77898.21 (55733.049000000006, 89137.708] \n",
"3 218895 Kulas Inc 137351.96 (110132.552, 184793.7] \n",
"4 239344 Stokes LLC 91535.92 (89137.708, 100271.535] \n",
"\n",
" quantile_ex_2 quantile_ex_3 quantile_ex_4 \n",
"0 (55732.0, 76471.0] Bronze 0 \n",
"1 (95908.0, 100272.0] Gold 2 \n",
"2 (76471.0, 87168.0] Bronze 0 \n",
"3 (124778.0, 184794.0] Diamond 4 \n",
"4 (90686.0, 95908.0] Silver 1 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['quantile_ex_4'] = pd.qcut(df['ext price'],\n",
" q=[0, .2, .4, .6, .8, 1],\n",
" labels=False)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## cut"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove the added columns to make the examples shorter"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"df = df.drop(columns = ['quantile_ex_1','quantile_ex_2', 'quantile_ex_3', 'quantile_ex_4'])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 (55603.989, 87998.212]\n",
"1 (87998.212, 120263.375]\n",
"2 (55603.989, 87998.212]\n",
"3 (120263.375, 152528.538]\n",
"4 (87998.212, 120263.375]\n",
"5 (87998.212, 120263.375]\n",
"6 (55603.989, 87998.212]\n",
"7 (87998.212, 120263.375]\n",
"8 (87998.212, 120263.375]\n",
"9 (152528.538, 184793.7]\n",
"10 (87998.212, 120263.375]\n",
"11 (55603.989, 87998.212]\n",
"12 (55603.989, 87998.212]\n",
"13 (87998.212, 120263.375]\n",
"14 (87998.212, 120263.375]\n",
"15 (120263.375, 152528.538]\n",
"16 (87998.212, 120263.375]\n",
"17 (87998.212, 120263.375]\n",
"18 (87998.212, 120263.375]\n",
"19 (87998.212, 120263.375]\n",
"Name: ext price, dtype: category\n",
"Categories (4, interval[float64]): [(55603.989, 87998.212] < (87998.212, 120263.375] < (120263.375, 152528.538] < (152528.538, 184793.7]]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.cut(df['ext price'], bins=4)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(87998.212, 120263.375] 12\n",
"(55603.989, 87998.212] 5\n",
"(120263.375, 152528.538] 2\n",
"(152528.538, 184793.7] 1\n",
"Name: ext price, dtype: int64"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.cut(df['ext price'], bins=4).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"cut_labels_4 = ['silver', 'gold', 'platinum', 'diamond']\n",
"cut_bins = [0, 70000, 100000, 130000, 200000]\n",
"df['cut_ex1'] = pd.cut(df['ext price'], bins=cut_bins, labels=cut_labels_4)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account number | \n",
" name | \n",
" ext price | \n",
" cut_ex1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 141962 | \n",
" Herman LLC | \n",
" 63626.03 | \n",
" silver | \n",
"
\n",
" \n",
" 1 | \n",
" 146832 | \n",
" Kiehn-Spinka | \n",
" 99608.77 | \n",
" gold | \n",
"
\n",
" \n",
" 2 | \n",
" 163416 | \n",
" Purdy-Kunde | \n",
" 77898.21 | \n",
" gold | \n",
"
\n",
" \n",
" 3 | \n",
" 218895 | \n",
" Kulas Inc | \n",
" 137351.96 | \n",
" diamond | \n",
"
\n",
" \n",
" 4 | \n",
" 239344 | \n",
" Stokes LLC | \n",
" 91535.92 | \n",
" gold | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account number name ext price cut_ex1\n",
"0 141962 Herman LLC 63626.03 silver\n",
"1 146832 Kiehn-Spinka 99608.77 gold\n",
"2 163416 Purdy-Kunde 77898.21 gold\n",
"3 218895 Kulas Inc 137351.96 diamond\n",
"4 239344 Stokes LLC 91535.92 gold"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can use nump.linspace to define the ranges"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0., 25000., 50000., 75000., 100000., 125000., 150000.,\n",
" 175000., 200000.])"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.linspace(0, 200000, 9)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 (50000.0, 75000.0]\n",
"1 (75000.0, 100000.0]\n",
"2 (75000.0, 100000.0]\n",
"3 (125000.0, 150000.0]\n",
"4 (75000.0, 100000.0]\n",
"5 (75000.0, 100000.0]\n",
"6 (75000.0, 100000.0]\n",
"7 (100000.0, 125000.0]\n",
"8 (100000.0, 125000.0]\n",
"9 (175000.0, 200000.0]\n",
"10 (75000.0, 100000.0]\n",
"11 (50000.0, 75000.0]\n",
"12 (75000.0, 100000.0]\n",
"13 (75000.0, 100000.0]\n",
"14 (100000.0, 125000.0]\n",
"15 (100000.0, 125000.0]\n",
"16 (100000.0, 125000.0]\n",
"17 (100000.0, 125000.0]\n",
"18 (100000.0, 125000.0]\n",
"19 (100000.0, 125000.0]\n",
"Name: ext price, dtype: category\n",
"Categories (8, interval[float64]): [(0.0, 25000.0] < (25000.0, 50000.0] < (50000.0, 75000.0] < (75000.0, 100000.0] < (100000.0, 125000.0] < (125000.0, 150000.0] < (150000.0, 175000.0] < (175000.0, 200000.0]]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.cut(df['ext price'], bins=np.linspace(0, 200000, 9))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"numpy arange is another option"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0, 10000, 20000, 30000, 40000, 50000, 60000, 70000,\n",
" 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000,\n",
" 160000, 170000, 180000, 190000])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.arange(0, 200000, 10000)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"IntervalIndex([[0, 10000), [10000, 20000), [20000, 30000), [30000, 40000), [40000, 50000) ... [150000, 160000), [160000, 170000), [170000, 180000), [180000, 190000), [190000, 200000)],\n",
" closed='left',\n",
" dtype='interval[int64]')"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.interval_range(start=0, freq=10000, end=200000, closed='left')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" account number | \n",
" name | \n",
" ext price | \n",
" cut_ex1 | \n",
" cut_ex2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 141962 | \n",
" Herman LLC | \n",
" 63626.03 | \n",
" silver | \n",
" (60000, 70000] | \n",
"
\n",
" \n",
" 1 | \n",
" 146832 | \n",
" Kiehn-Spinka | \n",
" 99608.77 | \n",
" gold | \n",
" (90000, 100000] | \n",
"
\n",
" \n",
" 2 | \n",
" 163416 | \n",
" Purdy-Kunde | \n",
" 77898.21 | \n",
" gold | \n",
" (70000, 80000] | \n",
"
\n",
" \n",
" 3 | \n",
" 218895 | \n",
" Kulas Inc | \n",
" 137351.96 | \n",
" diamond | \n",
" (130000, 140000] | \n",
"
\n",
" \n",
" 4 | \n",
" 239344 | \n",
" Stokes LLC | \n",
" 91535.92 | \n",
" gold | \n",
" (90000, 100000] | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" account number name ext price cut_ex1 cut_ex2\n",
"0 141962 Herman LLC 63626.03 silver (60000, 70000]\n",
"1 146832 Kiehn-Spinka 99608.77 gold (90000, 100000]\n",
"2 163416 Purdy-Kunde 77898.21 gold (70000, 80000]\n",
"3 218895 Kulas Inc 137351.96 diamond (130000, 140000]\n",
"4 239344 Stokes LLC 91535.92 gold (90000, 100000]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"interval_range = pd.interval_range(start=0, freq=10000, end=200000)\n",
"df['cut_ex2'] = pd.cut(df['ext price'], bins=interval_range, labels=[1,2,3])\n",
"df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}