{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Binning Data with Pandas cut and qcut\n", "\n", "This notebook accompanies the article posted on [pbpython.com](http://pbpython.com/pandas-qcut-cut.html)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "sns.set_style('whitegrid')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "raw_df = pd.read_excel('https://github.com/chris1610/pbpython/blob/master/data/2018_Sales_Total_v2.xlsx?raw=true')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df = raw_df.groupby(['account number', 'name'])['ext price'].sum().reset_index()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numbernameext price
0141962Herman LLC63626.03
1146832Kiehn-Spinka99608.77
2163416Purdy-Kunde77898.21
3218895Kulas Inc137351.96
4239344Stokes LLC91535.92
\n", "
" ], "text/plain": [ " account number name ext price\n", "0 141962 Herman LLC 63626.03\n", "1 146832 Kiehn-Spinka 99608.77\n", "2 163416 Purdy-Kunde 77898.21\n", "3 218895 Kulas Inc 137351.96\n", "4 239344 Stokes LLC 91535.92" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A histogram is an example of binning data and showing the visual representation of the data distribution" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Introducing qcut" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXYAAAD3CAYAAAAJxX+sAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAUbUlEQVR4nO3df0xV9/3H8deVHxcFlLJJZytUWEfH0jj8MTcTWq1t6ppaYw0oYFlNW+OsVWc1oojMpSI0HW7+SG1t1pig0dpu2exYuml1q2xqjGtt1Nu6UpVVGPXXphC44L2f7x9GvuqAXq/3cLmfPR+JyeV67nm/OPfy8njuPQeXMcYIAGCNfuEOAAAILYodACxDsQOAZSh2ALAMxQ4AlokOdwBJ+uijj+R2ux2d4fV6HZ/hFLKHTyTnJ3t49GZ2r9er7Ozs/7q/TxS72+1WVlaWozM8Ho/jM5xC9vCJ5PxkD4/ezO7xeLq8n0MxAGAZih0ALEOxA4BlKHYAsAzFDgCWodgBwDKOfdzx9ddf1549e9TR0aGCggLl5eU5NQoAcB1Hiv3gwYP68MMPtW3bNrW2turNN990YgwAoAuOFHttba0yMzM1d+5cNTc3a8mSJU6MAQB0weXEL9ooLS1VQ0ODXnvtNX3xxReaM2eO3nvvPblcri6X741LCrS1tSkuLs7RGU7pK9nThmUovn94TvNuafWq/tTnvT63r2z7YJA9PHo7e1dnuTqyx56UlKSMjAzFxsYqIyNDbrdbFy5c0Ne+9rUul+eSAj3rS9mHLa0Jy9xTlY+HZRv0pW1/q8geHtZeUmDUqFHat2+fjDFqampSa2urkpKSnBgFALiJI3vsDz30kA4dOqTc3FwZY1RWVqaoqCgnRgEAbuLYxx15wxQAwoMTlADAMhQ7AFiGYgcAy1DsAGAZih0ALEOxA4BlKHYAsAzFDgCWodgBwDIUOwBYhmIHAMtQ7ABgGYodACxDsQOAZSh2ALAMxQ4AlqHYAcAyFDsAWIZiBwDLUOwAYBmKHQAsQ7EDgGUodgCwDMUOAJah2AHAMhQ7AFiGYgcAy0Q7teIpU6YoMTFRkjR06FBVVFQ4NQoAcB1Hit3r9UqSqqurnVg9AKAHjhT7J598otbWVj3zzDO6cuWKXnzxRWVnZ3e7vNfrlcfjcSJKp7a2NsdnOOXm7GnDMhTf3x3GROERjufPptdNJCH77XGk2OPi4vTss88qLy9Pp06d0qxZs/Tee+8pOrrrcW63W1lZWU5E6eTxeByf4ZSusg9bWtPrOU5VPt7rM68XjufPttdNpCB74LO64kixp6en65577pHL5VJ6erqSkpJ09uxZDRkyxIlxAIDrOPKpmHfeeUeVlZWSpKamJjU3N2vw4MFOjAIA3MSRPfbc3FwtW7ZMBQUFcrlcWr16dbeHYQAAoeVI28bGxqqqqsqJVQMAvgInKAGAZSh2ALAMxQ4AlqHYAcAyFDsAWIZiBwDLUOwAYBmKHQAsQ7EDgGUodgCwDMUOAJah2AHAMhQ7AFiGYgcAy1DsAGAZih0ALEOxA4BlKHYAsAzFDgCWodgBwDIUOwBYhmIHAMtQ7ABgGYodACxDsQOAZSh2ALCMY8V+/vx5jRs3TnV1dU6NAAB0wZFi7+joUFlZmeLi4pxYPQCgB44U+8svv6z8/HylpKQ4sXoAQA+iQ73C3/zmN0pOTtYDDzygTZs2BfQYr9crj8cT6ig3aGtrc3yGU27OnpWVFcY04ROO58+m100kIfvtCXmx//rXv5bL5dL+/fvl8XhUXFysjRs3avDgwd0+xu12O15WHo8nYgsxkrOHUji2QSRve7KHR29m7+4fkJAX+9atWztvFxUVaeXKlT2WOgAgtPi4IwBYJuR77Nerrq52cvUAgC6wxw4AlqHYAcAyFDsAWCagYj937pzTOQAAIRLQm6fz5s1TcnKycnNzNW7cOPXrx44+APRVARX7tm3bVFdXp3feeUcbN27U2LFjlZubq9TUVKfzAQBuUcC73ikpKUpNTVVcXJxOnDih8vJyrV271slsAIAgBLTHvmDBAv3jH//Q5MmT9corr+jOO++UJE2dOlULFixwNCAA4NYEVOzTpk1Tdna24uPj9eWXX3bev23bNseCAQCCE9ChmA8//FDr16+XJK1atarzqo1ut9u5ZACAoARU7Hv27NHSpUslSevWrdOePXscDQUACF5Axe5yudTe3i7p6m9HMsY4GgoAELyAjrHn5+friSeeUGZmpj7//HM999xzTucCAAQpoGLPy8vTww8/rH/+859KTU1VcnKy07kAAEEKqNg9Ho/eeusteb3ezvsqKiocCwUACF5Axb506VI99dRT+sY3vuF0HgDAbQqo2L/+9a8rLy/P6SwAgBAIqNjvvvtubdq0SVlZWXK5XJKknJwcR4MBAIITULF3dHTo5MmTOnnyZOd9FDsA9E0BFXtFRYVOnjyp+vp63XfffUpJSXE6FwAgSAEV+5YtW7Rr1y795z//0ZNPPqnTp0+rrKzM6WwAgCAEdOZpTU2NNm/erMTERD399NM6cuSI07kAAEEKqNivXULg2hunsbGxziUCANyWgA7FTJo0STNmzFBDQ4NmzZqlRx55xOlcAIAgBVTsTz31lMaOHasTJ04oPT1d3/72t53OBQAIUkDFvmHDhs7bdXV12r17t1544QXHQgEAghfwmafS1WPtx48fl9/vdzQUACB4AV+293pfddlen8+n0tJSnTx5UlFRUaqoqFBaWlrwKQEAAQuo2K8/4/Ts2bNqbGzscfm9e/dKkrZv366DBw+qoqJCGzduvI2YAIBABVTs15+M5Ha7tWTJkh6Xf+SRRzR+/HhJUkNDQ+ehnO54vV55PJ5AogStra3N8RlOuTl7VlZWGNOER1uHT3ExUb0+N3VYhjWvm0hC9tsTULFXV1ff+oqjo1VcXKxdu3Zp3bp1PS7rdrsdLyuPxxOxhRjJ2UMlLiZKw5bW9PrcU5WPR+y2j+TXDdkDn9WVgIp98uTJamlpkdvt7vxlG8YYuVwuvf/++90+7uWXX9bixYs1bdo01dTUaMCAAUFEBwDcioCKfcSIEZoyZYpGjBihTz/9VL/61a+0atWqbpf/7W9/q6amJs2ePVv9+/eXy+VSVFTv/zcaAP4XBVTsdXV1GjFihCTpvvvuU2NjY4+XFXj00Ue1bNkyzZgxQ1euXFFJSYncbndoEgMAehRQsScmJuqXv/ylhg8frsOHD+uuu+7qcfkBAwZo7dq1IQkIALg1AV0ErKqqSgkJCdq3b59SU1NVXl7udC4AQJACKna3261BgwbpjjvuUHp6ui5duuR0LgBAkAIq9rKyMjU0NOivf/2rWlpaVFxc7HQuAECQAir2+vp6LViwQLGxsZowYYIuX77sdC4AQJACKnafz6cLFy7I5XKpublZ/foF9DAAQBgE9KmYhQsXqqCgQGfPntX06dO1fPlyp3MBAIIUULE3Njbqj3/8oy5cuKA77rij81fkAQD6noCOqezYsUOSlJycTKkDQB8X0B57e3u7pkyZovT09M7j61VVVY4GAwAEp8dif/XVV/X8889r8eLFampq0p133tlbuQAAQerxUMyBAwckSWPGjNHbb7+tMWPGdP4BAPRNPRa7MabL2wCAvqvHYr/+jVLeNAWAyNDjMfZjx44pPz9fxhh99tlnnbddLpe2b9/eWxkBALegx2LfuXNnb+UAAIRIj8V+991391YOAECIcNEXALAMxQ4AlqHYAcAyFDsAWIZiBwDLUOwAYBmKHQAsQ7EDgGUodgCwDMUOAJYJ6Dco3YqOjg6VlJTozJkzam9v15w5c/Twww+HegwAoBshL/adO3cqKSlJr7zyii5evKgnn3ySYgeAXhTyYv/hD3+oiRMndn4dFRUV6hEAgB6EvNjj4+MlSc3NzZo/f75+8pOffOVjvF6vPB5PUPPShmUovr/7K5fLysoKav3daWn1qv7U5yFdZ3fa2tpu2D6h/l7Qs2Bfm+F28+smkpD99oS82CWpsbFRc+fOVWFhoZ544omvXN7tdt9WWQ1bWhP0Y4N1qvLxXitYj8dDmYdRpG77SH7dkD3wWV0JebGfO3dOzzzzjMrKyjR27NhQrx4A8BVC/nHH1157TZcuXdKrr76qoqIiFRUVqa2tLdRjAADdCPkee2lpqUpLS0O9WgBAgDhBCQAsQ7EDgGUodgCwDMUOAJah2AHAMhQ7AFiGYgcAy1DsAGAZih0ALEOxA4BlKHYAsAzFDgCWodgBwDIUOwBYhmIHAMtQ7ABgGYodACxDsQOAZSh2ALAMxQ4AlqHYAcAyFDsAWIZiBwDLUOwAYBmKHQAsQ7EDgGUodgCwjGPFfuTIERUVFTm1egBAN6KdWOkbb7yhnTt3qn///k6sHgDQA0eKPS0tTevXr9eSJUsCWt7r9crj8QQ1KysrK6jH3a62Dp/iYqJ6ZVa4vkf07vN8s9b2K+ofG/yPaLCvm5ZWr+pPfR703FBoa2sLuhOCkTYsQ/H93SFZ161sd6e2tSPFPnHiRH3xxRcBL+92uyOuvOJiojRsaU1YZp+qfDwsc/8Xhft5DsfsU5WPh/3n0ePx9HqGSNzW3f3jx5unAGAZih0ALEOxA4BlHCv2oUOHaseOHU6tHgDQDfbYAcAyFDsAWIZiBwDLUOwAYBmKHQAsQ7EDgGUodgCwDMUOAJah2AHAMhQ7AFiGYgcAy1DsAGAZih0ALEOxA4BlKHYAsAzFDgCWodgBwDIUOwBYhmIHAMtQ7ABgGYodACxDsQOAZSh2ALAMxQ4AlqHYAcAyFDsAWCbaiZX6/X6tXLlSn376qWJjY7Vq1Srdc889TowCANzEkT323bt3q729XW+99ZYWLVqkyspKJ8YAALrgSLEfPnxYDzzwgCQpOztbR48edWIMAKALLmOMCfVKly9frkcffVTjxo2TJI0fP167d+9WdHTXR34++ugjud3uUMcAAKt5vV5lZ2f/1/2OHGNPSEhQS0tL59d+v7/bUpfUZTAAQHAcORQzcuRIffDBB5Ku7o1nZmY6MQYA0AVHDsVc+1TMiRMnZIzR6tWr9c1vfjPUYwAAXXCk2AEA4cMJSgBgGYodACxDsQOAZSKy2F9//XVNnz5dU6dO1dtvv63Tp0+roKBAhYWF+ulPfyq/3y9J2rFjh6ZOnapp06Zp7969kqS2tjbNmzdPhYWFmjVrli5cuCDp6qd38vLylJ+frw0bNjiWvaOjQ4sWLVJ+fr4KCwtVV1cXEfmPHDmioqIiSXI074YNG5Sbm6v8/Hx9/PHHIc/u8XhUWFiooqIiPfvsszp37lzEZL/m3Xff1fTp0zu/joTs58+f15w5czRjxgzl5+ervr4+YrJ7PB5NmzZNBQUFWrZsWZ9/vUuSTIQ5cOCAmT17tvH5fKa5udmsW7fOzJ492xw4cMAYY8yKFSvMn/70J/Pll1+aSZMmGa/Xay5dutR5+8033zTr1q0zxhjz+9//3rz00kvGGGMmT55sTp8+bfx+v3nuuefM0aNHHcm/a9cuM3/+fGOMMbW1teaFF17o8/k3bdpkJk2aZPLy8owxxrG8R48eNUVFRcbv95szZ86YqVOnhjz7jBkzzPHjx40xxmzbts2sXr06YrIbY8zx48fNj370o877IiV7cXGxqampMcYYs3//frN3796Iyf7888+bP//5z8YYY1588UXz/vvv99ns10TcHnttba0yMzM1d+5c/fjHP9b48eN17NgxjRkzRpL04IMP6m9/+5s+/vhjjRgxQrGxsUpMTFRaWpo++eSTGy538OCDD2r//v1qbm5We3u70tLS5HK5lJOTo/379zuSPz09XT6fT36/X83NzYqOju7z+dPS0rR+/frOr53Ke/jwYeXk5Mjlcumuu+6Sz+fr3OMJVfY1a9YoKytLkuTz+eR2uyMm+8WLF/Xzn/9cJSUlnfdFSva///3vampq0syZM/Xuu+9qzJgxEZM9KytL//73v2WMUUtLi6Kjo/ts9msirtgvXryoo0ePau3atfrZz36mxYsXyxgjl8slSYqPj9fly5fV3NysxMTEzsfFx8erubn5hvuvXzYhIeGGZS9fvuxI/gEDBujMmTN67LHHtGLFChUVFfX5/BMnTrzhzGGn8jrxfdycPSUlRdLVotmyZYtmzpwZEdl9Pp+WL1+ukpISxcfHdy4TCdkl6cyZMxo4cKA2b96sIUOG6I033oiY7MOGDVN5ebkee+wxnT9/Xt///vf7bPZrHLmkgJOSkpKUkZGh2NhYZWRkyO1261//+lfn37e0tGjgwIH/dVmDlpYWJSYm3nB/T8sOHDjQkfybN29WTk6OFi1apMbGRj399NPq6OiImPyS1K/f/+8PhDJvTExMl+sItT/84Q/auHGjNm3apOTk5IjIfuzYMZ0+fVorV66U1+vVZ599pvLycv3gBz/o89mlqz+3EyZMkCRNmDBBv/jFL3T//fdHRPby8nJt3bpV3/rWt7R161ZVVlYqJyenT2ePuD32UaNGad++fTLGqKmpSa2trRo7dqwOHjwoSfrggw80evRoDR8+XIcPH5bX69Xly5dVV1enzMxMjRw5Un/5y186lx01apQSEhIUExOj+vp6GWNUW1ur0aNHO5J/4MCBnU/eoEGDdOXKFX3nO9+JmPySHMs7cuRI1dbWyu/3q6GhQX6/X8nJySHN/rvf/U5btmxRdXW1UlNTJSkisg8fPlw1NTWqrq7WmjVrdO+992r58uURkV26+nN7Lc+hQ4d07733Rkz2QYMGde5Zp6Sk6NKlS30+e8TtsT/00EM6dOiQcnNzZYxRWVmZhg4dqhUrVmjNmjXKyMjQxIkTFRUVpaKiIhUWFsoYo4ULF8rtdqugoEDFxcUqKChQTEyMqqqqJKnzsI7P51NOTo6++93vOpJ/5syZKikpUWFhoTo6OrRw4ULdf//9EZNfkoqLix3LO3r0aE2fPl1+v19lZWUhze3z+VReXq4hQ4Zo3rx5kqTvfe97mj9/fp/P3p3BgwdHRPbi4mKVlpZq+/btSkhIUFVVlQYNGhQR2VetWqWFCxcqOjpaMTExeumll/r8dueSAgBgmYg7FAMA6BnFDgCWodgBwDIUOwBYhmIHAMtQ7ABgGYodACzzfxcvBS+/c6VNAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df['ext price'].plot(kind='hist')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Describe shows how data can be cut by percentiles" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 20.000000\n", "mean 101711.287500\n", "std 27037.449673\n", "min 55733.050000\n", "25% 89137.707500\n", "50% 100271.535000\n", "75% 110132.552500\n", "max 184793.700000\n", "Name: ext price, dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['ext price'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is an example of using [qcut](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 (55733.049000000006, 89137.708]\n", "1 (89137.708, 100271.535]\n", "2 (55733.049000000006, 89137.708]\n", "3 (110132.552, 184793.7]\n", "4 (89137.708, 100271.535]\n", "5 (89137.708, 100271.535]\n", "6 (55733.049000000006, 89137.708]\n", "7 (100271.535, 110132.552]\n", "8 (110132.552, 184793.7]\n", "9 (110132.552, 184793.7]\n", "10 (89137.708, 100271.535]\n", "11 (55733.049000000006, 89137.708]\n", "12 (55733.049000000006, 89137.708]\n", "13 (89137.708, 100271.535]\n", "14 (100271.535, 110132.552]\n", "15 (110132.552, 184793.7]\n", "16 (100271.535, 110132.552]\n", "17 (110132.552, 184793.7]\n", "18 (100271.535, 110132.552]\n", "19 (100271.535, 110132.552]\n", "Name: ext price, dtype: category\n", "Categories (4, interval[float64]): [(55733.049000000006, 89137.708] < (89137.708, 100271.535] < (100271.535, 110132.552] < (110132.552, 184793.7]]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.qcut(df['ext price'], q=4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Assign the results of the values back to the original dataframe" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "df['quantile_ex_1'] = pd.qcut(df['ext price'], q=4)\n", "df['quantile_ex_2'] = pd.qcut(df['ext price'], q=10, precision=0)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numbernameext pricequantile_ex_1quantile_ex_2
0141962Herman LLC63626.03(55733.049000000006, 89137.708](55732.0, 76471.0]
1146832Kiehn-Spinka99608.77(89137.708, 100271.535](95908.0, 100272.0]
2163416Purdy-Kunde77898.21(55733.049000000006, 89137.708](76471.0, 87168.0]
3218895Kulas Inc137351.96(110132.552, 184793.7](124778.0, 184794.0]
4239344Stokes LLC91535.92(89137.708, 100271.535](90686.0, 95908.0]
\n", "
" ], "text/plain": [ " account number name ext price quantile_ex_1 \\\n", "0 141962 Herman LLC 63626.03 (55733.049000000006, 89137.708] \n", "1 146832 Kiehn-Spinka 99608.77 (89137.708, 100271.535] \n", "2 163416 Purdy-Kunde 77898.21 (55733.049000000006, 89137.708] \n", "3 218895 Kulas Inc 137351.96 (110132.552, 184793.7] \n", "4 239344 Stokes LLC 91535.92 (89137.708, 100271.535] \n", "\n", " quantile_ex_2 \n", "0 (55732.0, 76471.0] \n", "1 (95908.0, 100272.0] \n", "2 (76471.0, 87168.0] \n", "3 (124778.0, 184794.0] \n", "4 (90686.0, 95908.0] " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Look at the distribution" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(110132.552, 184793.7] 5\n", "(100271.535, 110132.552] 5\n", "(89137.708, 100271.535] 5\n", "(55733.049000000006, 89137.708] 5\n", "Name: quantile_ex_1, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['quantile_ex_1'].value_counts()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(124778.0, 184794.0] 2\n", "(112290.0, 124778.0] 2\n", "(105938.0, 112290.0] 2\n", "(103606.0, 105938.0] 2\n", "(100272.0, 103606.0] 2\n", "(95908.0, 100272.0] 2\n", "(90686.0, 95908.0] 2\n", "(87168.0, 90686.0] 2\n", "(76471.0, 87168.0] 2\n", "(55732.0, 76471.0] 2\n", "Name: quantile_ex_2, dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['quantile_ex_2'].value_counts()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numbernameext pricequantile_ex_1quantile_ex_2quantile_ex_3
0141962Herman LLC63626.03(55733.049000000006, 89137.708](55732.0, 76471.0]Bronze
1146832Kiehn-Spinka99608.77(89137.708, 100271.535](95908.0, 100272.0]Gold
2163416Purdy-Kunde77898.21(55733.049000000006, 89137.708](76471.0, 87168.0]Bronze
3218895Kulas Inc137351.96(110132.552, 184793.7](124778.0, 184794.0]Diamond
4239344Stokes LLC91535.92(89137.708, 100271.535](90686.0, 95908.0]Silver
\n", "
" ], "text/plain": [ " account number name ext price quantile_ex_1 \\\n", "0 141962 Herman LLC 63626.03 (55733.049000000006, 89137.708] \n", "1 146832 Kiehn-Spinka 99608.77 (89137.708, 100271.535] \n", "2 163416 Purdy-Kunde 77898.21 (55733.049000000006, 89137.708] \n", "3 218895 Kulas Inc 137351.96 (110132.552, 184793.7] \n", "4 239344 Stokes LLC 91535.92 (89137.708, 100271.535] \n", "\n", " quantile_ex_2 quantile_ex_3 \n", "0 (55732.0, 76471.0] Bronze \n", "1 (95908.0, 100272.0] Gold \n", "2 (76471.0, 87168.0] Bronze \n", "3 (124778.0, 184794.0] Diamond \n", "4 (90686.0, 95908.0] Silver " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bin_labels_5 = ['Bronze', 'Silver', 'Gold', 'Platinum', 'Diamond']\n", "df['quantile_ex_3'] = pd.qcut(df['ext price'],\n", " q=[0, .2, .4, .6, .8, 1],\n", " labels=bin_labels_5)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Diamond 4\n", "Platinum 4\n", "Gold 4\n", "Silver 4\n", "Bronze 4\n", "Name: quantile_ex_3, dtype: int64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['quantile_ex_3'].value_counts()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "results, bin_edges = pd.qcut(df['ext price'],\n", " q=[0, .2, .4, .6, .8, 1],\n", " labels=bin_labels_5,\n", " retbins=True)\n", "\n", "results_table = pd.DataFrame(zip(bin_edges, bin_labels_5),\n", " columns=['Threshold', 'Tier'])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ThresholdTier
055733.050Bronze
187167.958Silver
295908.156Gold
3103605.970Platinum
4112290.054Diamond
\n", "
" ], "text/plain": [ " Threshold Tier\n", "0 55733.050 Bronze\n", "1 87167.958 Silver\n", "2 95908.156 Gold\n", "3 103605.970 Platinum\n", "4 112290.054 Diamond" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results_table" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quantile_ex_1quantile_ex_2quantile_ex_3
count202020
unique4105
top(110132.552, 184793.7](124778.0, 184794.0]Diamond
freq524
\n", "
" ], "text/plain": [ " quantile_ex_1 quantile_ex_2 quantile_ex_3\n", "count 20 20 20\n", "unique 4 10 5\n", "top (110132.552, 184793.7] (124778.0, 184794.0] Diamond\n", "freq 5 2 4" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe(include='category')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can pass the percentiles to use to describe" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numberext price
count20.00000020.000000
mean476998.750000101711.287500
std231499.20897027037.449673
min141962.00000055733.050000
0%141962.00000055733.050000
33.3%332759.33333391241.493333
50%476006.500000100271.535000
66.7%662511.000000104178.580000
100%786968.000000184793.700000
max786968.000000184793.700000
\n", "
" ], "text/plain": [ " account number ext price\n", "count 20.000000 20.000000\n", "mean 476998.750000 101711.287500\n", "std 231499.208970 27037.449673\n", "min 141962.000000 55733.050000\n", "0% 141962.000000 55733.050000\n", "33.3% 332759.333333 91241.493333\n", "50% 476006.500000 100271.535000\n", "66.7% 662511.000000 104178.580000\n", "100% 786968.000000 184793.700000\n", "max 786968.000000 184793.700000" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe(percentiles=[0, 1/3, 2/3, 1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "labels=False will return integers for each bin" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numbernameext pricequantile_ex_1quantile_ex_2quantile_ex_3quantile_ex_4
0141962Herman LLC63626.03(55733.049000000006, 89137.708](55732.0, 76471.0]Bronze0
1146832Kiehn-Spinka99608.77(89137.708, 100271.535](95908.0, 100272.0]Gold2
2163416Purdy-Kunde77898.21(55733.049000000006, 89137.708](76471.0, 87168.0]Bronze0
3218895Kulas Inc137351.96(110132.552, 184793.7](124778.0, 184794.0]Diamond4
4239344Stokes LLC91535.92(89137.708, 100271.535](90686.0, 95908.0]Silver1
\n", "
" ], "text/plain": [ " account number name ext price quantile_ex_1 \\\n", "0 141962 Herman LLC 63626.03 (55733.049000000006, 89137.708] \n", "1 146832 Kiehn-Spinka 99608.77 (89137.708, 100271.535] \n", "2 163416 Purdy-Kunde 77898.21 (55733.049000000006, 89137.708] \n", "3 218895 Kulas Inc 137351.96 (110132.552, 184793.7] \n", "4 239344 Stokes LLC 91535.92 (89137.708, 100271.535] \n", "\n", " quantile_ex_2 quantile_ex_3 quantile_ex_4 \n", "0 (55732.0, 76471.0] Bronze 0 \n", "1 (95908.0, 100272.0] Gold 2 \n", "2 (76471.0, 87168.0] Bronze 0 \n", "3 (124778.0, 184794.0] Diamond 4 \n", "4 (90686.0, 95908.0] Silver 1 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['quantile_ex_4'] = pd.qcut(df['ext price'],\n", " q=[0, .2, .4, .6, .8, 1],\n", " labels=False)\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## cut" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Remove the added columns to make the examples shorter" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "df = df.drop(columns = ['quantile_ex_1','quantile_ex_2', 'quantile_ex_3', 'quantile_ex_4'])" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 (55603.989, 87998.212]\n", "1 (87998.212, 120263.375]\n", "2 (55603.989, 87998.212]\n", "3 (120263.375, 152528.538]\n", "4 (87998.212, 120263.375]\n", "5 (87998.212, 120263.375]\n", "6 (55603.989, 87998.212]\n", "7 (87998.212, 120263.375]\n", "8 (87998.212, 120263.375]\n", "9 (152528.538, 184793.7]\n", "10 (87998.212, 120263.375]\n", "11 (55603.989, 87998.212]\n", "12 (55603.989, 87998.212]\n", "13 (87998.212, 120263.375]\n", "14 (87998.212, 120263.375]\n", "15 (120263.375, 152528.538]\n", "16 (87998.212, 120263.375]\n", "17 (87998.212, 120263.375]\n", "18 (87998.212, 120263.375]\n", "19 (87998.212, 120263.375]\n", "Name: ext price, dtype: category\n", "Categories (4, interval[float64]): [(55603.989, 87998.212] < (87998.212, 120263.375] < (120263.375, 152528.538] < (152528.538, 184793.7]]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.cut(df['ext price'], bins=4)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(87998.212, 120263.375] 12\n", "(55603.989, 87998.212] 5\n", "(120263.375, 152528.538] 2\n", "(152528.538, 184793.7] 1\n", "Name: ext price, dtype: int64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.cut(df['ext price'], bins=4).value_counts()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "cut_labels_4 = ['silver', 'gold', 'platinum', 'diamond']\n", "cut_bins = [0, 70000, 100000, 130000, 200000]\n", "df['cut_ex1'] = pd.cut(df['ext price'], bins=cut_bins, labels=cut_labels_4)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numbernameext pricecut_ex1
0141962Herman LLC63626.03silver
1146832Kiehn-Spinka99608.77gold
2163416Purdy-Kunde77898.21gold
3218895Kulas Inc137351.96diamond
4239344Stokes LLC91535.92gold
\n", "
" ], "text/plain": [ " account number name ext price cut_ex1\n", "0 141962 Herman LLC 63626.03 silver\n", "1 146832 Kiehn-Spinka 99608.77 gold\n", "2 163416 Purdy-Kunde 77898.21 gold\n", "3 218895 Kulas Inc 137351.96 diamond\n", "4 239344 Stokes LLC 91535.92 gold" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can use nump.linspace to define the ranges" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0., 25000., 50000., 75000., 100000., 125000., 150000.,\n", " 175000., 200000.])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.linspace(0, 200000, 9)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 (50000.0, 75000.0]\n", "1 (75000.0, 100000.0]\n", "2 (75000.0, 100000.0]\n", "3 (125000.0, 150000.0]\n", "4 (75000.0, 100000.0]\n", "5 (75000.0, 100000.0]\n", "6 (75000.0, 100000.0]\n", "7 (100000.0, 125000.0]\n", "8 (100000.0, 125000.0]\n", "9 (175000.0, 200000.0]\n", "10 (75000.0, 100000.0]\n", "11 (50000.0, 75000.0]\n", "12 (75000.0, 100000.0]\n", "13 (75000.0, 100000.0]\n", "14 (100000.0, 125000.0]\n", "15 (100000.0, 125000.0]\n", "16 (100000.0, 125000.0]\n", "17 (100000.0, 125000.0]\n", "18 (100000.0, 125000.0]\n", "19 (100000.0, 125000.0]\n", "Name: ext price, dtype: category\n", "Categories (8, interval[float64]): [(0.0, 25000.0] < (25000.0, 50000.0] < (50000.0, 75000.0] < (75000.0, 100000.0] < (100000.0, 125000.0] < (125000.0, 150000.0] < (150000.0, 175000.0] < (175000.0, 200000.0]]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.cut(df['ext price'], bins=np.linspace(0, 200000, 9))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "numpy arange is another option" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 10000, 20000, 30000, 40000, 50000, 60000, 70000,\n", " 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000,\n", " 160000, 170000, 180000, 190000])" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.arange(0, 200000, 10000)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "IntervalIndex([[0, 10000), [10000, 20000), [20000, 30000), [30000, 40000), [40000, 50000) ... [150000, 160000), [160000, 170000), [170000, 180000), [180000, 190000), [190000, 200000)],\n", " closed='left',\n", " dtype='interval[int64]')" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.interval_range(start=0, freq=10000, end=200000, closed='left')" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numbernameext pricecut_ex1cut_ex2
0141962Herman LLC63626.03silver(60000, 70000]
1146832Kiehn-Spinka99608.77gold(90000, 100000]
2163416Purdy-Kunde77898.21gold(70000, 80000]
3218895Kulas Inc137351.96diamond(130000, 140000]
4239344Stokes LLC91535.92gold(90000, 100000]
\n", "
" ], "text/plain": [ " account number name ext price cut_ex1 cut_ex2\n", "0 141962 Herman LLC 63626.03 silver (60000, 70000]\n", "1 146832 Kiehn-Spinka 99608.77 gold (90000, 100000]\n", "2 163416 Purdy-Kunde 77898.21 gold (70000, 80000]\n", "3 218895 Kulas Inc 137351.96 diamond (130000, 140000]\n", "4 239344 Stokes LLC 91535.92 gold (90000, 100000]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "interval_range = pd.interval_range(start=0, freq=10000, end=200000)\n", "df['cut_ex2'] = pd.cut(df['ext price'], bins=interval_range, labels=[1,2,3])\n", "df.head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" } }, "nbformat": 4, "nbformat_minor": 2 }