{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Analysis - Instacart"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before carrying out market basket analysis (MBA) on the given dataset, we study the dataset and its attributes and summarize it for a better understanding."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#imports\n",
    "import pandas as pd #Python data analysis library\n",
    "import numpy as np #Python scientific computing\n",
    "import matplotlib.pyplot as plt #For plotting\n",
    "import matplotlib.mlab as mlab\n",
    "import seaborn as sns #Python visualization library\n",
    "from scipy.optimize import curve_fit\n",
    "from IPython.display import display, HTML\n",
    "\n",
    "#Plots inline\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Citation - “The Instacart Online Grocery Shopping Dataset 2017”, accessed from https://www.instacart.com/datasets/grocery-shopping-2017"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Folder that holds the raw CSV files; edit this one value if the data lives elsewhere\n",
    "RAW_DATA_DIR = \"../data/raw/\"\n",
    "\n",
    "#import dataset\n",
    "trainDf = pd.read_csv(RAW_DATA_DIR + \"order_products__train.csv\")\n",
    "orderDf = pd.read_csv(RAW_DATA_DIR + \"orders.csv\")\n",
    "depDf = pd.read_csv(RAW_DATA_DIR + \"departments.csv\")\n",
    "aisleDf = pd.read_csv(RAW_DATA_DIR + \"aisles.csv\")\n",
    "productDf = pd.read_csv(RAW_DATA_DIR + \"products.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3421083, 7)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "orderDf.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are 3421083 orders, roughly 34 lakh (about 3.4 million) orders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(49688, 4)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "productDf.shape"
   ]
  },
  {
   "cell_type":
"markdown", "metadata": {}, "source": [ "There are 49688 products, roughly 50000 products" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "scrolled": true }, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD0CAYAAACLpN0/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAFfxJREFUeJzt3X9sG3f9x/FXEtehi72Wah3SAFdLNksZk5WkaBKk2SghK1Sd0CLmBmtmaPyRlqJC6EZLtnZeCCHtRtmvrqJQMskDkmhBaH+AChFhodk6wMKEDKeIgIroCnNZgZ5H7KS+7x/99r7ku3R2l7jZPvd8/HX2+3L3ebXVKyfnrimzbdsWAOBtr3ypFwAAWBwUOgAYgkIHAENQ6ABgCAodAAxBoQOAITxLdeJEIrFUpwaAt7W1a9fO+/6SFbr0f4tKpVKqra1dyqUsGbK7M7vk7vxuzi4tLP8bXQzzkQsAGIJCBwBDUOgAYAgKHQAMQaEDgCEodAAwBIUOAIag0AHAEEv6YNGbFYu589wA8Ea4QgcAQ1DoAGAICh0ADEGhA4AhKHQAMASFDgCGoNABwBAFCz2fz2vPnj3avHmzotGoTpw4MWc+ODio1tZWhcNhjYyMSJJee+01felLX1IkEtEdd9yh8fHx0qweAOAo+GDR8PCwcrmcBgYGlEwm1dvbq4MHD0qS0um04vG4hoaGlM1mFYlE1NjYqMOHD+v666/Xvn37NDk5qcnJSYVCoZKHAQA3K3iFnkgk1NTUJEmqq6vTxMSEMxsfH1d9fb28Xq/8fr8CgYAmJyd19OhRLVu2TJ/5zGf05JNPOl8PACidglfolmXJ5/M5rysqKjQ7OyuPxyPLsuT3+51ZVVWVLMvSmTNn9O9//1uHDx/WD3/4Q+3du1f79u173bFTqZQkaXp62tkuRjp9VdH7LrZU6vSiHu9Ss5vEzdkld+d3c3apdPkLFrrP51Mmk3Fe5/N5eTyeeWeZTEZ+v18rV67Uhz/8YUnS+vXrdejQoXmPfeGXpF7qL0xdvbroXRddbe3intzNvyzXzdkld+d3c3ZpCX9JdENDg0ZHRyVJyWRSwWDQmYVCISUSCWWzWZ09e1ZTU1MKBoNau3atnnvuOUnSr371K1133XVvauEAgOIVvEJvaWnR2NiY2traZNu2enp61NfXp0AgoObmZkWjUUUiEdm2rY6ODlVWVqq9vV3333+/Nm/eLI/Ho717916OLADgagULvby8XF1dXXPeq6mpcbbD4bDC4fCc+cqVK/XEE08s0hIBAMXgwSIAMASFDgCGoNABwBAUOgAYgkIHAENQ6ABgCAodAAxBoQOAISh0ADAEhQ4AhqDQAcAQFDoAGIJCBwBDUOgAYAgKHQAMQaEDgCEodAAwBIUOAIag0AHAEBQ6ABiCQgcAQ1DoAGAICh0ADEGhA4AhKHQAMISn0A75fF6xWEzHjx+X1+tVd3e31qxZ48wHBwfV398vj8ejrVu3av369frnP/+pDRs2KBgMSpI+8pGP6K677ipdCgBA4UIfHh5WLpfTwMCAksmkent7dfDgQUlSOp1WPB7X0NCQstmsIpGIGhsb9fvf/16bNm3S7t27Sx4AAHBewY9cEomEmpqaJEl1dXWamJhwZuPj46qvr5fX65Xf71cgENDk5KQmJib00ksv6c4779T27dv1yiuvlC4BAEBSEVfolmXJ5/M5rysqKjQ7OyuPxyPLsuT3+51ZVVWVLMtSdXW1brzxRn3wgx/Us88+q+7ub
j322GOvO3YqlZIkTU9PO9vFSKevKnrfxZZKnV7U411qdpO4Obvk7vxuzi6VLn/BQvf5fMpkMs7rfD4vj8cz7yyTycjv9ysUCmn58uWSpJaWlnnLXJJqa2slnS/2C9vFWL266F0XXW3t4p78UrObxM3ZJXfnd3N2aWH5E4nERWcFP3JpaGjQ6OioJCmZTDo/6JSkUCikRCKhbDars2fPampqSsFgUPfff7+OHDkiSXrhhRf0vve9700tHABQvIJX6C0tLRobG1NbW5ts21ZPT4/6+voUCATU3NysaDSqSCQi27bV0dGhyspK7dixQ52dnfr+97+v5cuXq7u7+3JkAQBXK1jo5eXl6urqmvNeTU2Nsx0OhxUOh+fM3/ve9yoejy/SEgEAxeDBIgAwBIUOAIag0AHAEBQ6ABiCQgcAQ1DoAGAICh0ADEGhA4AhKHQAMASFDgCGoNABwBAUOgAYgkIHAENQ6ABgCAodAAxBoQOAISh0ADAEhQ4AhqDQAcAQFDoAGIJCBwBDUOgAYAgKHQAMQaEDgCEKFno+n9eePXu0efNmRaNRnThxYs58cHBQra2tCofDGhkZmTP75S9/qVtuuWVxVwwAmJen0A7Dw8PK5XIaGBhQMplUb2+vDh48KElKp9OKx+MaGhpSNptVJBJRY2OjvF6vTp06pb6+Ps3OzpY8BACgiCv0RCKhpqYmSVJdXZ0mJiac2fj4uOrr6+X1euX3+xUIBDQ5OalsNqsHHnhAsVisZAsHAMxVsNAty5LP53NeV1RUOFfdlmXJ7/c7s6qqKlmWpa6uLt19991617veVYIlAwDmU/AjF5/Pp0wm47zO5/PyeDzzzjKZjJYtW6Zf//rX+stf/qIDBw7oX//6lzo6OvSNb3zjdcdOpVKSpOnpaWe7GOn0VUXvu9hSqdOLerxLzW4SN2eX3J3fzdml0uUvWOgNDQ0aGRnRxo0blUwmFQwGnVkoFNIjjzyibDarXC6nqakphUIhHTlyxNmnsbFx3jKXpNraWknni/3CdjFWry5610VXW7u4J7/U7CZxc3bJ3fndnF1aWP5EInHRWcFCb2lp0djYmNra2mTbtnp6etTX16dAIKDm5mZFo1FFIhHZtq2Ojg5VVla+qUUCABamYKGXl5erq6trzns1NTXOdjgcVjgcvujXj42NLWB5AIBi8WARABiCQgcAQ1DoAGAICh0ADEGhA4AhKHQAMASFDgCGoNABwBAUOgAYgkIHAENQ6ABgCAodAAxBoQOAISh0ADAEhQ4AhqDQAcAQFDoAGIJCBwBDUOgAYAgKHQAMQaEDgCEodAAwBIUOAIag0AHAEBQ6ABjCU2iHfD6vWCym48ePy+v1qru7W2vWrHHmg4OD6u/vl8fj0datW7V+/Xq98soruvfeezUzM6MVK1booYceks/nK2kQAHC7glfow8PDyuVyGhgY0I4dO9Tb2+vM0um04vG4+vv7dfjwYe3fv1+5XE7f+ta3dPvtt+t73/uebrjhBj3zzDMlDQEAKOIKPZFIqKmpSZJUV1eniYkJZzY+Pq76+np5vV55vV4FAgFNTk6qs7NTtm0rn8/r1KlTuuaaa0qXAAAgqYhCtyxrzsclFRUVmp2dlcfjkWVZ8vv9zqyqqkqWZamsrEyzs7P6+Mc/rmw2q23bts177FQqJUmanp52touRTl9V9L6LLZU6vajHu9TsJnFzdsnd+d2cXSpd/oKF7vP5lMlknNf5fF4ej2feWSaTcQp+2bJl+tGPfqTnn39eO3fu1NNPP/26Y9fW1ko6X+wXtouxenXRuy662trFPfmlZjeJm7NL7s7v5uzSwvInEomLzgp+ht7Q0KDR0VFJUjKZVDAYdGahUEiJRELZbFZnz57V1NSUgsGgYrGYjh07Jun8VXtZWdmbWjgAoHgFr9BbWlo0NjamtrY22batnp4e9fX1KRAIqLm5WdFoVJFIRLZtq6OjQ5WVlYpGo4rFYjpw4IDKy8sVi8UuQxQAc
LeChV5eXq6urq4579XU1Djb4XBY4XD4dfN4PL5ISwQAFIMHiwDAEBQ6ABiCQgcAQ1DoAGAICh0ADEGhA4AhKHQAMASFDgCGoNABwBAUOgAYgkIHAENQ6ABgCAodAAxBoQOAISh0ADAEhQ4AhqDQAcAQFDoAGIJCBwBDUOgAYAgKHQAMQaEDgCEodAAwBIUOAIag0AHAEJ5CO+TzecViMR0/flxer1fd3d1as2aNMx8cHFR/f788Ho+2bt2q9evX6+WXX1ZnZ6fOnTsn27bV1dWl6urqkgYBALcreIU+PDysXC6ngYEB7dixQ729vc4snU4rHo+rv79fhw8f1v79+5XL5fToo4/qzjvvVDweV3t7u/bv31/SEACAIq7QE4mEmpqaJEl1dXWamJhwZuPj46qvr5fX65XX61UgENDk5KR27twpv98vSTp37pwqKytLtHwAwAUFC92yLPl8Pud1RUWFZmdn5fF4ZFmWU9ySVFVVJcuytGrVKknSn/70J+3du1cHDhyY99ipVEqSND097WwXI52+quh9F1sqdXpRj3ep2U3i5uySu/O7ObtUuvwFC93n8ymTyTiv8/m8PB7PvLNMJuMU/LFjx/Tggw9q3759F/38vLa2VtL5Yr+wXYzVq4veddHV1i7uyS81u0ncnF1yd343Z5cWlj+RSFx0VrDQGxoaNDIyoo0bNyqZTCoYDDqzUCikRx55RNlsVrlcTlNTUwoGgzp27Ji++tWv6tvf/rbe/e53v6lFv1XFYot7vHT6qqK+QS32eQGYp2Cht7S0aGxsTG1tbbJtWz09Perr61MgEFBzc7Oi0agikYhs21ZHR4cqKyvV09OjmZkZ7dq1S5J07bXXqqurq+RhAMDNChZ6eXn568q4pqbG2Q6HwwqHw3Pmzz777CItDwBQLB4sAgBDUOgAYAgKHQAMQaEDgCEodAAwBIUOAIag0AHAEBQ6ABiCQgcAQ1DoAGAICh0ADEGhA4AhKHQAMASFDgCGoNABwBAUOgAYgkIHAENQ6ABgCAodAAxBoQOAISh0ADAEhQ4AhqDQAcAQFDoAGKJgoefzee3Zs0ebN29WNBrViRMn5swHBwfV2tqqcDiskZGRObOnnnpKDz/88OKuGAAwL0+hHYaHh5XL5TQwMKBkMqne3l4dPHhQkpROpxWPxzU0NKRsNqtIJKLGxkbl83ndd999+t3vfqdbb7215CEAAEVcoScSCTU1NUmS6urqNDEx4czGx8dVX18vr9crv9+vQCCgyclJZbNZ3X777dqyZUvpVg4AmKNgoVuWJZ/P57yuqKjQ7OysM/P7/c6sqqpKlmVpxYoVWrduXQmWCwC4mIIfufh8PmUyGed1Pp+Xx+OZd5bJZOYUfCGpVEqSND097WwXI52+quh93+pmZ2eVTqcL7pdKnb4Mq7m8LvXv3TRuzu/m7FLp8hcs9IaGBo2MjGjjxo1KJpMKBoPOLBQK6ZFHHlE2m1Uul9PU1NSceSG1tbWSzhf7he1irF5d9K5veel0WquLCFRba1Do/3Wpf++mcXN+N2eXFpY/kUhcdFaw0FtaWjQ2Nqa2tjbZtq2enh719fUpEAioublZ0WhUkUhEtm2ro6NDlZWVb2qRAICFKVjo5eXl6urqmvNeTU2Nsx0OhxUOh+f92tbW1gUuDwBQLB4sAgBDUOgAYIiCH7ngrSEWc+e5ARSPK3QAMASFDgCGoNABwBAUOgAYgkIHAENQ6ABgCAodAAxBoQOAISh0ADAET4qioFI9KZpOX/WG/xUyT6gCl4YrdAAwBIUOAIag0AHAEBQ6ABiCQgcAQ1DoAGAIblvEW9ZS3bbI7ZJ4u+IKHQAMQaEDgCH4yAX4fy7XRy7zPSnLxz1YCK7QAcAQFDoAGKLgRy75fF6xWEzHjx+X1+tVd3e31qxZ48wHBwfV398vj8ejrVu3av369Xr11Vd1zz33aHp6WldffbW+9rWvafny5SUNApiAO3sun6XMvHlzaY5bsNCHh
4eVy+U0MDCgZDKp3t5eHTx4UJKUTqcVj8c1NDSkbDarSCSixsZGPfnkk9q0aZNaW1t16NAhDQwM6NOf/nRpEgBYsMtdbv/98wM3fjMplYKFnkgk1NTUJEmqq6vTxMSEMxsfH1d9fb28Xq+8Xq8CgYAmJyeVSCTU3t4uSbr55pu1f/9+Ch3AvCj0xVOw0C3Lks/nc15XVFRodnZWHo9HlmXJ7/c7s6qqKlmWNef9qqoqnT17dt5jJxKJebcLue22ond9m/jLUi9gCbk5u+Tu/O7N/tprl9Z5xSpY6D6fT5lMxnmdz+fl8XjmnWUyGfn9fuf9d7zjHcpkMrryyitfd9y1a9cuxvoBAP+r4F0uDQ0NGh0dlSQlk0kFg0FnFgqFlEgklM1mdfbsWU1NTSkYDKqhoUHPPfecJGl0dJTyBoDLoMy2bfuNdrhwl8sf/vAH2batnp4ejY6OKhAIqLm5WYODgxoYGJBt22pvb9eGDRt0+vRp7dy5U5lMRu985zv19a9/XVdcccXlygQArlSw0Eul0O2QJpqZmVFnZ6dOnjypXC6nrVu36rrrrtOuXbtUVlam66+/Xg888IDKy819POAf//iHWltb9Z3vfEcej8dV2b/5zW/qZz/7mWZmZvTJT35SN910kyvyz8zMaNeuXTp58qTKy8v1la98xRV/97/97W/18MMPKx6P68SJE/PmfeKJJ/Tzn/9cHo9HnZ2dCoVCCzupvUSOHDli79y507Zt2/7Nb35jb9myZamWctk888wzdnd3t23btn3mzBn7lltusdvb2+1jx47Ztm3bu3fvtn/yk58s5RJLKpfL2Z/97GftW2+91f7jH//oquzHjh2z29vb7XPnztmWZdmPPfaYa/L/9Kc/tbdv327btm0fPXrU/tznPmd89kOHDtmbNm2y77jjDtu27XnzTkxM2NFo1M7n8/bJkyft1tbWBZ93yb4lvtHtkKb66Ec/qs9//vOSJNu2VVFRoZdeekk33XSTpPO3eD7//PNLucSS2rt3r9ra2nT11VdLkquyHz16VMFgUNu2bdOWLVv0oQ99yDX5r732Wp07d075fF6WZcnj8RifPRAI6PHHH3dez5c3kUho3bp1Kisr0zXXXKNz587p1VdfXdB5l6zQL3Y7pMmqqqrk8/lkWZa2b9+uL3zhC7JtW2VlZc78Yrd4vt394Ac/0KpVq5xv4pJck12Szpw5o4mJCT366KN68MEHdc8997gm/xVXXKGTJ0/qYx/7mHbv3q1oNGp89g0bNjh3A0rz/1v//x24GH8OS/a/Lb7R7ZAmO3XqlLZt26ZIJKLbbrtNDz30kDO72C2eJhgaGlJZWZleeOEFpVIp7dy5c87ViMnZJWnlypWqrq6W1+tVdXW1Kisr9be//c2Zm5z/qaee0rp167Rjxw6dOnVKd911l2ZmZpy5ydkv+O+fD1zIe7Hbvhd0ngV99QK80e2Qpjp9+rTuvvtu3XvvvfrEJz4hSbrhhhv04osvSjp/i+f73//+pVxiyXz3u9/V008/rXg8rtraWu3du1c333yzK7JL55+7+MUvfiHbtvX3v/9d//nPf/SBD3zAFfmvvPJKp6hWrFih2dlZ1/y7v2C+vA0NDTp69Kjy+bxefvll5fN5rVq1akHnWfK7XP77dsiampqlWMpl093drR//+Meqrq523rvvvvvU3d2tmZkZVVdXq7u7WxUVFUu4ytKLRqOKxWIqLy/X7t27XZN93759evHFF2Xbtjo6OvSe97zHFfkzmYw6OzuVTqc1MzOjT33qU7rxxhuNz/7Xv/5VX/ziFzU4OKg///nP8+Z9/PHHNTo6qnw+ry9/+csL/sa2ZIUOAFhcZt34CQAuRqEDgCEodAAwBIUOAIag0AHAEBQ6ABiCQgcAQ1DoAGCI/wF5dXO4KuzpXQAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": 
"display_data" } ],
   "source": [
    "#get distribution of number of orders per customer\n",
    "sns.set_style('whitegrid')\n",
    "#the largest order_number seen for a user_id is taken as that customer's number of orders\n",
    "customerNumOrderFrame = orderDf.groupby(\"user_id\",as_index = False)[\"order_number\"].max()\n",
    "num_bins = 10\n",
    "#density=True normalises the bars so that their total area is 1\n",
    "#(it replaces the normed argument, which Matplotlib 3.1 removed)\n",
    "n, bins, patches = plt.hist(customerNumOrderFrame[\"order_number\"], bins=num_bins, density=True, color='blue', alpha=0.5)\n",
    "mu = customerNumOrderFrame[\"order_number\"].mean()\n",
    "sigma = customerNumOrderFrame[\"order_number\"].std()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Looking at the histogram, the distribution appears skewed, so an exponential function is an appropriate model for it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5,1,u'Number of Orders per Customer Distribution')"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "n, bins, patches = plt.hist(customerNumOrderFrame[\"order_number\"], bins=num_bins, density=True, facecolor='yellow', alpha=0.5)\n",
    "\n",
    "#bins holds num_bins+1 edges; averaging neighbouring edges gives the centre of every bar,\n",
    "#i.e. exactly one x value for each bar height in n, whatever the bin width turns out to be\n",
    "binCenters = (bins[:-1] + bins[1:]) / 2.0\n",
    "\n",
    "def exponenial_func(x, a, b, c):\n",
    "    \"\"\"Exponential model a*exp(-b*x) + c that is fitted to the histogram bars.\"\"\"\n",
    "    return a*np.exp(-b*x)+c\n",
    "\n",
    "#p0 is the starting guess for (a, b, c)\n",
    "popt, pcov = curve_fit(exponenial_func, binCenters, n, p0=(1, 1e-6, 1))\n",
    "\n",
    "#draw the fitted curve across the span covered by the bar centres\n",
    "xx = np.linspace(binCenters[0], binCenters[-1], 30)\n",
    "yy = exponenial_func(xx, *popt)\n",
    "\n",
    "plt.plot(xx, yy ,'r--')\n",
    "plt.xlabel(\"No. of Orders\")\n",
    "#the histogram is normalised, so the y axis shows a density rather than a raw count\n",
    "plt.ylabel(\"Density\")\n",
    "plt.title(\"Number of Orders per Customer Distribution\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To find the most frequently purchased products, we first combine the prior and train datasets into one complete order dataset by appending one to the other."
] },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "priorDf = pd.read_csv(\"../data/raw/order_products__prior.csv\")\n",
    "#pd.concat stacks the two frames; DataFrame.append did the same job but was removed in pandas 2.0.\n",
    "#The result gets its own name so trainDf stays untouched and re-running this cell cannot add the prior rows twice.\n",
    "orderProductsDf = pd.concat([trainDf, priorDf], ignore_index = True)\n",
    "#Now a product count data frame can be created by counting the order_id for each product_id\n",
    "productCountDf = orderProductsDf.groupby(\"product_id\",as_index = False)[\"order_id\"].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(49685, 2)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "productCountDf.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
product_name
0Banana
1Bag of Organic Bananas
2Organic Strawberries
3Organic Baby Spinach
4Organic Hass Avocado
5Organic Avocado
6Large Lemon
7Strawberries
8Limes
9Organic Whole Milk
10Organic Raspberries
11Organic Yellow Onion
12Organic Garlic
13Organic Zucchini
14Organic Blueberries
15Cucumber Kirby
16Organic Fuji Apple
17Organic Lemon
18Organic Grape Tomatoes
19Apple Honeycrisp Organic
\n", "
" ],
      "text/plain": [ " product_name\n", "0 Banana\n", "1 Bag of Organic Bananas\n", "2 Organic Strawberries\n", "3 Organic Baby Spinach\n", "4 Organic Hass Avocado\n", "5 Organic Avocado\n", "6 Large Lemon\n", "7 Strawberries\n", "8 Limes\n", "9 Organic Whole Milk\n", "10 Organic Raspberries\n", "11 Organic Yellow Onion\n", "12 Organic Garlic\n", "13 Organic Zucchini\n", "14 Organic Blueberries\n", "15 Cucumber Kirby\n", "16 Organic Fuji Apple\n", "17 Organic Lemon\n", "18 Organic Grape Tomatoes\n", "19 Apple Honeycrisp Organic" ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#Top 20 most frequently purchased products\n",
    "topLev = 20\n",
    "\n",
    "#The order_id column holds the per-product count, so ordering by it puts the best sellers first\n",
    "productCountDf = productCountDf.sort_values(by = \"order_id\", ascending = False)\n",
    "\n",
    "#Take the first topLev rows and look up their names in the product table\n",
    "topProdFrame = productCountDf.head(topLev).merge(productDf, on = \"product_id\")\n",
    "\n",
    "display(topProdFrame[[\"product_name\"]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A density plot of the product counts gives an idea of the extent to which we can smooth the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0,0.5,u'Density')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png":
"iVBORw0KGgoAAAANSUhEUgAAAZIAAAEWCAYAAABMoxE0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3XucXVV99/HPd2aSELmEEIYqSSCxROlgK8I01UfaUmMleIt9Cm2oF7BU2gpqvdSGahGpqdKnAvoCWqNQAyoJTWs72rTIVeWxBAaIQoDIGAJJiDIEEq65zMyvf+w1yc7hzNknc2Znbt/363Ves8/aa6+z1pmZ8z17r7P3UURgZmY2WE3D3QEzMxvdHCRmZtYQB4mZmTXEQWJmZg1xkJiZWUMcJGZm1hAHiY0Ikn5T0tr99Fgh6ZghauuXJP1A0rOSvjgUbZZJ0smSNg53P2xscZDYXiStl/RiemHcKulHkv5MUql/KxHxw4h4dUU/3jyYtiTNSmHxXLqtl7RoEO2cJen2gmrnAE8Ch0TExwfT39FI0m2S/mS4+5GX+723DHdfxhs/4VbNOyLiJklTgN8GvgT8BvD+4e3WPjs0InokvQG4WdLqiPjvIX6Mo4EHYhBn9kpqiYiewT6wJAGKiL7BtmE2JCLCN99234D1wJsryuYCfcBr0v1JwD8AjwG/AP4JmJzWnQxsBD4OPAFsBt6fa+utwAPAs8Am4BP57dLytenxXgSeAz4J/CfwoYp+/QT4vSpjmAUE0JIruyv3WAEck5anANcA3cCjwKfJ9tR/BdgO9KY+bK3yOF8HdgE7U503p+fmMuDxdLsMmFTx3PwV8HPg2iptngX8f+ByYBvwEDAvt/42YHGq8yJwDHAk0AE8BXQBH8jVn5z6+XR63v+y/3mufC5yY/pc7v4CYDXwDPAzYH56/N70/DwHXD7A39JJwI+ArcAG4Kxaz3ladyHwjYF+l2n8f5vG/yzwPeDwtO6xVPe5dHtDen6+n57LJ4Hlw/0/NhZvw94B30bWjSpBksofA/48LV+aXrgOAw4GvgN8Pq07GegBLgImkAXHC8DUtH4z8JtpeSpwQm67jQP1A/gDYFXu/muBLcDEKn3d/eIDCHhj6sO8tD4fJNcA/5HGMQv4KXB2WncWcHvB81X5wnsRcAdwBNCaXkj/tuK5uZgscCZXae+sVOej6fn7w/QieFhaf1v6XRyXxjcB+AFwJXAAcDzZC/SbUv0vAD9Mv6uZwP3UGSRkbyC2Ab9LFq7TgWNz/fiTGs/L0WQv9GekPk4Djq/jOb+Q4iD5GfAqspC8DfhCtbqp7DrgU6n/BwAnDff/2Fi8eY7E6vU4cFg6nHIO8NGIeCoingX+DliYq7sLuCgidkXESrJ3h6/OrWuTdEhEPB0R99T5+B3AqyTNSfffS/bucmeNbZ4ke5f+NWBRRNycXympOfX7/Ih4NiLWA19MbQ/Wu8nG/kREdAOfrWivD/hMROyIiBcHaOMJ4LL0/C0H1gJvy63/ekSsieyw2MvJgvKvImJ7RKxO431fqvsHwOL0u9oAfHkfxnI2cHVE3BgRfRGxKSIeqnPbPwJuiojr0ji2RMTqIXrO/zkifpqev+vJwnMgu8hC7cj0/BTNedkgOEisXtPJXpRbgZcBd6fJ+K3Af6fyflti72P/LwAHpeXfJ9tLeVTS99P8RaGI2A4sB96TJv7PIDsEVsvhETE1In4lIqq9gB5O9m750VzZo2msg3VklfaOzN3vTmOpZVNE5OdcKtvYUPF4/YGerz89t35Dxbp6zSR79z8YA207FM/5z3PL+b+taj5Jtld6p6Q1kv54Hx7H6uQgsUKSfp3sH/12snf5LwLHRcSh6TYlImr9M+8WEXdFxAKyQz//TvaOsmrVKmVLyd7xzwNeiIj/2cehVHqSPe9Y+x1FNnczUB+KPF6lvcdz9+tpc3ra86unjf49xYMr6vePYTPZi3p+Xd4
LZG8M+r08t7wB+OUB+lg0joG2LXrOn6/RnyIv6VNE/DwiPhARRwJ/Clw5VB/9tj0cJDYgSYdIejuwjOy49X2RfULoq8Clko5I9aZLOqWO9iZKerekKRGxi2wCd6BPHP0CeGW+IAVHH9mhkKK9kUIR0UsWZIslHSzpaOBjwDdyfZghaeI+NHsd8GlJrZIOBy7ItVevI4APS5og6XSyif+VA4xhA9k8zOclHSDp18gOSfU/5vXA+ZKmSpoBfKiiidXAH0lqljSf7FN6/a4C3i9pnqSm9Hs+Nq17ye+nwjeBN0v6A0ktkqZJOr6O53w18FuSjkqfGjy/9lO1l26yv4/d/ZJ0eho3ZB84CAb+m7NBcpBYNd+R9CzZu8pPAZew90d//4rs00F3SHoGuIk9cyBF3gusT9v9GdkeRjWfJ3tB3irpE7nya4BfZd9fnAfyIbJ3wevI9ri+BVyd1t0CrAF+LunJOtv7HNBJ9omy+4B7Utm+WAXMIXv3vhg4LSK21Kh/BtlE8+PAt8nmYG5K6z5LdujoEbJPOFUG8EeAd5B9surdZHuJAETEnWS/90vJJt2/z549iS8Bp0l6WtJLDhtGxGNkhzA/TnZIdDXZBySgxnMeETeSHcL8CXA38N0a4658zBdIn2hLfzevB34dWCXpObJ5to9ExLp627T6aO9DsWYjm6T3AedExEnD3ZcySDqL7NNQY3J8NjZ5j8RGDUkvAz4ILBnuvpjZHg4SGxXSHEw32bH5bw1zd8wsx4e2zMysId4jMTOzhoyLizYefvjhMWvWrOHuhpnZqHL33Xc/GRGtRfXGRZDMmjWLzs7O4e6GmdmoIqmuKyH40JaZmTXEQWJmZg1xkJiZWUMcJGZm1hAHiZmZNcRBYmZmDXGQmJlZQxwkNSz90Xq+8+PHiyuamY1jDpIavnHHo/zX/ZuHuxtmZiOag8TMzBriIDEzs4Y4SMzMrCGlBomk+ZLWSuqStKjK+kmSlqf1qyTNSuXTJN0q6TlJlw/Qdoek+8vsP4C/rsXMrLbSgkRSM3AFcCrQBpwhqa2i2tnA0xFxDHApcHEq3w78DfCJAdr+v8BzZfR778cp+xHMzEa/MvdI5gJdEbEuInYCy4AFFXUWAEvT8gpgniRFxPMRcTtZoOxF0kHAx4DPldd1MzOrV5lBMh3YkLu/MZVVrRMRPcA2YFpBu38LfBF4oVYlSedI6pTU2d3dvS/9NjOzfTCqJtslHQ/8ckR8u6huRCyJiPaIaG9tLfyCrxrtDHpTM7Nxocwg2QTMzN2fkcqq1pHUAkwBttRo8w1Au6T1wO3AqyTdNkT9fQnhSRIzsyJlBsldwBxJsyVNBBYCHRV1OoAz0/JpwC0RA+8DRMQ/RsSRETELOAn4aUScPOQ9NzOzupX2ne0R0SPpPOAGoBm4OiLWSLoI6IyIDuAq4FpJXcBTZGEDQNrrOASYKOldwFsi4oGy+mtmZoNTWpAARMRKYGVF2QW55e3A6QNsO6ug7fXAaxrupJmZNWRUTbYPh8Cz7WZmtThIavAJiWZmxRwkZmbWEAeJmZk1xEFiZmYNcZAU8JntZma1OUjMzKwhDhIzM2uIg8TMzBriIDEzs4Y4SAp4rt3MrDYHSQ3yqe1mZoUcJGZm1hAHiZmZNcRBUsAnJJqZ1eYgqcEzJGZmxRwkZmbWEAeJmZk1pNQgkTRf0lpJXZIWVVk/SdLytH6VpFmpfJqkWyU9J+nyXP2XSfpPSQ9JWiPpC2X238zMipUWJJKagSuAU4E24AxJbRXVzgaejohjgEuBi1P5duBvgE9UafofIuJY4HXAGyWdWkb/9/Bsu5lZLWXukcwFuiJiXUTsBJYBCyrqLACWpuUVwDxJiojnI+J2skDZLSJeiIhb0/JO4B5gRlkD8PmIZmbFygyS6cCG3P2NqaxqnYjoAbYB0+ppXNKhwDuAmxvuqZmZDdqonGyX1AJcB3w5ItYNUOccSZ2SOru7u/dvB83MxpEyg2QTMDN
3f0Yqq1onhcMUYEsdbS8BHo6IywaqEBFLIqI9ItpbW1v3qeNmZla/MoPkLmCOpNmSJgILgY6KOh3AmWn5NOCWiNrnkkv6HFng/MUQ97cqn9luZlZbS1kNR0SPpPOAG4Bm4OqIWCPpIqAzIjqAq4BrJXUBT5GFDQCS1gOHABMlvQt4C/AM8CngIeCedHXeyyPia2WMwZPtZmbFSgsSgIhYCaysKLsgt7wdOH2AbWcN0Kxf3s3MRpBROdluZmYjh4OkgKdIzMxqc5DUIB9FMzMr5CAxM7OGOEjMzKwhDhIzM2uIg6RAwfmRZmbjnoOkBp+QaGZWzEFiZmYNcZCYmVlDHCRmZtYQB0kBT7WbmdXmIKnBc+1mZsUcJGZm1hAHiZmZNcRBYmZmDXGQFPCJ7WZmtTlIavGp7WZmhRwkZmbWkFKDRNJ8SWsldUlaVGX9JEnL0/pVkmal8mmSbpX0nKTLK7Y5UdJ9aZsvS95tMDMbTqUFiaRm4ArgVKANOENSW0W1s4GnI+IY4FLg4lS+Hfgb4BNVmv5H4APAnHSbP/S938NTJGZmtZW5RzIX6IqIdRGxE1gGLKioswBYmpZXAPMkKSKej4jbyQJlN0mvAA6JiDsiu777NcC7yhqAd3XMzIqVGSTTgQ25+xtTWdU6EdEDbAOmFbS5saBNACSdI6lTUmd3d/c+dt3MzOo1ZifbI2JJRLRHRHtra+twd8fMbMwqM0g2ATNz92eksqp1JLUAU4AtBW3OKGjTzMz2ozKD5C5gjqTZkiYCC4GOijodwJlp+TTglqjx3bYRsRl4RtLr06e13gf8x9B3fa/HLLN5M7NRr6WshiOiR9J5wA1AM3B1RKyRdBHQGREdwFXAtZK6gKfIwgYASeuBQ4CJkt4FvCUiHgA+CHwdmAz8V7qVwh8sNjMrVlqQAETESmBlRdkFueXtwOkDbDtrgPJO4DVD10szM2vEmJ1sNzOz/cNBYmZmDXGQmJlZQxwkNXiu3cysmIPEzMwa4iAxM7OGOEgK+HxEM7PaHCQ1+KtOzMyKOUjMzKwhDhIzM2uIg8TMzBriICkQ/rJdM7OaHCQ1eKrdzKyYg8TMzBriIDEzs4bUFSSS/k3S2yQ5eMzMbC/1BsOVwB8BD0v6gqRXl9inEcVntpuZ1VZXkETETRHxbuAEYD1wk6QfSXq/pAlldnA4+cR2M7NidR+qkjQNOAv4E+Be4EtkwXJjjW3mS1orqUvSoirrJ0lantavkjQrt+78VL5W0im58o9KWiPpfknXSTqg3jGYmdnQq3eO5NvAD4GXAe+IiHdGxPKI+BBw0ADbNANXAKcCbcAZktoqqp0NPB0RxwCXAhenbduAhcBxwHzgSknNkqYDHwbaI+I1QHOqZ2Zmw6TePZKvRkRbRHw+IjZDtjcBEBHtA2wzF+iKiHURsRNYBiyoqLMAWJqWVwDzlF0pcQGwLCJ2RMQjQFdqD6AFmCyphSzYHq9zDGZmVoJ6g+RzVcr+p2Cb6cCG3P2NqaxqnYjoAbYB0wbaNiI2Af8APAZsBrZFxPeqPbikcyR1Surs7u4u6OrAPNluZlZbzSCR9HJJJ5LtAbxO0gnpdjLZ3sB+JWkq2d7KbOBI4EBJ76lWNyKWRER7RLS3trYO7vF8bruZWaGWgvWnkE2wzwAuyZU/C/x1wbabgJm5+zNSWbU6G9OhqinAlhrbvhl4JCK6ITu/Bfg/wDcK+mJmZiWpGSQRsRRYKun3I+Jf97Htu4A5kmaThcBCsnNR8jqAM8kOk50G3BIRIakD+JakS8j2POYAdwJ9wOslvQx4EZgHdO5jv8zMbAjVDBJJ74mIbwCzJH2scn1EXFJls/51PZLOA24g+3TV1RGxRtJFQGdEdABXAddK6gKeIn0CK9W7HngA6AHOjYheYJWkFcA9qfxeYMk+j3of+Oq/Zma1FR3aOjD9rPoR3yIRsRJYWVF2QW55O3D6ANsuBhZXKf8M8JnB9GefeYrEzKxQ0aGtr6S
fn90/3TEzs9Gm3hMS/17SIZImSLpZUvdAn5YyM7Pxpd7zSN4SEc8Abye71tYxwF+W1SkzMxs96g2S/kNgbwP+JSK2ldSfEccnJJqZ1VY02d7vu5IeIvvI7Z9LagW2l9etkcFz7WZmxeq9jPwishP/2iNiF/A8L71ulpmZjUP17pEAHEt2Pkl+m2uGuD9mZjbK1BUkkq4FfhlYDfSm4sBBYmY27tW7R9IOtEWMv6nncTdgM7N9VO+ntu4HXl5mR0Yif9WumVmxevdIDgcekHQnsKO/MCLeWUqvzMxs1Kg3SC4ssxNmZjZ61RUkEfF9SUcDcyLipnQZ9+ZyuzZCeJLEzKymeq+19QGy71T/SiqaDvx7WZ0aKfwNiWZmxeqdbD8XeCPwDEBEPAwcUVanzMxs9Kg3SHZExM7+O+mkRB/0MTOzuoPk+5L+Gpgs6XeBfwG+U163zMxstKg3SBYB3cB9wJ+Sfevhp8vq1Ejir9o1M6ut3os29pFNrn8wIk6LiK/Wc5a7pPmS1krqkrSoyvpJkpan9askzcqtOz+Vr5V0Sq78UEkrJD0k6UFJb6hnDIPhExLNzIrVDBJlLpT0JLAWWJu+HfGCWtulbZuBK4BTgTbgDEltFdXOBp6OiGOAS4GL07ZtwELgOGA+cGVqD+BLwH9HxLHAa4EH6xuqmZmVoWiP5KNkn9b69Yg4LCIOA34DeKOkjxZsOxfoioh1aaJ+GS+99PwCYGlaXgHMk6RUviwidkTEI0AXMFfSFOC3gKsAImJnRGyta6RmZlaKoiB5L3BGejEHICLWAe8B3lew7XRgQ+7+xlRWtU5E9ADbgGk1tp1NNlfzz5LulfQ1SQdWe3BJ50jqlNTZ3d1d0FUzMxusoiCZEBFPVhZGRDcwoZwu1dQCnAD8Y0S8juwLtl4y9wIQEUsioj0i2ltbWwf9gOPvesdmZvumKEh2DnIdwCZgZu7+jFRWtU46N2UKsKXGthuBjRGxKpWvIAuWUniy3cysWFGQvFbSM1VuzwK/WrDtXcAcSbMlTSSbPO+oqNMBnJmWTwNuSZ8G6wAWpk91zQbmAHdGxM+BDZJenbaZBzxQ10jNzKwUNS/aGBGDvjBjRPRIOg+4gewCj1dHxBpJFwGdEdFBNml+raQu4CmysCHVu54sJHqAcyOi/5sZPwR8M4XTOuD9g+2jmZk1bl++s32fRcRKspMX82UX5Ja3A6cPsO1iYHGV8tVk39i4X3iKxMystnrPbB+XfPVfM7NiDhIzM2uIg8TMzBriIDEzs4Y4SArUcW1KM7NxzUFSg09INDMr5iAxM7OGOEjMzKwhDhIzM2uIg6SAp9rNzGpzkJiZWUMcJGZm1hAHiZmZNcRBYmZmDXGQFPCJ7WZmtTlIapBPbTczK+QgMTOzhjhIzMysIaUGiaT5ktZK6pK0qMr6SZKWp/WrJM3KrTs/la+VdErFds2S7pX03TL7Dz4h0cysSGlBIqkZuAI4FWgDzpDUVlHtbODpiDgGuBS4OG3bBiwEjgPmA1em9vp9BHiwrL738wyJmVmxMvdI5gJdEbEuInYCy4AFFXUWAEvT8gpgnrIZ7gXAsojYERGPAF2pPSTNAN4GfK3EvpuZWZ3KDJLpwIbc/Y2prGqdiOgBtgHTCra9DPgk0FfrwSWdI6lTUmd3d/dgx2BmZgVG1WS7pLcDT0TE3UV1I2JJRLRHRHtra+t+6J2Z2fhUZpBsAmbm7s9IZVXrSGoBpgBbamz7RuCdktaTHSp7k6RvlNH53XxGoplZTWUGyV3AHEmzJU0kmzzvqKjTAZyZlk8DbonsS9I7gIXpU12zgTnAnRFxfkTMiIhZqb1bIuI9ZQ3A5yOamRVrKavhiOiRdB5wA9AMXB0RayRdBHRGRAdwFXCtpC7gKbJwINW7HngA6AHOjYjesvpqZmaDV1qQAETESmBlRdkFueXtwOkDbLsYWFyj7duA24ain2ZmNnijarLdzMxGHgdJAU+1m5nV5iCpwXP
tZmbFHCRmZtYQB4mZmTXEQVLA5yOamdXmIKnB35BoZlbMQWJmZg1xkJiZWUMcJGZm1hAHSYHwKYlmZjU5SGrwVLuZWTEHiZmZNcRBYmZmDXGQmJlZQxwkNUjQ1zfcvTAzG9kcJDU0N4k+XyPFzKwmB0kNLU1N9PQ5SMzManGQ1NDcJHodJGZmNZUaJJLmS1orqUvSoirrJ0lantavkjQrt+78VL5W0impbKakWyU9IGmNpI+U2f+WJtHjSRIzs5pKCxJJzcAVwKlAG3CGpLaKamcDT0fEMcClwMVp2zZgIXAcMB+4MrXXA3w8ItqA1wPnVmlzyDQ3id5e75GYmdVS5h7JXKArItZFxE5gGbCgos4CYGlaXgHMU3bt9gXAsojYERGPAF3A3IjYHBH3AETEs8CDwPSyBtDSLM+RmJkVKDNIpgMbcvc38tIX/d11IqIH2AZMq2fbdBjsdcCqag8u6RxJnZI6u7u7BzUAz5GYmRUblZPtkg4C/hX4i4h4plqdiFgSEe0R0d7a2jqox/GntszMipUZJJuAmbn7M1JZ1TqSWoApwJZa20qaQBYi34yIfyul54n3SMzMipUZJHcBcyTNljSRbPK8o6JOB3BmWj4NuCUiIpUvTJ/qmg3MAe5M8ydXAQ9GxCUl9h2ACc1N7OjpLfthzMxGtZayGo6IHknnATcAzcDVEbFG0kVAZ0R0kIXCtZK6gKfIwoZU73rgAbJPap0bEb2STgLeC9wnaXV6qL+OiJVljOGACU3s6g16+4LmJl9U3sysmtKCBCC9wK+sKLsgt7wdOH2AbRcDiyvKbmc/fk3IxJZsh21Xbx/NTc3762HNzEaVUTnZvr9MbM6enh09PinRzGwgDpIaJqU9kp0OEjOzATlIapjQvOfQlpmZVecgqWGi90jMzAo5SGrYHSTeIzEzG5CDpIbdk+27HCRmZgNxkNQw9cCJADz9ws5h7omZ2cjlIKmh9aBJADz53I5h7omZ2cjlIKmh9eAsSLqfdZCYmQ3EQVLDgZNamDyh2UFiZlaDg6RA68GT6PahLTOzATlICrQePMl7JGZmNThIChxx8CQ2bX1xuLthZjZiOUgKnHDUVB7d8oLDxMxsAA6SAr/96uxrem996Ilh7omZ2cjkICkw54iDOPblB/OVH/yM53f0DHd3zMxGHAdJAUlc+M7jeHzrds771j28uNNfvWtmlucgqcPrXzmNC9/Rxq1ru3nzJd/nm6se5TnvnZiZAaCIKK9xaT7wJbLvbP9aRHyhYv0k4BrgRGAL8IcRsT6tOx84G+gFPhwRN9TTZjXt7e3R2dnZ8HhWrdvC4pUP8pON22hpEr/yikM44ahDOeaIg3jFlMm8fMoBHHnoZKa+bAKSv+PdzEY3SXdHRHthvbKCRFIz8FPgd4GNwF3AGRHxQK7OB4Ffi4g/k7QQ+L2I+ENJbcB1wFzgSOAm4FVps5ptVjNUQQIQEdzz2FZufvAX3PvYVn68cSsvVBzumtTSxCumHMAhkydw4MQWDpzUwkGTmtPPlt1nzE9oFhNbmpnY0pQtNzcxobmJ5mYxoamJ5ibR0qzsZ1P/zyaam6BJ2n2ToKlJNAmaJaRseff63fWzn3uefxBKP7PDeMqvcxiajWv1BklLiX2YC3RFxLrUoWXAAiD/or8AuDAtrwAuV/bqtQBYFhE7gEckdaX2qKPNUknixKOncuLRUwHo6wuefG4Hm7dtZ/O2F3l8a/Zz87btPLu9h+d39LDx6Rd4fmcPz+/o5fkdPaPuO+D7gyZbVgqdLITYa91LgwnS+nxYVbRXVx/2oa911hzS9uqpVn9b+79vWXtD+8ah7v4N4fMyXM/JEP/ZDWn//vPDJzGppbnOFgenzCCZDmzI3d8I/MZAdSKiR9I2YFoqv6Ni2+lpuahNACSdA5wDcNRRRw1uBHVoahJHHHIARxxyAK+deWhd2+zq7WP7rl529QY7e/rY1dvHjp4+evr62NnTR09f0NsX9PSmn31
99PQGPX1BX2Q/I7Llvj7oiyACevvLIttz6uvLlivXA/TviEZaF/ky9pQRQf8+a1a2d/3da3eX5bat2Gb34+XW1WPPFgX16m6vznp176wXV6y7b2PkOam3f/VUq79vdT4ndbdXZ7262xva/tVbsd43Jo0oM0iGVUQsAZZAdmhrmLuzlwnpEJaZ2VhQ5qvZJmBm7v6MVFa1jqQWYArZpPtA29bTppmZ7UdlBsldwBxJsyVNBBYCHRV1OoAz0/JpwC2R7f91AAslTZI0G5gD3Flnm2Zmth+VdmgrzXmcB9xA9lHdqyNijaSLgM6I6ACuAq5Nk+lPkQUDqd71ZJPoPcC5EdELUK3NssZgZmbFSj2PZKQYyo//mpmNF/V+/NczvmZm1hAHiZmZNcRBYmZmDXGQmJlZQ8bFZLukbuDRQW5+OPDkEHZnNPCYx4/xOG6PuX5HR0RrUaVxESSNkNRZz6cWxhKPefwYj+P2mIeeD22ZmVlDHCRmZtYQB0mxJcPdgWHgMY8f43HcHvMQ8xyJmZk1xHskZmbWEAeJmZk1xEEyAEnzJa2V1CVp0XD3ZzAkXS3pCUn358oOk3SjpIfTz6mpXJK+nMb7E0kn5LY5M9V/WNKZufITJd2XtvmyRsCXvEuaKelWSQ9IWiPpI6l8zI5b0gGS7pT04zTmz6by2ZJWpX4uT1+9QPp6huWpfJWkWbm2zk/layWdkisfkf8Pkpol3Svpu+n+mB6zpPXpb2+1pM5UNvx/29lXo/qWv5Fdov5nwCuBicCPgbbh7tcgxvFbwAnA/bmyvwcWpeVFwMVp+a3Af5F9XfTrgVWp/DBgXfo5NS1PTevuTHWVtj11BIz5FcAJaflg4KdA21ged+rHQWl5ArAq9e96YGEq/yfgz9PyB4F/SssLgeVpuS39rU8CZqf/geaR/P8AfAz4FvDddH9MjxlYDxxeUTbsf9veI6luLtAVEesiYiewDFgwzH3aZxHxA7LveclbACxNy0uBd+XKr4nMHcChkl4BnALcGBFPRcTTwI3A/LTukIi4I7K/wGtybQ2biNgcEfek5WeBB4HpjOFxp74/l+5OSLcA3gSsSOWVY+5/LlYA89I7zwXAsojYERGPAF02d1zUAAAD3UlEQVRk/wsj8v9B0gzgbcDX0n0xxsc8gGH/23aQVDcd2JC7vzGVjQW/FBGb0/LPgV9KywONuVb5xirlI0Y6fPE6snfoY3rc6RDPauAJsheGnwFbI6InVcn3c/fY0vptwDT2/bkYbpcBnwT60v1pjP0xB/A9SXdLOieVDfvfdmnfkGgjX0SEpDH5+W9JBwH/CvxFRDyTP9Q7Fscd2TeIHi/pUODbwLHD3KVSSXo78ERE3C3p5OHuz350UkRsknQEcKOkh/Irh+tv23sk1W0CZubuz0hlY8Ev0i4s6ecTqXygMdcqn1GlfNhJmkAWIt+MiH9LxWN+3AARsRW4FXgD2aGM/jeL+X7uHltaPwXYwr4/F8PpjcA7Ja0nO+z0JuBLjO0xExGb0s8nyN4wzGUk/G0P9+TRSLyR7amtI5t8659oO264+zXIscxi78n2/8feE3N/n5bfxt4Tc3em8sOAR8gm5aam5cPSusqJubeOgPGK7NjuZRXlY3bcQCtwaFqeDPwQeDvwL+w98fzBtHwue088X5+Wj2Pvied1ZJPOI/r/ATiZPZPtY3bMwIHAwbnlHwHzR8Lf9rD/EYzUG9knHn5Kdqz5U8Pdn0GO4TpgM7CL7Hjn2WTHhW8GHgZuyv0BCbgijfc+oD3Xzh+TTUJ2Ae/PlbcD96dtLiddKWGYx3wS2XHknwCr0+2tY3ncwK8B96Yx3w9ckMpfmV4YutIL7KRUfkC635XWvzLX1qfSuNaS+8TOSP5/YO8gGbNjTmP7cbqt6e/TSPjb9iVSzMysIZ4jMTOzhjhIzMysIQ4SMzNriIPEzMwa4iAxM7OGOEjMhpik3nR11vslfSedbT7Ytm6T1D6U/TMbag4Ss6H
3YkQcHxGvIbto5rnD3SGzMjlIzMr1P6QL30k6SNLNku5J3/mwIJXPkvSgpK+m7xP5nqTJ+UYkNUn6uqTPDcMYzGpykJiVRFIzMA/oSEXbgd+LiBOA3wG+mPvioDnAFRFxHLAV+P1cUy3AN4GHI+LT+6XzZvvAQWI29CanS7r3X9L7xlQu4O8k/YTsUhbT2XPJ70ciYnVavpvsGmn9vkJ2vbTFZXfcbDAcJGZD78WIOB44miw8+udI3k12gcUT0/pfkF0DCmBHbvte9v6Khx8BvyPpAMxGIAeJWUki4gXgw8DHc5cufyIidkn6HbKgqcdVwErg+twl0s1GDAeJWYkiov+qvGeQzXO0S7oPeB/wUK1tK9q5hOwKv9dK8v+tjSi++q+ZmTXE72zMzKwhDhIzM2uIg8TMzBriIDEzs4Y4SMzMrCEOEjMza4iDxMzMGvK/bdT5+uVIQe8AAAAASUVORK5CYII=\n",
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#Rank 0 has to be the best-selling product, so sort by count here\n",
    "#instead of relying on an earlier cell having left the frame sorted\n",
    "productCountDf = productCountDf.sort_values(\"order_id\", ascending = False)\n",
    "#Calculate the density: each product's share of the total count (order_id holds the count)\n",
    "productCountDf[\"density\"] = productCountDf[\"order_id\"] / productCountDf[\"order_id\"].sum()\n",
    "#Calculate the rank\n",
    "productCountDf[\"rank\"] = np.arange(productCountDf.shape[0])\n",
    "plt.plot(productCountDf[\"rank\"],productCountDf[\"density\"])\n",
    "plt.title(\"Density Plot for product counts\")\n",
    "plt.xlabel(\"Rank\")\n",
    "plt.ylabel(\"Density\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It is hard to interpret much from this graph because of the wide range of ranks (0-50000), so we switch to a logarithmic scale for better visualization."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#Calculate log(Rank); 1 is added first because the best rank is 0 and log(0) is undefined\n",
    "productCountDf[\"logRank\"] = np.log(productCountDf[\"rank\"] + 1)\n",
    "plt.title(\"Density Plot for product counts\")\n",
    "#raw string: \\l is not a valid Python escape, the backslash belongs to the math-text markup\n",
    "plt.xlabel(r\"$\\log(Rank)$\")\n",
    "plt.ylabel(\"Density\")\n",
    "plt.plot(productCountDf[\"logRank\"],productCountDf[\"density\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can see a very steep distribution, and we can perform smoothing on the sparse region of the distribution. 
The first e^6 (about 403) products define most of the distribution. Products ranked in the (e^6, e^12) range are not significant, since their density is very low."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Orders by day of the week"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#orderDf has one row per order, so counting the order_dow values directly gives the\n",
    "#number of orders placed on each day; no per-order grouping is needed first\n",
    "ordersPerDay = orderDf[\"order_dow\"].value_counts().sort_index()\n",
    "\n",
    "#x and y are passed by keyword because current seaborn no longer accepts them positionally\n",
    "sns.barplot(x=ordersPerDay.index, y=ordersPerDay.values)\n",
    "plt.ylabel('Number of orders', fontsize=13)\n",
    "plt.xlabel('Days of order in a week', fontsize=13)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of unique customers in the whole dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "206209"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "orderDf[\"user_id\"].nunique()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}