def _random_date(start, date_count):
    """This generator yields random dates from a fixed window after ``start``

    Every value is ``start`` plus a whole number of days drawn with
    ``randrange(42)`` (0 to 41 inclusive). Each draw is independent and is
    always offset from ``start`` itself, not from the previous value.

    Args:
        start (date object): the base date
        date_count (int): number of dates to be generated
    Yields:
        ``start`` shifted forward by a random ``datetime.timedelta``

    """
    remaining = date_count
    while remaining > 0:
        yield start + datetime.timedelta(days=randrange(42))
        remaining -= 1


def generate_sample_data(row_count=100, seed=None):
    """This function generates a random transaction dataset

    The frame has the columns ``Serial No``, ``Date``, ``User ID``,
    ``Product ID``, ``Quantity Purchased``, ``Price`` and ``User Type``.
    After generation, ``int(sqrt(row_count))`` rounds of corruption are
    applied; each round overwrites one randomly chosen row per column with a
    missing-value marker: ``NaN`` for Price / User Type / Date, ``0`` for
    Product ID, ``-1`` for Serial No and ``-101`` for User ID.

    Args:
        row_count (int): number of rows for the dataframe
        seed (int, optional): when given, seeds both the ``random`` module
            and NumPy's global RNG before generating, which makes the
            returned frame reproducible. Left as None, the current global
            RNG state is used.
    Returns:
        a pandas dataframe

    """
    if seed is not None:
        # both RNG families are used below, so both have to be seeded
        random.seed(seed)
        np.random.seed(seed)

    # sentinels
    start_date = datetime.datetime(2016, 1, 1, 13)
    serial_number_sentinel = 1000
    user_id_sentinel = 5001
    product_id_sentinel = 101
    price_sentinel = 2000

    # ids are drawn from a pool roughly a tenth the size of the frame and then
    # repeated; np.resize cycles the pool to exactly row_count entries, so row
    # counts that are not a multiple of 10 no longer produce ragged columns
    id_pool_size = max(1, row_count // 10)
    id_upper_bound = max(row_count, 1)  # randint needs low < high

    def _repeated_ids(sentinel):
        pool = np.random.randint(0, id_upper_bound, size=id_pool_size) + sentinel
        return np.resize(np.random.permutation(pool), row_count)

    # keep the generated dates as date objects instead of round-tripping them
    # through "%d-%m-%Y" strings: re-parsing those strings month-first swapped
    # day and month and produced dates far outside the 42-day window
    dates = np.array([stamp.date() for stamp in _random_date(start_date, row_count)],
                     dtype=object)

    # object dtype so that np.nan stays a real missing value; in a '<U1'
    # string array the assignment would store the text 'n' instead
    user_types = np.random.permutation(
        [chr(random.randrange(97, 97 + 3 + 1)) for _ in range(row_count)]
    ).astype(object)

    # base list of attributes
    data_dict = {
        'Serial No': np.arange(row_count) + serial_number_sentinel,
        'Date': np.random.permutation(dates),
        'User ID': _repeated_ids(user_id_sentinel),
        'Product ID': _repeated_ids(product_id_sentinel),
        'Quantity Purchased': np.random.permutation(np.random.randint(1,
                                                                      42,
                                                                      size=row_count)),
        'Price': np.round(np.abs(np.random.randn(row_count) + 1) * price_sentinel,
                          decimals=2),
        'User Type': user_types
    }

    # introduce missing values
    # a row index is drawn directly for every column; looking up the first
    # occurrence of a randomly chosen *value* always hit row 0 for the id
    # columns and whenever the chosen value was already NaN
    missing_markers = (
        ('Price', np.nan),
        ('User Type', np.nan),
        ('Date', np.nan),
        ('Product ID', 0),
        ('Serial No', -1),
        ('User ID', -101),
    )
    for _ in range(int(np.sqrt(row_count))):
        for column, marker in missing_markers:
            data_dict[column][random.randrange(row_count)] = marker

    # create data frame
    return pd.DataFrame(data_dict)


def cleanup_column_names(df, rename_dict=None, do_inplace=True):
    """This function renames columns of a pandas dataframe
    It converts column names to snake case if rename_dict is not passed
    (or is empty): names are lower-cased and spaces become underscores.
    Args:
        df (pandas dataframe): the dataframe whose columns are renamed
        rename_dict (dict): keys represent old column names and values point to
                            newer ones
        do_inplace (bool): flag to update existing dataframe or return a new one
    Returns:
        pandas dataframe if do_inplace is set to False, None otherwise

    """
    if not rename_dict:
        rename_dict = {col: col.lower().replace(' ', '_') for col in df.columns}
    return df.rename(columns=rename_dict, inplace=do_inplace)


def expand_user_type(u_type):
    """This function maps user types to user classes
    Args:
        u_type (str): user type value
    Returns:
        (str) user_class value; 'error' for anything that is not one of
        'a', 'b', 'c' or 'd' (including missing values)

    """
    class_members = (
        (('a', 'b'), 'new'),
        (('c',), 'existing'),
        (('d',), 'loyal_existing'),
    )
    for members, user_class in class_members:
        if u_type in members:
            return user_class
    return 'error'


# ---- Prepare the dataset ----
df = generate_sample_data(row_count=1000)
cleanup_column_names(df)

# the generator emits date objects (and NaN markers); normalise to datetime64
df['date'] = pd.to_datetime(df['date'])

df['user_class'] = df['user_type'].map(expand_user_type)

# week number of the purchase; rows with a missing date get week 0
df['purchase_week'] = df['date'].map(lambda dt: 0 if pd.isnull(dt) else dt.week)

# drop rows without a date, then fill missing prices with the mean price of
# the remaining rows; assign() is used instead of an in-place fillna on a
# column selection, which is not guaranteed to write back to the frame
df = df.dropna(subset=['date'])
df = df.assign(price=df['price'].fillna(np.round(df['price'].mean(), decimals=2)))
\n", " | date | \n", "price | \n", "product_id | \n", "quantity_purchased | \n", "serial_no | \n", "user_id | \n", "user_type | \n", "user_class | \n", "purchase_week | \n", "
---|---|---|---|---|---|---|---|---|---|
1 | \n", "2016-08-02 | \n", "1469.29 | \n", "236 | \n", "35 | \n", "1001 | \n", "5045 | \n", "n | \n", "error | \n", "31 | \n", "
4 | \n", "2016-11-02 | \n", "3380.21 | \n", "377 | \n", "39 | \n", "1004 | \n", "5855 | \n", "a | \n", "new | \n", "44 | \n", "
8 | \n", "2016-02-02 | \n", "2379.62 | \n", "568 | \n", "21 | \n", "1008 | \n", "5189 | \n", "n | \n", "error | \n", "5 | \n", "
12 | \n", "2016-01-25 | \n", "1972.20 | \n", "602 | \n", "34 | \n", "1012 | \n", "5335 | \n", "n | \n", "error | \n", "4 | \n", "
13 | \n", "2016-10-01 | \n", "444.75 | \n", "382 | \n", "17 | \n", "1013 | \n", "5830 | \n", "n | \n", "error | \n", "39 | \n", "