{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Wrangling\n",
    "\n",
    "This notebook presents different data wrangling techniques used commonly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import required libraries\n",
    "import random\n",
    "import datetime\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from random import randrange\n",
    "from sklearn import preprocessing\n",
    "\n",
    "from IPython.display import display\n",
    "\n",
    "pd.options.mode.chained_assignment = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def _random_date(start, date_count):\n",
    "    \"\"\"Generate random dates based at a fixed offset window from a base date.\n",
    "    Args:\n",
    "        start (datetime): the base date\n",
    "        date_count (int): number of dates to be generated\n",
    "    Yields:\n",
    "        datetime objects 0-41 days after ``start``\n",
    "\n",
    "    \"\"\"\n",
    "    while date_count > 0:\n",
    "        yield start + datetime.timedelta(days=randrange(42))\n",
    "        date_count -= 1\n",
    "\n",
    "\n",
    "def generate_sample_data(row_count=100):\n",
    "    \"\"\"Generate a random transaction dataset with injected missing/bad values.\n",
    "    Args:\n",
    "        row_count (int): number of rows for the dataframe\n",
    "    Returns:\n",
    "        a pandas dataframe\n",
    "\n",
    "    \"\"\"\n",
    "    # sentinels / base values\n",
    "    startDate = datetime.datetime(2016, 1, 1, 13)\n",
    "    serial_number_sentinel = 1000\n",
    "    user_id_sentinel = 5001\n",
    "    product_id_sentinel = 101\n",
    "    price_sentinel = 2000\n",
    "\n",
    "    # base list of attributes\n",
    "    data_dict = {\n",
    "        'Serial No': np.arange(row_count) + serial_number_sentinel,\n",
    "        'Date': np.random.permutation(pd.to_datetime([x.strftime(\"%d-%m-%Y\")\n",
    "                                                      for x in _random_date(startDate,\n",
    "                                                                            row_count)]).date\n",
    "                                      ),\n",
    "        'User ID': np.random.permutation(np.random.randint(0,\n",
    "                                                           row_count,\n",
    "                                                           size=int(row_count/10)) + user_id_sentinel).tolist()*10,\n",
    "        'Product ID': np.random.permutation(np.random.randint(0,\n",
    "                                                              row_count,\n",
    "                                                              size=int(row_count/10)) + product_id_sentinel).tolist()*10,\n",
    "        'Quantity Purchased': np.random.permutation(np.random.randint(1,\n",
    "                                                                      42,\n",
    "                                                                      size=row_count)),\n",
    "        'Price': np.round(np.abs(np.random.randn(row_count)+1)*price_sentinel,\n",
    "                          decimals=2),\n",
    "        # object dtype so np.nan assigned below stays a real NaN; a\n",
    "        # fixed-width '<U1' string array truncates 'nan' to the string 'n'\n",
    "        'User Type': np.random.permutation(np.array([chr(random.randrange(97, 97 + 3 + 1))\n",
    "                                                     for i in range(row_count)], dtype=object))\n",
    "    }\n",
    "\n",
    "    # introduce missing values and bad identifiers at random positions\n",
    "    for index in range(int(np.sqrt(row_count))):\n",
    "        data_dict['Price'][np.argmax(data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan\n",
    "        data_dict['User Type'][np.argmax(data_dict['User Type'] == random.choice(data_dict['User Type']))] = np.nan\n",
    "        data_dict['Date'][np.argmax(data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan\n",
    "        data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.choice(data_dict['Product ID']))] = 0\n",
    "        data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.choice(data_dict['Serial No']))] = -1\n",
    "        data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(data_dict['User ID']))] = -101\n",
    "\n",
    "    # create data frame\n",
    "    return pd.DataFrame(data_dict)\n",
    "\n",
    "\n",
    "def describe_dataframe(df=pd.DataFrame()):\n",
    "    \"\"\"Print descriptive stats of a dataframe.\n",
    "    Args:\n",
    "        df (dataframe): the dataframe to be analyzed\n",
    "    Returns:\n",
    "        None\n",
    "\n",
    "    \"\"\"\n",
    "    print(\"\\n\\n\")\n",
    "    print(\"*\"*30)\n",
    "    print(\"About the Data\")\n",
    "    print(\"*\"*30)\n",
    "\n",
    "    print(\"Number of rows::\", df.shape[0])\n",
    "    print(\"Number of columns::\", df.shape[1])\n",
    "    print(\"\\n\")\n",
    "\n",
    "    print(\"Column Names::\", df.columns.values.tolist())\n",
    "    print(\"\\n\")\n",
    "\n",
    "    print(\"Column Data Types::\\n\", df.dtypes)\n",
    "    print(\"\\n\")\n",
    "\n",
    "    print(\"Columns with Missing Values::\", df.columns[df.isnull().any()].tolist())\n",
    "    print(\"\\n\")\n",
    "\n",
    "    # Series.nonzero() was removed in pandas 1.0; boolean selection on the\n",
    "    # index is the supported way to list rows containing any missing value\n",
    "    missing_row_indices = df.index[df.isnull().any(axis=1)].tolist()\n",
    "    print(\"Number of rows with Missing Values::\", len(missing_row_indices))\n",
    "    print(\"\\n\")\n",
    "\n",
    "    print(\"Sample Indices with missing data::\", missing_row_indices[0:5])\n",
    "    print(\"\\n\")\n",
    "\n",
    "    print(\"General Stats::\")\n",
    "    print(df.info())\n",
    "    print(\"\\n\")\n",
    "\n",
    "    print(\"Summary Stats::\")\n",
    "    print(df.describe())\n",
    "    print(\"\\n\")\n",
    "\n",
    "    print(\"Dataframe Sample Rows::\")\n",
    "    display(df.head(5))\n",
    "\n",
    "\n",
    "def cleanup_column_names(df, rename_dict={}, do_inplace=True):\n",
    "    \"\"\"Rename columns of a pandas dataframe.\n",
    "    Converts column names to snake case if rename_dict is not passed.\n",
    "    Args:\n",
    "        rename_dict (dict): keys represent old column names and values point to\n",
    "                            newer ones\n",
    "        do_inplace (bool): flag to update existing dataframe or return a new one\n",
    "    Returns:\n",
    "        pandas dataframe if do_inplace is set to False, None otherwise\n",
    "\n",
    "    \"\"\"\n",
    "    if not rename_dict:\n",
    "        return df.rename(columns={col: col.lower().replace(' ', '_')\n",
    "                                  for col in df.columns.values.tolist()},\n",
    "                         inplace=do_inplace)\n",
    "    else:\n",
    "        return df.rename(columns=rename_dict, inplace=do_inplace)\n",
    "\n",
    "\n",
    "def expand_user_type(u_type):\n",
    "    \"\"\"Map a user type code to a user class label.\n",
    "    Args:\n",
    "        u_type (str): user type value\n",
    "    Returns:\n",
    "        (str) user_class value ('error' for unknown or missing codes)\n",
    "\n",
    "    \"\"\"\n",
    "    if u_type in ['a', 'b']:\n",
    "        return 'new'\n",
    "    elif u_type == 'c':\n",
    "        return 'existing'\n",
    "    elif u_type == 'd':\n",
    "        return 'loyal_existing'\n",
    "    else:\n",
    "        return 'error'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate a Sample Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = generate_sample_data(row_count=1000)"
   ]
  },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Describe the Dataset" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "******************************\n", "About the Data\n", "******************************\n", "Number of rows:: 1000\n", "Number of columns:: 7\n", "\n", "\n", "Column Names:: ['Date', 'Price', 'Product ID', 'Quantity Purchased', 'Serial No', 'User ID', 'User Type']\n", "\n", "\n", "Column Data Types::\n", " Date object\n", "Price float64\n", "Product ID int64\n", "Quantity Purchased int32\n", "Serial No int32\n", "User ID int64\n", "User Type object\n", "dtype: object\n", "\n", "\n", "Columns with Missing Values:: ['Date', 'Price']\n", "\n", "\n", "Number of rows with Missing Values:: 61\n", "\n", "\n", "Sample Indices with missing data:: [1, 2, 3, 9, 11]\n", "\n", "\n", "General Stats::\n", "\n", "RangeIndex: 1000 entries, 0 to 999\n", "Data columns (total 7 columns):\n", "Date 969 non-null object\n", "Price 969 non-null float64\n", "Product ID 1000 non-null int64\n", "Quantity Purchased 1000 non-null int32\n", "Serial No 1000 non-null int32\n", "User ID 1000 non-null int64\n", "User Type 1000 non-null object\n", "dtypes: float64(1), int32(2), int64(2), object(2)\n", "memory usage: 47.0+ KB\n", "None\n", "\n", "\n", "Summary Stats::\n", " Price Product ID Quantity Purchased Serial No User ID\n", "count 969.000000 1000.00000 1000.000000 1000.000000 1000.000000\n", "mean 2468.147967 618.90100 21.063000 1454.554000 5478.014000\n", "std 1657.607501 274.46151 12.170092 385.901616 340.779522\n", "min 2.600000 0.00000 1.000000 -1.000000 -101.000000\n", "25% 1086.990000 382.75000 10.000000 1227.750000 5245.000000\n", "50% 2248.270000 628.00000 21.000000 1483.500000 5412.000000\n", "75% 3543.000000 838.75000 32.000000 1744.250000 5744.500000\n", "max 8493.210000 1099.00000 41.000000 1999.000000 5992.000000\n", "\n", "\n", "Dataframe Sample 
Rows::\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatePriceProduct IDQuantity PurchasedSerial NoUser IDUser Type
02016-01-231395.65031000-101n
1NaN1352.999061910015632n
2NaN3997.326252110025240n
3NaN3681.488653510035557n
42016-01-273850.22929310045489n
\n", "
" ], "text/plain": [ " Date Price Product ID Quantity Purchased Serial No User ID \\\n", "0 2016-01-23 1395.65 0 3 1000 -101 \n", "1 NaN 1352.99 906 19 1001 5632 \n", "2 NaN 3997.32 625 21 1002 5240 \n", "3 NaN 3681.48 865 35 1003 5557 \n", "4 2016-01-27 3850.22 929 3 1004 5489 \n", "\n", " User Type \n", "0 n \n", "1 n \n", "2 n \n", "3 n \n", "4 n " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "describe_dataframe(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Rename Columns" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataframe columns:\n", "['Date', 'Price', 'Product ID', 'Quantity Purchased', 'Serial No', 'User ID', 'User Type']\n" ] } ], "source": [ "print(\"Dataframe columns:\\n{}\".format(df.columns.tolist()))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "cleanup_column_names(df)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataframe columns:\n", "['date', 'price', 'product_id', 'quantity_purchased', 'serial_no', 'user_id', 'user_type']\n" ] } ], "source": [ "print(\"Dataframe columns:\\n{}\".format(df.columns.tolist()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sort Rows on defined attributes" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_type
5022016-03-026168.6662522-15240b
7272016-01-285483.5544511-15016c
6802016-01-135163.7218541-15679d
5992016-01-014903.915512-15688d
752016-07-024584.9753412-15351a
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "502 2016-03-02 6168.66 625 22 -1 5240 \n", "727 2016-01-28 5483.55 445 11 -1 5016 \n", "680 2016-01-13 5163.72 185 41 -1 5679 \n", "599 2016-01-01 4903.91 551 2 -1 5688 \n", "75 2016-07-02 4584.97 534 12 -1 5351 \n", "\n", " user_type \n", "502 b \n", "727 c \n", "680 d \n", "599 d \n", "75 a " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df.sort_values(['serial_no', 'price'], \n", " ascending=[True, False]).head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Rearrange Columns in a Dataframe" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
serial_nodateuser_iduser_typeproduct_idquantity_purchasedprice
010002016-01-23-101n031395.65
11001NaN5632n906191352.99
21002NaN5240n625213997.32
31003NaN5557n865353681.48
410042016-01-275489n92933850.22
\n", "
" ], "text/plain": [ " serial_no date user_id user_type product_id quantity_purchased \\\n", "0 1000 2016-01-23 -101 n 0 3 \n", "1 1001 NaN 5632 n 906 19 \n", "2 1002 NaN 5240 n 625 21 \n", "3 1003 NaN 5557 n 865 35 \n", "4 1004 2016-01-27 5489 n 929 3 \n", "\n", " price \n", "0 1395.65 \n", "1 1352.99 \n", "2 3997.32 \n", "3 3681.48 \n", "4 3850.22 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df[['serial_no','date','user_id','user_type',\n", " 'product_id','quantity_purchased','price']].head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filtering Columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using Column Index" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ 3 19 21 35 3 6 36 14 25 32]\n" ] } ], "source": [ "# print 10 values from column at index 3\n", "print(df.iloc[:,3].values[0:10])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using Column Name" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ 3 19 21 35 3 6 36 14 25 32]\n" ] } ], "source": [ "# print 10 values of quantity purchased\n", "print(df.quantity_purchased.values[0:10])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using Column Datatype" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ 1395.65 1352.99 3997.32 3681.48 3850.22 786.27 2725.81 4857.7\n", " 2884.57 3138.58]\n" ] } ], "source": [ "# print 10 values of columns with data type float\n", "print(df.select_dtypes(include=['float64']).values[:10,0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filtering Rows" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Select specific rows" ] }, { "cell_type": "code", "execution_count": 13, "metadata": 
{}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_type
102016-04-023027.868562410105381n
5012016-01-212017.56906515015632d
202016-03-021920.211723010205865n
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "10 2016-04-02 3027.86 856 24 1010 5381 \n", "501 2016-01-21 2017.56 906 5 1501 5632 \n", "20 2016-03-02 1920.21 172 30 1020 5865 \n", "\n", " user_type \n", "10 n \n", "501 d \n", "20 n " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df.iloc[[10,501,20]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Exclude Specific Row indices" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_type
1NaN1352.999061910015632n
2NaN3997.326252110025240n
3NaN3681.488653510035557n
42016-01-273850.22929310045489n
52016-01-28786.27300610055262n
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "1 NaN 1352.99 906 19 1001 5632 \n", "2 NaN 3997.32 625 21 1002 5240 \n", "3 NaN 3681.48 865 35 1003 5557 \n", "4 2016-01-27 3850.22 929 3 1004 5489 \n", "5 2016-01-28 786.27 300 6 1005 5262 \n", "\n", " user_type \n", "1 n \n", "2 n \n", "3 n \n", "4 n \n", "5 n " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df.drop([0,24,51], axis=0).head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Conditional Filtering" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_type
3NaN3681.488653510035557n
62016-01-222725.815723610065661n
9NaN3138.585563210095332n
11NaN2780.0382937-15307n
122016-01-184192.7610992710125824n
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "3 NaN 3681.48 865 35 1003 5557 \n", "6 2016-01-22 2725.81 572 36 1006 5661 \n", "9 NaN 3138.58 556 32 1009 5332 \n", "11 NaN 2780.03 829 37 -1 5307 \n", "12 2016-01-18 4192.76 1099 27 1012 5824 \n", "\n", " user_type \n", "3 n \n", "6 n \n", "9 n \n", "11 n \n", "12 n " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df[df.quantity_purchased>25].head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Offset from top of the dataframe" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_type
1002016-07-013151.1037938-15405c
1012016-01-25235.779061911015632d
1022016-01-28429.296253311025240d
1032016-01-316877.388653511035557c
1042016-10-011895.899292911045489c
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "100 2016-07-01 3151.10 379 38 -1 5405 \n", "101 2016-01-25 235.77 906 19 1101 5632 \n", "102 2016-01-28 429.29 625 33 1102 5240 \n", "103 2016-01-31 6877.38 865 35 1103 5557 \n", "104 2016-10-01 1895.89 929 29 1104 5489 \n", "\n", " user_type \n", "100 c \n", "101 d \n", "102 d \n", "103 c \n", "104 c " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df[100:].head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Offset from bottom of the dataframe" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_type
9902016-01-133366.486111619905039d
9912016-10-02398.647752419915496a
9922016-07-014910.837432419925245d
9932016-11-021172.383001219935233b
9942016-01-171528.267541319945112c
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "990 2016-01-13 3366.48 611 16 1990 5039 \n", "991 2016-10-02 398.64 775 24 1991 5496 \n", "992 2016-07-01 4910.83 743 24 1992 5245 \n", "993 2016-11-02 1172.38 300 12 1993 5233 \n", "994 2016-01-17 1528.26 754 13 1994 5112 \n", "\n", " user_type \n", "990 d \n", "991 a \n", "992 d \n", "993 b \n", "994 c " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df[-10:].head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TypeCasting/Data Type Conversion" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "date datetime64[ns]\n", "price float64\n", "product_id int64\n", "quantity_purchased int32\n", "serial_no int32\n", "user_id int64\n", "user_type object\n", "dtype: object\n" ] } ], "source": [ "df['date'] = pd.to_datetime(df.date)\n", "# compare dtypes of the original df with this one\n", "print(df.dtypes)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Apply/Map Usage" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Map : Create a derived attribute using map" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_typeuser_class
9952016-01-19404.667134019955976cexisting
9962016-02-01236.638084019965950bnew
9972016-08-013413.149653319975264bnew
9982016-10-024820.056783619985955bnew
9992016-01-153906.33551319995688anew
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "995 2016-01-19 404.66 713 40 1995 5976 \n", "996 2016-02-01 236.63 808 40 1996 5950 \n", "997 2016-08-01 3413.14 965 33 1997 5264 \n", "998 2016-10-02 4820.05 678 36 1998 5955 \n", "999 2016-01-15 3906.33 551 3 1999 5688 \n", "\n", " user_type user_class \n", "995 c existing \n", "996 b new \n", "997 b new \n", "998 b new \n", "999 a new " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df['user_class'] = df['user_type'].map(expand_user_type)\n", "display(df.tail())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Apply: Using apply to get attribute ranges" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "price 8490.61\n", "product_id 1099.00\n", "quantity_purchased 40.00\n", "serial_no 2000.00\n", "user_id 6093.00\n", "dtype: float64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df.select_dtypes(include=[np.number]).apply(lambda x: \n", " x.max()- x.min()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Applymap: Extract week from date" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df['purchase_week'] = df[['date']].applymap(lambda dt:dt.week \n", " if not pd.isnull(dt.week) \n", " else 0)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_typeuser_classpurchase_week
02016-01-231395.65031000-101nerror3
1NaT1352.999061910015632nerror0
2NaT3997.326252110025240nerror0
3NaT3681.488653510035557nerror0
42016-01-273850.22929310045489nerror4
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "0 2016-01-23 1395.65 0 3 1000 -101 \n", "1 NaT 1352.99 906 19 1001 5632 \n", "2 NaT 3997.32 625 21 1002 5240 \n", "3 NaT 3681.48 865 35 1003 5557 \n", "4 2016-01-27 3850.22 929 3 1004 5489 \n", "\n", " user_type user_class purchase_week \n", "0 n error 3 \n", "1 n error 0 \n", "2 n error 0 \n", "3 n error 0 \n", "4 n error 4 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Missing Values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Drop Rows with missing dates" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_typeuser_classpurchase_week
02016-01-231395.65031000-101nerror3
42016-01-273850.22929310045489nerror4
52016-01-28786.27300610055262nerror4
62016-01-222725.815723610065661nerror3
72016-10-014857.7010111410075412nerror39
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "0 2016-01-23 1395.65 0 3 1000 -101 \n", "4 2016-01-27 3850.22 929 3 1004 5489 \n", "5 2016-01-28 786.27 300 6 1005 5262 \n", "6 2016-01-22 2725.81 572 36 1006 5661 \n", "7 2016-10-01 4857.70 1011 14 1007 5412 \n", "\n", " user_type user_class purchase_week \n", "0 n error 3 \n", "4 n error 4 \n", "5 n error 4 \n", "6 n error 3 \n", "7 n error 39 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_dropped = df.dropna(subset=['date'])\n", "display(df_dropped.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fill Missing Price values with mean price" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df_dropped['price'].fillna(value=np.round(df.price.mean(),decimals=2),\n", " inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fill Missing user_type values with value from previous row (forward fill) " ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df_dropped['user_type'].fillna(method='ffill',inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fill Missing user_type values with value from next row (backward fill)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df_dropped['user_type'].fillna(method='bfill',inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Duplicates" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Drop Duplicate serial_no rows" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_typeuser_classpurchase_week
752016-07-024584.9753412-15351anew26
972016-01-01743.379653-15264cexisting53
1002016-07-013151.1037938-15405cexisting26
1142016-05-01337.447366-15443dloyal_existing17
1452016-04-02682.7299426-15412anew13
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "75 2016-07-02 4584.97 534 12 -1 5351 \n", "97 2016-01-01 743.37 965 3 -1 5264 \n", "100 2016-07-01 3151.10 379 38 -1 5405 \n", "114 2016-05-01 337.44 736 6 -1 5443 \n", "145 2016-04-02 682.72 994 26 -1 5412 \n", "\n", " user_type user_class purchase_week \n", "75 a new 26 \n", "97 c existing 53 \n", "100 c existing 26 \n", "114 d loyal_existing 17 \n", "145 a new 13 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Shape of df=(969, 9)\n" ] } ], "source": [ "# sample duplicates\n", "display(df_dropped[df_dropped.duplicated(subset=['serial_no'])].head())\n", "print(\"Shape of df={}\".format(df_dropped.shape))" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df_dropped.drop_duplicates(subset=['serial_no'],inplace=True)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_typeuser_classpurchase_week
02016-01-231395.65031000-101nerror3
42016-01-273850.22929310045489nerror4
52016-01-28786.27300610055262nerror4
62016-01-222725.815723610065661nerror3
72016-10-014857.7010111410075412nerror39
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "0 2016-01-23 1395.65 0 3 1000 -101 \n", "4 2016-01-27 3850.22 929 3 1004 5489 \n", "5 2016-01-28 786.27 300 6 1005 5262 \n", "6 2016-01-22 2725.81 572 36 1006 5661 \n", "7 2016-10-01 4857.70 1011 14 1007 5412 \n", "\n", " user_type user_class purchase_week \n", "0 n error 3 \n", "4 n error 4 \n", "5 n error 4 \n", "6 n error 3 \n", "7 n error 39 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Shape of df=(940, 9)\n" ] } ], "source": [ "# updated dataframe\n", "display(df_dropped.head())\n", "print(\"Shape of df={}\".format(df_dropped.shape))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Remove rows which have less than 3 attributes with non-missing data" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_typeuser_classpurchase_week
02016-01-231395.65031000-101nerror3
1NaT1352.999061910015632nerror0
2NaT3997.326252110025240nerror0
3NaT3681.488653510035557nerror0
42016-01-273850.22929310045489nerror4
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "0 2016-01-23 1395.65 0 3 1000 -101 \n", "1 NaT 1352.99 906 19 1001 5632 \n", "2 NaT 3997.32 625 21 1002 5240 \n", "3 NaT 3681.48 865 35 1003 5557 \n", "4 2016-01-27 3850.22 929 3 1004 5489 \n", "\n", " user_type user_class purchase_week \n", "0 n error 3 \n", "1 n error 0 \n", "2 n error 0 \n", "3 n error 0 \n", "4 n error 4 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Shape of df=(1000, 9)\n" ] } ], "source": [ "display(df.dropna(thresh=3).head())\n", "print(\"Shape of df={}\".format(df.dropna(thresh=3).shape))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Encode Categoricals" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "One Hot Encoding using get_dummies()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_classpurchase_weekuser_type_auser_type_buser_type_cuser_type_duser_type_n
02016-01-231395.65031000-101error300001
1NaT1352.999061910015632error000001
2NaT3997.326252110025240error000001
3NaT3681.488653510035557error000001
42016-01-273850.22929310045489error400001
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "0 2016-01-23 1395.65 0 3 1000 -101 \n", "1 NaT 1352.99 906 19 1001 5632 \n", "2 NaT 3997.32 625 21 1002 5240 \n", "3 NaT 3681.48 865 35 1003 5557 \n", "4 2016-01-27 3850.22 929 3 1004 5489 \n", "\n", " user_class purchase_week user_type_a user_type_b user_type_c \\\n", "0 error 3 0 0 0 \n", "1 error 0 0 0 0 \n", "2 error 0 0 0 0 \n", "3 error 0 0 0 0 \n", "4 error 4 0 0 0 \n", "\n", " user_type_d user_type_n \n", "0 0 1 \n", "1 0 1 \n", "2 0 1 \n", "3 0 1 \n", "4 0 1 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(pd.get_dummies(df,columns=['user_type']).head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Label Mapping" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_typeuser_classpurchase_weekencoded_user_type
9952016-01-19404.667134019955976cexisting32.0
9962016-02-01236.638084019965950bnew51.0
9972016-08-013413.149653319975264bnew311.0
9982016-10-024820.056783619985955bnew391.0
9992016-01-153906.33551319995688anew20.0
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "995 2016-01-19 404.66 713 40 1995 5976 \n", "996 2016-02-01 236.63 808 40 1996 5950 \n", "997 2016-08-01 3413.14 965 33 1997 5264 \n", "998 2016-10-02 4820.05 678 36 1998 5955 \n", "999 2016-01-15 3906.33 551 3 1999 5688 \n", "\n", " user_type user_class purchase_week encoded_user_type \n", "995 c existing 3 2.0 \n", "996 b new 5 1.0 \n", "997 b new 31 1.0 \n", "998 b new 39 1.0 \n", "999 a new 2 0.0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "type_map={'a':0,'b':1,'c':2,'d':3,np.NAN:-1}\n", "df['encoded_user_type'] = df.user_type.map(type_map)\n", "display((df.tail()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Random Sampling data from DataFrame" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_typeuser_classpurchase_weekencoded_user_type
1022016-01-28429.296253311025240dloyal_existing43.0
4352016-01-251068.7310674114355943bnew41.0
8602016-08-025952.593203918605024cexisting312.0
2702016-12-011158.084052512705759cexisting482.0
1062016-08-022207.995724111065661bnew311.0
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "102 2016-01-28 429.29 625 33 1102 5240 \n", "435 2016-01-25 1068.73 1067 41 1435 5943 \n", "860 2016-08-02 5952.59 320 39 1860 5024 \n", "270 2016-12-01 1158.08 405 25 1270 5759 \n", "106 2016-08-02 2207.99 572 41 1106 5661 \n", "\n", " user_type user_class purchase_week encoded_user_type \n", "102 d loyal_existing 4 3.0 \n", "435 b new 4 1.0 \n", "860 c existing 31 2.0 \n", "270 c existing 48 2.0 \n", "106 b new 31 1.0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df.sample(frac=0.2, replace=True, random_state=42).head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Normalizing Numeric Values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Normalize price values using **Min-Max Scaler**" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df_normalized = df.dropna().copy()\n", "min_max_scaler = preprocessing.MinMaxScaler()\n", "np_scaled = min_max_scaler.fit_transform(df_normalized['price'].values.reshape(-1,1))\n", "df_normalized['price'] = np_scaled.reshape(-1,1)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_typeuser_classpurchase_weekencoded_user_type
232016-01-310.0805918051210235042bnew41.0
322016-01-230.0928688003610325946dloyal_existing33.0
332016-05-010.102266538510335078dloyal_existing173.0
342016-08-020.08018710693110345202dloyal_existing313.0
352016-01-010.17136210673810355943anew530.0
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "23 2016-01-31 0.080591 805 12 1023 5042 \n", "32 2016-01-23 0.092868 800 36 1032 5946 \n", "33 2016-05-01 0.102266 538 5 1033 5078 \n", "34 2016-08-02 0.080187 1069 31 1034 5202 \n", "35 2016-01-01 0.171362 1067 38 1035 5943 \n", "\n", " user_type user_class purchase_week encoded_user_type \n", "23 b new 4 1.0 \n", "32 d loyal_existing 3 3.0 \n", "33 d loyal_existing 17 3.0 \n", "34 d loyal_existing 31 3.0 \n", "35 a new 53 0.0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df_normalized.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Normalize quantity purchased values using **Robust Scaler**" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df_normalized = df.dropna().copy()\n", "robust_scaler = preprocessing.RobustScaler()\n", "rs_scaled = robust_scaler.fit_transform(df_normalized['quantity_purchased'].values.reshape(-1,1))\n", "df_normalized['quantity_purchased'] = rs_scaled.reshape(-1,1)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceproduct_idquantity_purchasedserial_nouser_iduser_typeuser_classpurchase_weekencoded_user_type
232016-01-31686.87805-0.42857110235042bnew41.0
322016-01-23791.118000.71428610325946dloyal_existing33.0
332016-05-01870.90538-0.76190510335078dloyal_existing173.0
342016-08-02683.4410690.47619010345202dloyal_existing313.0
352016-01-011457.5710670.80952410355943anew530.0
\n", "
" ], "text/plain": [ " date price product_id quantity_purchased serial_no user_id \\\n", "23 2016-01-31 686.87 805 -0.428571 1023 5042 \n", "32 2016-01-23 791.11 800 0.714286 1032 5946 \n", "33 2016-05-01 870.90 538 -0.761905 1033 5078 \n", "34 2016-08-02 683.44 1069 0.476190 1034 5202 \n", "35 2016-01-01 1457.57 1067 0.809524 1035 5943 \n", "\n", " user_type user_class purchase_week encoded_user_type \n", "23 b new 4 1.0 \n", "32 d loyal_existing 3 3.0 \n", "33 d loyal_existing 17 3.0 \n", "34 d loyal_existing 31 3.0 \n", "35 a new 53 0.0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df_normalized.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Summarization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Condition based aggregation" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean price of items purchased by user_type=a :: 2441.0280995475105\n" ] } ], "source": [ "print(\"Mean price of items purchased by user_type=a :: {}\".format(df['price'][df['user_type']=='a'].mean()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Condtion based counts" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3 172\n", "4 166\n", "2 101\n", "39 53\n", "35 50\n", "26 50\n", "22 46\n", "9 46\n", "53 45\n", "13 45\n", "31 41\n", "44 39\n", "5 39\n", "0 31\n", "17 27\n", "48 27\n", "18 22\n", "Name: purchase_week, dtype: int64\n" ] } ], "source": [ "print(df['purchase_week'].value_counts())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Group By" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Group By certain attributes" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "user_class\n", "error 565\n", 
"existing 5299\n", "loyal_existing 5211\n", "new 9988\n", "Name: quantity_purchased, dtype: int32\n" ] } ], "source": [ "print(df.groupby(['user_class'])['quantity_purchased'].sum())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Group By with different aggregate functions" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
summeancount_nonzero
user_class
error56518.22580631
existing529920.699219256
loyal_existing521121.533058242
new998821.205945471
\n", "
" ], "text/plain": [ " sum mean count_nonzero\n", "user_class \n", "error 565 18.225806 31\n", "existing 5299 20.699219 256\n", "loyal_existing 5211 21.533058 242\n", "new 9988 21.205945 471" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df.groupby(['user_class'])['quantity_purchased'].agg([np.sum,\n", " np.mean,\n", " np.count_nonzero]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Group by specific aggregate functions for each attribute" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quantity_purchasedprice
user_classuser_type
errorn402355.328710
existingc412502.277358
loyal_existingd412349.236695
newa412441.028100
b412592.225064
\n", "
" ], "text/plain": [ " quantity_purchased price\n", "user_class user_type \n", "error n 40 2355.328710\n", "existing c 41 2502.277358\n", "loyal_existing d 41 2349.236695\n", "new a 41 2441.028100\n", " b 41 2592.225064" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df.groupby(['user_class','user_type']).agg({'price':np.mean,\n", " 'quantity_purchased':np.max}))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Group by with multiple agg for each attribute" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Anaconda2\\envs\\python3\\lib\\site-packages\\pandas\\core\\groupby.py:4036: FutureWarning: using a dict with renaming is deprecated and will be removed in a future version\n", " return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quantity_purchasedprice
sumcounttotal_pricevariance_pricemean_price
user_classuser_type
errorn56531.073015.191286.4208402355.328710
existingc5299256.0615560.231692.4738992502.277358
loyal_existingd5211242.0554419.861689.9762722349.236695
newa4752229.0539467.211530.8750302441.028100
b5236242.0609172.891746.4812122592.225064
\n", "
" ], "text/plain": [ " quantity_purchased price \\\n", " sum count total_price variance_price \n", "user_class user_type \n", "error n 565 31.0 73015.19 1286.420840 \n", "existing c 5299 256.0 615560.23 1692.473899 \n", "loyal_existing d 5211 242.0 554419.86 1689.976272 \n", "new a 4752 229.0 539467.21 1530.875030 \n", " b 5236 242.0 609172.89 1746.481212 \n", "\n", " \n", " mean_price \n", "user_class user_type \n", "error n 2355.328710 \n", "existing c 2502.277358 \n", "loyal_existing d 2349.236695 \n", "new a 2441.028100 \n", " b 2592.225064 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df.groupby(['user_class','user_type']).agg({'price':{\n", " 'total_price':np.sum,\n", " 'mean_price':np.mean,\n", " 'variance_price':np.std,\n", " 'count':np.count_nonzero},\n", " 'quantity_purchased':np.sum})) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pivot Tables" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_typeabcdn
date
2016-01-011764.4271432783.2688891372.0355562394.892000NaN
2016-01-022783.7100001978.7000002249.1200002899.566667NaN
2016-01-132693.7460002589.3900003011.6100002391.913750NaN
2016-01-142456.0800001274.6233332336.7400002341.664286NaN
2016-01-152784.6350004452.0750002389.2525002019.024000NaN
2016-01-161871.7766674390.9100001474.1060003959.580000NaN
2016-01-171838.4675002715.7950002396.1675001604.277500NaN
2016-01-182287.1116674001.7937502302.7275002558.8800004192.760000
2016-01-192666.6025001599.9083331752.8833331199.605556NaN
2016-01-202931.5500003002.4833331593.8028572323.136000NaN
2016-01-212294.0500002676.7600001100.0900002637.204167NaN
2016-01-221982.8500001257.9628573056.1687502241.1216672267.086667
2016-01-232074.5800002288.3033333044.7100002620.3350001395.650000
2016-01-242643.1500002974.4685711296.0880002374.154000NaN
2016-01-252280.6628572126.7240002518.3660001378.290000NaN
2016-01-262856.3883332502.0040002032.230000856.983333NaN
2016-01-271861.8216672942.9000001988.678000NaN2465.666667
2016-01-282218.3466674202.7050002899.9433332481.795000786.270000
2016-01-292249.4050002308.1057143600.7644442495.716667NaN
2016-01-302340.6688892407.977143772.5800002442.558571NaN
2016-01-311747.9800002723.8657142928.4380002564.803333NaN
2016-02-014419.8666672494.5900003073.5737502686.650000NaN
2016-02-021486.7480002444.4550003577.7077781073.430000NaN
2016-03-012279.5900004372.2357142468.4400002574.437143NaN
2016-03-021490.2040003080.8720002099.9066672161.8671431920.210000
2016-04-012951.4528571038.9000002455.4414292187.444286NaN
2016-04-021507.8560002675.2340003138.1633332199.7320003027.860000
2016-05-013483.3633332249.2833332710.7200002948.008333NaN
2016-05-021992.6216672404.9480002845.6725002339.355000NaN
2016-06-012154.7000002748.5712501891.8900002777.006667NaN
2016-06-021924.1600001321.5490002408.6955562147.680000NaN
2016-07-012718.8683332836.8457143742.8683333811.535556538.690000
2016-07-023776.0600004228.1950002421.8450002798.5480003965.180000
2016-08-013618.5475003413.1400001708.8000002434.700000NaN
2016-08-022846.9700002320.3962503671.4881821544.168000NaN
2016-09-013104.2400003417.4885714045.3733332343.676667NaN
2016-09-021455.9440002181.5488892336.9600002016.3850003851.250000
2016-10-013133.8750001264.8100002009.4000002793.7300004184.955000
2016-10-021684.7940001824.7962502691.6980001785.662000NaN
2016-11-012104.942500NaN2520.6800002366.991429NaN
2016-11-024805.9340002694.2716671942.7666671376.7700002545.870000
2016-12-012400.4220003680.3012501640.0728572402.953333NaN
\n", "
" ], "text/plain": [ "user_type a b c d n\n", "date \n", "2016-01-01 1764.427143 2783.268889 1372.035556 2394.892000 NaN\n", "2016-01-02 2783.710000 1978.700000 2249.120000 2899.566667 NaN\n", "2016-01-13 2693.746000 2589.390000 3011.610000 2391.913750 NaN\n", "2016-01-14 2456.080000 1274.623333 2336.740000 2341.664286 NaN\n", "2016-01-15 2784.635000 4452.075000 2389.252500 2019.024000 NaN\n", "2016-01-16 1871.776667 4390.910000 1474.106000 3959.580000 NaN\n", "2016-01-17 1838.467500 2715.795000 2396.167500 1604.277500 NaN\n", "2016-01-18 2287.111667 4001.793750 2302.727500 2558.880000 4192.760000\n", "2016-01-19 2666.602500 1599.908333 1752.883333 1199.605556 NaN\n", "2016-01-20 2931.550000 3002.483333 1593.802857 2323.136000 NaN\n", "2016-01-21 2294.050000 2676.760000 1100.090000 2637.204167 NaN\n", "2016-01-22 1982.850000 1257.962857 3056.168750 2241.121667 2267.086667\n", "2016-01-23 2074.580000 2288.303333 3044.710000 2620.335000 1395.650000\n", "2016-01-24 2643.150000 2974.468571 1296.088000 2374.154000 NaN\n", "2016-01-25 2280.662857 2126.724000 2518.366000 1378.290000 NaN\n", "2016-01-26 2856.388333 2502.004000 2032.230000 856.983333 NaN\n", "2016-01-27 1861.821667 2942.900000 1988.678000 NaN 2465.666667\n", "2016-01-28 2218.346667 4202.705000 2899.943333 2481.795000 786.270000\n", "2016-01-29 2249.405000 2308.105714 3600.764444 2495.716667 NaN\n", "2016-01-30 2340.668889 2407.977143 772.580000 2442.558571 NaN\n", "2016-01-31 1747.980000 2723.865714 2928.438000 2564.803333 NaN\n", "2016-02-01 4419.866667 2494.590000 3073.573750 2686.650000 NaN\n", "2016-02-02 1486.748000 2444.455000 3577.707778 1073.430000 NaN\n", "2016-03-01 2279.590000 4372.235714 2468.440000 2574.437143 NaN\n", "2016-03-02 1490.204000 3080.872000 2099.906667 2161.867143 1920.210000\n", "2016-04-01 2951.452857 1038.900000 2455.441429 2187.444286 NaN\n", "2016-04-02 1507.856000 2675.234000 3138.163333 2199.732000 3027.860000\n", "2016-05-01 3483.363333 2249.283333 2710.720000 2948.008333 
NaN\n", "2016-05-02 1992.621667 2404.948000 2845.672500 2339.355000 NaN\n", "2016-06-01 2154.700000 2748.571250 1891.890000 2777.006667 NaN\n", "2016-06-02 1924.160000 1321.549000 2408.695556 2147.680000 NaN\n", "2016-07-01 2718.868333 2836.845714 3742.868333 3811.535556 538.690000\n", "2016-07-02 3776.060000 4228.195000 2421.845000 2798.548000 3965.180000\n", "2016-08-01 3618.547500 3413.140000 1708.800000 2434.700000 NaN\n", "2016-08-02 2846.970000 2320.396250 3671.488182 1544.168000 NaN\n", "2016-09-01 3104.240000 3417.488571 4045.373333 2343.676667 NaN\n", "2016-09-02 1455.944000 2181.548889 2336.960000 2016.385000 3851.250000\n", "2016-10-01 3133.875000 1264.810000 2009.400000 2793.730000 4184.955000\n", "2016-10-02 1684.794000 1824.796250 2691.698000 1785.662000 NaN\n", "2016-11-01 2104.942500 NaN 2520.680000 2366.991429 NaN\n", "2016-11-02 4805.934000 2694.271667 1942.766667 1376.770000 2545.870000\n", "2016-12-01 2400.422000 3680.301250 1640.072857 2402.953333 NaN" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df.pivot_table(index='date', columns='user_type', \n", " values='price',aggfunc=np.mean))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stack a Dataframe" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 date 2016-01-23 00:00:00\n", " price 1395.65\n", " product_id 0\n", " quantity_purchased 3\n", " serial_no 1000\n", " user_id -101\n", " user_type n\n", " user_class error\n", " purchase_week 3\n", "1 price 1352.99\n", " product_id 906\n", " quantity_purchased 19\n", " serial_no 1001\n", " user_id 5632\n", " user_type n\n", " user_class error\n", " purchase_week 0\n", "2 price 3997.32\n", " product_id 625\n", " quantity_purchased 21\n", " serial_no 1002\n", " user_id 5240\n", " user_type n\n", " user_class error\n", " purchase_week 0\n", "3 price 3681.48\n", " product_id 865\n", " quantity_purchased 35\n", " 
serial_no 1003\n", " user_id 5557\n", " ... \n", "997 date 2016-08-01 00:00:00\n", " price 3413.14\n", " product_id 965\n", " quantity_purchased 33\n", " serial_no 1997\n", " user_id 5264\n", " user_type b\n", " user_class new\n", " purchase_week 31\n", " encoded_user_type 1\n", "998 date 2016-10-02 00:00:00\n", " price 4820.05\n", " product_id 678\n", " quantity_purchased 36\n", " serial_no 1998\n", " user_id 5955\n", " user_type b\n", " user_class new\n", " purchase_week 39\n", " encoded_user_type 1\n", "999 date 2016-01-15 00:00:00\n", " price 3906.33\n", " product_id 551\n", " quantity_purchased 3\n", " serial_no 1999\n", " user_id 5688\n", " user_type a\n", " user_class new\n", " purchase_week 2\n", " encoded_user_type 0\n", "Length: 9907, dtype: object\n" ] } ], "source": [ "print(df.stack())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }