{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline\n", "\n", "from datetime import datetime, date" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Loading the Customer Address Data from the excel file\n", "\n", "cust_address = pd.read_excel('Raw_data.xlsx' , sheet_name='CustomerAddress')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>customer_id</th>\n", " <th>address</th>\n", " <th>postcode</th>\n", " <th>state</th>\n", " <th>country</th>\n", " <th>property_valuation</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>060 Morning Avenue</td>\n", " <td>2016</td>\n", " <td>New South Wales</td>\n", " <td>Australia</td>\n", " <td>10</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>6 Meadow Vale Court</td>\n", " <td>2153</td>\n", " <td>New South Wales</td>\n", " <td>Australia</td>\n", " <td>10</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>4</td>\n", " <td>0 Holy Cross Court</td>\n", " <td>4211</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " <td>9</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>5</td>\n", " <td>17979 Del Mar Point</td>\n", " <td>2448</td>\n", " <td>New South Wales</td>\n", " <td>Australia</td>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>6</td>\n", " <td>9 Oakridge Court</td>\n", " <td>3216</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " <td>9</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " customer_id address postcode state country \\\n", "0 1 060 Morning Avenue 2016 New South Wales Australia \n", "1 2 6 Meadow Vale Court 2153 New South Wales Australia \n", "2 4 0 Holy Cross Court 4211 QLD Australia \n", "3 5 17979 Del Mar Point 2448 New South Wales Australia \n", "4 6 9 Oakridge Court 3216 VIC Australia \n", "\n", " property_valuation \n", "0 10 \n", "1 10 \n", "2 9 \n", "3 4 \n", "4 9 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Checking first 5 records from Customer Address Data\n", "\n", "cust_address.head(5)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 3999 entries, 0 to 3998\n", "Data columns (total 6 columns):\n", "customer_id 3999 non-null int64\n", "address 3999 non-null object\n", "postcode 3999 non-null int64\n", "state 3999 non-null object\n", "country 3999 non-null object\n", "property_valuation 3999 non-null int64\n", "dtypes: int64(3), object(3)\n", "memory usage: 187.5+ KB\n" ] } ], "source": [ "# Information of columns and data-types of Customer Address Data.\n", "\n", "cust_address.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The data-type of columns looks fine. Let;s check for the data quality and apply data cleaning process where ever applicable to clean our dataset before performing any analysis." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Total Records" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total records (rows) in the dataset : 3999\n", "Total columns (features) in the dataset : 6\n" ] } ], "source": [ "print(\"Total records (rows) in the dataset : {}\".format(cust_address.shape[0]))\n", "print(\"Total columns (features) in the dataset : {}\".format(cust_address.shape[1]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Numeric Columns and Non-Numeric Columns" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The numeric columns are : ['customer_id' 'postcode' 'property_valuation']\n", "The non-numeric columns are : ['address' 'state' 'country']\n" ] } ], "source": [ "# select numeric columns\n", "df_numeric = cust_address.select_dtypes(include=[np.number])\n", "numeric_cols = df_numeric.columns.values\n", "print(\"The numeric columns are : {}\".format(numeric_cols))\n", "\n", "\n", "# select non-numeric columns\n", "df_non_numeric = cust_address.select_dtypes(exclude=[np.number])\n", "non_numeric_cols = df_non_numeric.columns.values\n", "print(\"The non-numeric columns are : {}\".format(non_numeric_cols))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Missing Values Check" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Checking for the presence of any missing values in the dataset. If missing values are present for a particular feature then depending upon the situation the feature may be either dropped (cases when a major amount of data is missing) or an appropiate value will be imputed in the feature column with missing values." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "customer_id 0\n", "address 0\n", "postcode 0\n", "state 0\n", "country 0\n", "property_valuation 0\n", "dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total number of missing values\n", "cust_address.isnull().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "<b>In the dataset there are no missing values.</b>" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Inconsistency Check in Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will check whether there is inconsistent data / typo error data is present in the categorical columns.<br>\n", "The columns to be checked are <b>'address', 'postcode' ,'state', 'country'</b>." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.1 State" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "NSW 2054\n", "VIC 939\n", "QLD 838\n", "New South Wales 86\n", "Victoria 82\n", "Name: state, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cust_address['state'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here there are inconsistent data in State column. For New South Wales and Victoria we have two values, one being the full name and the other being their short name. The State names should be standardised and columns with state as <b>New South Wales will be replaced by NSW</b> and columns with state as <b>Victoria will be replaced by VIC</b>." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Function to replace full state names with their short forms.\n", "\n", "def replace_state_names(state_name):\n", " \n", " # Making Short Form of State Names as standards\n", " if state_name=='New South Wales':\n", " return 'NSW'\n", " elif state_name=='Victoria':\n", " return 'VIC'\n", " else :\n", " return state_name\n", "\n", "# Applying the above fuction to state column\n", "cust_address['state'] = cust_address['state'].apply(replace_state_names)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After applying the above function the state name is standardised and there is no inconsistency in the state column." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "NSW 2140\n", "VIC 1021\n", "QLD 838\n", "Name: state, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cust_address['state'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.2 Country" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Australia 3999\n", "Name: country, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cust_address['country'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There is no inconsistency of data in the Country column." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.3 Postcode" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The Postcode column looks perfect. There is no inconsistency / typo in the data." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>address</th>\n", " <th>postcode</th>\n", " <th>state</th>\n", " <th>country</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>060 Morning Avenue</td>\n", " <td>2016</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>6 Meadow Vale Court</td>\n", " <td>2153</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0 Holy Cross Court</td>\n", " <td>4211</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>17979 Del Mar Point</td>\n", " <td>2448</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>9 Oakridge Court</td>\n", " <td>3216</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>4 Delaware Trail</td>\n", " <td>2210</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>49 Londonderry Lane</td>\n", " <td>2650</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>97736 7th Trail</td>\n", " <td>2023</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>93405 Ludington Park</td>\n", " <td>3044</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>44339 Golden Leaf Alley</td>\n", " <td>4557</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>2 Sutherland Street</td>\n", " <td>3799</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>9 Mcbride Trail</td>\n", " <td>2760</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>9861 New Castle Avenue</td>\n", " <td>2428</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>52 Moland Street</td>\n", " <td>3331</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>82391 Kensington Lane</td>\n", " <td>3058</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>092 2nd Alley</td>\n", " <td>2135</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>59 Spaight Circle</td>\n", " <td>2233</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>032 Bartelt Crossing</td>\n", " <td>2444</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>18 Jenna Center</td>\n", " <td>2650</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>3 Cordelia Plaza</td>\n", " <td>2153</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>28 5th Center</td>\n", " <td>4413</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>52 Carey Alley</td>\n", " <td>4740</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>96 Texas Plaza</td>\n", " <td>3218</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>48 Eagan Avenue</td>\n", " <td>4868</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>9 Buell Park</td>\n", " <td>4116</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>02663 Buell Parkway</td>\n", " <td>2519</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>2294 Pleasure Place</td>\n", " <td>2135</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>2951 Petterle Place</td>\n", " <td>2756</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>63 Lukken Drive</td>\n", " <td>2170</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>833 Luster Way</td>\n", " <td>4005</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>3969</th>\n", " <td>81609 Vernon Terrace</td>\n", " <td>3934</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3970</th>\n", " <td>37 Hintze Court</td>\n", " <td>2168</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3971</th>\n", " <td>07 Morning Court</td>\n", " <td>3805</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3972</th>\n", " <td>327 Loeprich Street</td>\n", " <td>3187</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3973</th>\n", " <td>5106 Northridge Drive</td>\n", " <td>2560</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3974</th>\n", " <td>9754 High Crossing Terrace</td>\n", " <td>2148</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3975</th>\n", " <td>7 Meadow Vale Court</td>\n", " <td>2114</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3976</th>\n", " <td>80260 Morning Road</td>\n", " <td>3178</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3977</th>\n", " <td>38017 Briar Crest Drive</td>\n", " <td>2165</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3978</th>\n", " <td>60 Morningstar Center</td>\n", " <td>2126</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3979</th>\n", " <td>218 Stuart Junction</td>\n", " <td>2223</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3980</th>\n", " <td>9 Butterfield Lane</td>\n", " <td>4077</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3981</th>\n", " <td>614 Burning Wood Way</td>\n", " <td>2148</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3982</th>\n", " <td>9 Grover Point</td>\n", " <td>4218</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3983</th>\n", " <td>565 Bunting Park</td>\n", " <td>2076</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3984</th>\n", " <td>9461 Saint Paul Trail</td>\n", " <td>2428</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3985</th>\n", " <td>5204 Delaware Pass</td>\n", " <td>2560</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3986</th>\n", " <td>24 Scott Pass</td>\n", " <td>4300</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3987</th>\n", " <td>8 Randy Parkway</td>\n", " <td>2209</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3988</th>\n", " <td>681 Elmside Place</td>\n", " <td>3750</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3989</th>\n", " <td>2918 Summer Ridge Hill</td>\n", " <td>3030</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3990</th>\n", " <td>613 Erie Lane</td>\n", " <td>2088</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3991</th>\n", " <td>0 Transport Center</td>\n", " <td>3977</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3992</th>\n", " <td>4 Dovetail Crossing</td>\n", " <td>2350</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3993</th>\n", " <td>736 Roxbury Junction</td>\n", " <td>2540</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3994</th>\n", " <td>1482 Hauk Trail</td>\n", " <td>3064</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3995</th>\n", " <td>57042 Village Green Point</td>\n", " <td>4511</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3996</th>\n", " <td>87 Crescent Oaks Alley</td>\n", " <td>2756</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3997</th>\n", " <td>8194 Lien Street</td>\n", " <td>4032</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " </tr>\n", " <tr>\n", " <th>3998</th>\n", " <td>320 Acker Drive</td>\n", " <td>2251</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>3999 rows × 4 columns</p>\n", "</div>" ], "text/plain": [ " address postcode state country\n", "0 060 Morning Avenue 2016 NSW Australia\n", "1 6 Meadow Vale Court 2153 NSW Australia\n", "2 0 Holy Cross Court 4211 QLD Australia\n", "3 17979 Del Mar Point 2448 NSW Australia\n", "4 9 Oakridge Court 3216 VIC Australia\n", "5 4 Delaware Trail 2210 NSW Australia\n", "6 49 Londonderry Lane 2650 NSW Australia\n", "7 97736 7th Trail 2023 NSW Australia\n", "8 93405 Ludington Park 3044 VIC Australia\n", "9 44339 Golden Leaf Alley 4557 QLD Australia\n", "10 2 Sutherland Street 3799 VIC Australia\n", "11 9 Mcbride Trail 2760 NSW Australia\n", "12 9861 New Castle Avenue 2428 NSW Australia\n", "13 52 Moland Street 3331 VIC Australia\n", "14 82391 Kensington Lane 3058 VIC Australia\n", "15 092 2nd Alley 2135 NSW Australia\n", "16 59 Spaight Circle 2233 NSW Australia\n", "17 032 Bartelt Crossing 2444 NSW Australia\n", "18 18 Jenna Center 2650 NSW Australia\n", "19 3 Cordelia Plaza 2153 NSW Australia\n", "20 28 5th Center 4413 QLD Australia\n", "21 52 Carey Alley 4740 QLD Australia\n", "22 96 Texas Plaza 3218 VIC Australia\n", "23 48 Eagan Avenue 4868 QLD Australia\n", "24 9 Buell Park 4116 QLD Australia\n", "25 02663 Buell Parkway 2519 NSW Australia\n", "26 2294 Pleasure Place 2135 NSW Australia\n", "27 2951 Petterle Place 2756 NSW Australia\n", "28 63 Lukken Drive 2170 NSW Australia\n", "29 833 Luster Way 4005 QLD Australia\n", "... ... ... ... ...\n", "3969 81609 Vernon Terrace 3934 VIC Australia\n", "3970 37 Hintze Court 2168 NSW Australia\n", "3971 07 Morning Court 3805 VIC Australia\n", "3972 327 Loeprich Street 3187 VIC Australia\n", "3973 5106 Northridge Drive 2560 NSW Australia\n", "3974 9754 High Crossing Terrace 2148 NSW Australia\n", "3975 7 Meadow Vale Court 2114 NSW Australia\n", "3976 80260 Morning Road 3178 VIC Australia\n", "3977 38017 Briar Crest Drive 2165 NSW Australia\n", "3978 60 Morningstar Center 2126 NSW Australia\n", "3979 218 Stuart Junction 2223 NSW Australia\n", "3980 9 Butterfield Lane 4077 QLD Australia\n", "3981 614 Burning Wood Way 2148 NSW Australia\n", "3982 9 Grover Point 4218 QLD Australia\n", "3983 565 Bunting Park 2076 NSW Australia\n", "3984 9461 Saint Paul Trail 2428 NSW Australia\n", "3985 5204 Delaware Pass 2560 NSW Australia\n", "3986 24 Scott Pass 4300 QLD Australia\n", "3987 8 Randy Parkway 2209 NSW Australia\n", "3988 681 Elmside Place 3750 VIC Australia\n", "3989 2918 Summer Ridge Hill 3030 VIC Australia\n", "3990 613 Erie Lane 2088 NSW Australia\n", "3991 0 Transport Center 3977 VIC Australia\n", "3992 4 Dovetail Crossing 2350 NSW Australia\n", "3993 736 Roxbury Junction 2540 NSW Australia\n", "3994 1482 Hauk Trail 3064 VIC Australia\n", "3995 57042 Village Green Point 4511 QLD Australia\n", "3996 87 Crescent Oaks Alley 2756 NSW Australia\n", "3997 8194 Lien Street 4032 QLD Australia\n", "3998 320 Acker Drive 2251 NSW Australia\n", "\n", "[3999 rows x 4 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cust_address[['address','postcode', 'state' , 'country']].drop_duplicates()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Duplication Checks" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We need to ensure that there is no duplication of records in the dataset. This may lead to error in data analysis due to poor data quality. If there are duplicate rows of data then we need to drop such records.<br>For checking for duplicate records we need to firstly remove the primary key column of the dataset then apply drop_duplicates() function provided by Python." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of records after removing customer_id (pk), duplicates : 3999\n", "Number of records in original dataset : 3999\n" ] } ], "source": [ "# Dropping the primary key column i.e customer_id and storing into a temporary dataframe.\n", "cust_address_dedupped = cust_address.drop('customer_id', axis=1).drop_duplicates()\n", "\n", "print(\"Number of records after removing customer_id (pk), duplicates : {}\".format(cust_address_dedupped.shape[0]))\n", "print(\"Number of records in original dataset : {}\".format(cust_address.shape[0]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "<b>Since both the numbers are same. There are no duplicate records in the dataset</b>" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Exporting the Cleaned Customer Demographic Data Set to csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Currently the Customer Address dataset is clean. Hence we can export the data to a csv to continue our data analysis of Customer Segments by joining it to other tables." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "cust_address.to_csv('CustomerAddress_Cleaned.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Checking for Master-Detail Record Counts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Checking with the Master Table (CustomerDemographic_Cleaned.csv) containing the entire Customer Data for the Customer IDs which are getting dropped from the Customer Address Dataset.<br> \n", "<b>Basically these are the Customers who have an address but are not a part of the Demographics dataset yet.</b>" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "cust_demo_detail = pd.read_csv('CustomerDemographic_Cleaned.csv')" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>customer_id</th>\n", " <th>first_name</th>\n", " <th>last_name</th>\n", " <th>gender</th>\n", " <th>past_3_years_bike_related_purchases</th>\n", " <th>DOB</th>\n", " <th>job_title</th>\n", " <th>job_industry_category</th>\n", " <th>wealth_segment</th>\n", " <th>deceased_indicator</th>\n", " <th>owns_car</th>\n", " <th>tenure</th>\n", " <th>Age</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>Laraine</td>\n", " <td>Medendorp</td>\n", " <td>Female</td>\n", " <td>93</td>\n", " <td>1953-10-12</td>\n", " <td>Executive Secretary</td>\n", " <td>Health</td>\n", " <td>Mass Customer</td>\n", " <td>N</td>\n", " <td>Yes</td>\n", " <td>11.0</td>\n", " <td>67</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>Eli</td>\n", " <td>Bockman</td>\n", " <td>Male</td>\n", " <td>81</td>\n", " <td>1980-12-16</td>\n", " <td>Administrative Officer</td>\n", " <td>Financial Services</td>\n", " <td>Mass Customer</td>\n", " <td>N</td>\n", " <td>Yes</td>\n", " <td>16.0</td>\n", " <td>40</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>Arlin</td>\n", " <td>Dearle</td>\n", " <td>Male</td>\n", " <td>61</td>\n", " <td>1954-01-20</td>\n", " <td>Recruiting Manager</td>\n", " <td>Property</td>\n", " <td>Mass Customer</td>\n", " <td>N</td>\n", " <td>Yes</td>\n", " <td>15.0</td>\n", " <td>67</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4</td>\n", " <td>Talbot</td>\n", " <td>None</td>\n", " <td>Male</td>\n", " <td>33</td>\n", " <td>1961-10-03</td>\n", " <td>Missing</td>\n", " <td>IT</td>\n", " <td>Mass Customer</td>\n", " <td>N</td>\n", " <td>No</td>\n", " <td>7.0</td>\n", " <td>59</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5</td>\n", " <td>Sheila-kathryn</td>\n", " <td>Calton</td>\n", " <td>Female</td>\n", " <td>56</td>\n", " <td>1977-05-13</td>\n", " <td>Senior Editor</td>\n", " <td>Missing</td>\n", " <td>Affluent Customer</td>\n", " <td>N</td>\n", " <td>Yes</td>\n", " <td>8.0</td>\n", " <td>43</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " customer_id first_name last_name gender \\\n", "0 1 Laraine Medendorp Female \n", "1 2 Eli Bockman Male \n", "2 3 Arlin Dearle Male \n", "3 4 Talbot None Male \n", "4 5 Sheila-kathryn Calton Female \n", "\n", " past_3_years_bike_related_purchases DOB job_title \\\n", "0 93 1953-10-12 Executive Secretary \n", "1 81 1980-12-16 Administrative Officer \n", "2 61 1954-01-20 Recruiting Manager \n", "3 33 1961-10-03 Missing \n", "4 56 1977-05-13 Senior Editor \n", "\n", " job_industry_category wealth_segment deceased_indicator owns_car \\\n", "0 Health Mass Customer N Yes \n", "1 Financial Services Mass Customer N Yes \n", "2 Property Mass Customer N Yes \n", "3 IT Mass Customer N No \n", "4 Missing Affluent Customer N Yes \n", "\n", " tenure Age \n", "0 11.0 67 \n", "1 16.0 40 \n", "2 15.0 67 \n", "3 7.0 59 \n", "4 8.0 43 " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cust_demo_detail.head()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total Records in Customer_Demographic_Table : 3912\n", "Total Records in Customer_Address_Table : 3999\n", "In Demographic Table 87 records are getting dropped due to data cleaning process in Demographic Table\n" ] } ], "source": [ "print(\"Total Records in Customer_Demographic_Table : {}\".format(cust_demo_detail.shape[0]))\n", "print(\"Total Records in Customer_Address_Table : {}\".format(cust_address.shape[0]))\n", "print('In Demographic Table {} records are getting dropped due to data cleaning process in Demographic Table'\n", " .format(cust_address.shape[0]-cust_demo_detail.shape[0]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Customer IDs in Address table getting dropped :" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>customer_id</th>\n", " <th>address</th>\n", " <th>postcode</th>\n", " <th>state</th>\n", " <th>country</th>\n", " <th>property_valuation</th>\n", " <th>first_name</th>\n", " <th>last_name</th>\n", " <th>gender</th>\n", " <th>past_3_years_bike_related_purchases</th>\n", " <th>DOB</th>\n", " <th>job_title</th>\n", " <th>job_industry_category</th>\n", " <th>wealth_segment</th>\n", " <th>deceased_indicator</th>\n", " <th>owns_car</th>\n", " <th>tenure</th>\n", " <th>Age</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>060 Morning Avenue</td>\n", " <td>2016.0</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " <td>10.0</td>\n", " <td>Laraine</td>\n", " <td>Medendorp</td>\n", " <td>Female</td>\n", " <td>93.0</td>\n", " <td>1953-10-12</td>\n", " <td>Executive Secretary</td>\n", " <td>Health</td>\n", " <td>Mass Customer</td>\n", " <td>N</td>\n", " <td>Yes</td>\n", " <td>11.0</td>\n", " <td>67.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>6 Meadow Vale Court</td>\n", " <td>2153.0</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " <td>10.0</td>\n", " <td>Eli</td>\n", " <td>Bockman</td>\n", " <td>Male</td>\n", " <td>81.0</td>\n", " <td>1980-12-16</td>\n", " <td>Administrative Officer</td>\n", " <td>Financial Services</td>\n", " <td>Mass Customer</td>\n", " <td>N</td>\n", " <td>Yes</td>\n", " <td>16.0</td>\n", " <td>40.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>4</td>\n", " <td>0 Holy Cross Court</td>\n", " <td>4211.0</td>\n", " <td>QLD</td>\n", " <td>Australia</td>\n", " <td>9.0</td>\n", " <td>Talbot</td>\n", " <td>None</td>\n", " <td>Male</td>\n", " <td>33.0</td>\n", " <td>1961-10-03</td>\n", " <td>Missing</td>\n", " <td>IT</td>\n", " <td>Mass Customer</td>\n", " <td>N</td>\n", " <td>No</td>\n", " <td>7.0</td>\n", " <td>59.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>5</td>\n", " <td>17979 Del Mar Point</td>\n", " <td>2448.0</td>\n", " <td>NSW</td>\n", " <td>Australia</td>\n", " <td>4.0</td>\n", " <td>Sheila-kathryn</td>\n", " <td>Calton</td>\n", " <td>Female</td>\n", " <td>56.0</td>\n", " <td>1977-05-13</td>\n", " <td>Senior Editor</td>\n", " <td>Missing</td>\n", " <td>Affluent Customer</td>\n", " <td>N</td>\n", " <td>Yes</td>\n", " <td>8.0</td>\n", " <td>43.0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>6</td>\n", " <td>9 Oakridge Court</td>\n", " <td>3216.0</td>\n", " <td>VIC</td>\n", " <td>Australia</td>\n", " <td>9.0</td>\n", " <td>Curr</td>\n", " <td>Duckhouse</td>\n", " <td>Male</td>\n", " <td>35.0</td>\n", " <td>1966-09-16</td>\n", " <td>Missing</td>\n", " <td>Retail</td>\n", " <td>High Net Worth</td>\n", " <td>N</td>\n", " <td>Yes</td>\n", " <td>13.0</td>\n", " <td>54.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " customer_id address postcode state country \\\n", "0 1 060 Morning Avenue 2016.0 NSW Australia \n", "1 2 6 Meadow Vale Court 2153.0 NSW Australia \n", "2 4 0 Holy Cross Court 4211.0 QLD Australia \n", "3 5 17979 Del Mar Point 2448.0 NSW Australia \n", "4 6 9 Oakridge Court 3216.0 VIC Australia \n", "\n", " property_valuation first_name last_name gender \\\n", "0 10.0 Laraine Medendorp Female \n", "1 10.0 Eli Bockman Male \n", "2 9.0 Talbot None Male \n", "3 4.0 Sheila-kathryn Calton Female \n", "4 9.0 Curr Duckhouse Male \n", "\n", " past_3_years_bike_related_purchases DOB job_title \\\n", "0 93.0 1953-10-12 Executive Secretary \n", "1 81.0 1980-12-16 Administrative Officer \n", "2 33.0 1961-10-03 Missing \n", "3 56.0 1977-05-13 Senior Editor \n", "4 35.0 1966-09-16 Missing \n", "\n", " job_industry_category wealth_segment deceased_indicator owns_car \\\n", "0 Health Mass Customer N Yes \n", "1 Financial Services Mass Customer N Yes \n", "2 IT Mass Customer N No \n", "3 Missing Affluent Customer N Yes \n", "4 Retail High Net Worth N Yes \n", "\n", " tenure Age \n", "0 11.0 67.0 \n", "1 16.0 40.0 \n", "2 7.0 59.0 \n", "3 8.0 43.0 \n", "4 13.0 54.0 " ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cust_drop = cust_address.merge(cust_demo_detail , left_on = 'customer_id', right_on='customer_id'\n", " , how='outer')\n", "cust_drop.head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }