{ "cells": [ { "cell_type": "markdown", "id": "2540c9d1", "metadata": {}, "source": [ "# Supermarket Ordering, Invoicing, and Sales\n", "\n", "Joel Day, Nicholas Lee, and Christine Vu\n", "\n", "Shiley-Marcos School of Engineering, University of San Diego\n", "\n", "ADS 507: Practical Data Engineering\n", "\n", "Professor Jonathan Sixt\n", "\n", "February 27, 2023" ] }, { "cell_type": "markdown", "id": "a4ddd353", "metadata": {}, "source": [ "***" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Description" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Invoices.csv" ] }, { "cell_type": "markdown", "id": "99906f06", "metadata": {}, "source": [ "| Variable | Description |\n", "| --- | --- |\n", "| Order Id | The order identification number |\n", "| Date | The date the order was placed |\n", "| Meal Id | The meal identification number |\n", "| Company Id | The company identification number |\n", "| Date of Meal | The date the meal was served |\n", "| Participants | The number of people who participated in the meal |\n", "| Meal Price | The cost of the meal |\n", "| Type of Meal | The type of meal that was ordered |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### OrderLeads.csv" ] }, { "cell_type": "markdown", "id": "99906f06", "metadata": {}, "source": [ "| Variable | Description |\n", "| --- | --- |\n", "| Order Id | The order identification number |\n", "| Company Id | The company identification number |\n", "| Company Name | The name of the company associated with the order |\n", "| Date | The date the order was placed |\n", "| Order Value | The total value of the order |\n", "| Converted | Whether or not the order was converted into a sale |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SalesTeam.csv" ] }, { "cell_type": "markdown", "id": "99906f06", "metadata": {}, "source": [ "| Variable | Description |\n", "| --- | --- |\n", "| Sales Rep | The name of the sales representative |\n", "| Sales Rep Id | The sales representative identification number |\n", "| Company Name | The name of the company associated with the order |\n", "| Company Id | The company identification number |" ] }, { "cell_type": "markdown", "id": "de343d32", "metadata": {}, "source": [ "***" ] }, { "cell_type": "markdown", "id": "af0c1b94", "metadata": {}, "source": [ "## Data Importing and Pre-processing" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Packages\n", "import datetime as dt\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import re\n", "\n", "import pymysql\n", "from sqlalchemy import create_engine\n", "import requests\n", "import io\n", "import os\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Import in CSV files" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Function to Pull Raw CSV from GitHub and Convert to Pandas Dataframe Object\n", "\n", "def github_to_pandas(raw_git_url):\n", " # Pull Raw CSV File from GitHub\n", " file_name = str(raw_git_url)\n", " pull_file = requests.get(file_name).content\n", "\n", " # Convert Raw CSV to Pandas Dataframe\n", " csv_df = pd.read_csv(io.StringIO(pull_file.decode('utf-8')))\n", "\n", " return csv_df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Pull CSV files from GitHub and Convert to Pandas Dataframe\n", "invoice_df = github_to_pandas(\n", " \"https://raw.githubusercontent.com/nlee98/ADS-507-Data-Engineering/main/Invoices.csv\")\n", "\n", "orderleads_df = github_to_pandas(\n", " \"https://raw.githubusercontent.com/nlee98/ADS-507-Data-Engineering/main/OrderLeads.csv\")\n", "\n", "salesteam_df = github_to_pandas(\n", " \"https://raw.githubusercontent.com/nlee98/ADS-507-Data-Engineering/main/SalesTeam.csv\")\n", "\n", "print(\"CSV Files from GitHub Loaded into Pandas DataFrames\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Pre-processing" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "- Invoice Missing Values:\n", " Order Id 0\n", "Date 0\n", "Meal Id 0\n", "Company Id 0\n", "Date of Meal 0\n", "Participants 0\n", "Meal Price 0\n", "Type of Meal 0\n", "dtype: int64\n", "\n", "- Order Leads Missing Values:\n", " Order Id 0\n", "Company Id 0\n", "Company Name 0\n", "Date 0\n", "Order Value 0\n", "Converted 0\n", "dtype: int64\n", "\n", "- Sales Team Missing Values:\n", " Sales Rep 0\n", "Sales Rep Id 0\n", "Company Name 0\n", "Company Id 0\n", "dtype: int64\n" ] } ], "source": [ "# Find missing values\n", "print(\"- Invoice Missing Values:\\n\", invoice_df.isnull().sum())\n", "print(\"\\n- Order Leads Missing Values:\\n\", orderleads_df.isnull().sum())\n", "print(\"\\n- Sales Team Missing Values:\\n\", salesteam_df.isnull().sum())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "- Invoice Data Types:\n", " Order Id object\n", "Date object\n", "Meal Id object\n", "Company Id object\n", "Date of Meal object\n", "Participants object\n", "Meal Price int64\n", "Type of Meal object\n", "dtype: object\n", "\n", "- Order Leads Data Types:\n", " Order Id object\n", "Company Id object\n", "Company Name object\n", "Date object\n", "Order Value int64\n", "Converted int64\n", "dtype: object\n", "\n", "- Sales Team Data Types:\n", " Sales Rep object\n", "Sales Rep Id object\n", "Company Name object\n", "Company Id object\n", "dtype: object\n" ] } ], "source": [ "# Data types of all columns\n", "print(\"- Invoice Data Types:\\n\", invoice_df.dtypes)\n", "print(\"\\n- Order Leads Data Types:\\n\", orderleads_df.dtypes)\n", "print(\"\\n- Sales Team Data Types:\\n\", salesteam_df.dtypes)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "- Invoice Duplicated Values: 0\n", "- Order Leads Duplicated Values: 0\n", "- Sales Team Duplicated Values: 0\n" ] } ], "source": [ "# Duplicated data\n", "print(\"- Invoice Duplicated Values:\", invoice_df.duplicated().sum())\n", "print(\"- Order Leads Duplicated Values:\", orderleads_df.duplicated().sum())\n", "print(\"- Sales Team Duplicated Values:\", salesteam_df.duplicated().sum())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "***" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Explore CSV Files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Invoice CSV" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Order IdDateMeal IdCompany IdDate of MealParticipantsMeal PriceType of Meal
0839FKFW2LLX4LMBB27-05-2016INBUX904GIHI8YBDLJKS5NK6788CYMUU2016-05-31 07:00:00+02:00['David Bishop']469Breakfast
197OX39BGVMHODLJM27-09-2018J0MMOOPP709DIDIELJKS5NK6788CYMUU2018-10-01 20:00:00+02:00['David Bishop']22Dinner
2041ORQM5OIHTIU6L24-08-2014E4UJLQNCI16UX5CSLJKS5NK6788CYMUU2014-08-23 14:00:00+02:00['Karen Stansell']314Lunch
\n", "
" ], "text/plain": [ " Order Id Date Meal Id Company Id \\\n", "0 839FKFW2LLX4LMBB 27-05-2016 INBUX904GIHI8YBD LJKS5NK6788CYMUU \n", "1 97OX39BGVMHODLJM 27-09-2018 J0MMOOPP709DIDIE LJKS5NK6788CYMUU \n", "2 041ORQM5OIHTIU6L 24-08-2014 E4UJLQNCI16UX5CS LJKS5NK6788CYMUU \n", "\n", " Date of Meal Participants Meal Price Type of Meal \n", "0 2016-05-31 07:00:00+02:00 ['David Bishop'] 469 Breakfast \n", "1 2018-10-01 20:00:00+02:00 ['David Bishop'] 22 Dinner \n", "2 2014-08-23 14:00:00+02:00 ['Karen Stansell'] 314 Lunch " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "invoice_df.head(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Transformations\n", "* Add Underscores to each column name\n", "* Transform Date and Date of Meal to date/datetime data types\n", "* Time of day column\n", "* Number of participants column" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Replace spaces with underscores in all dataframe column names\n", "invoice_df.columns = invoice_df.columns.str.replace(\" \", \"_\")\n", "orderleads_df.columns = orderleads_df.columns.str.replace(\" \", \"_\")\n", "salesteam_df.columns = salesteam_df.columns.str.replace(\" \", \"_\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Date to Date (\"d-m-Y\")\n", "invoice_df[\"Date\"] = pd.to_datetime(\n", " invoice_df[\"Date\"], format='%d-%m-%Y')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Drop \"+HH:MM:SS\" to make all uniform to UTC timezone\n", "invoice_df[\"Date_of_Meal\"] = invoice_df[\"Date_of_Meal\"].apply(\n", " lambda x: x.split(\"+\")[0]\n", ")\n", "\n", "# Convert Date_of_Meal to Datetime format\n", "invoice_df[\"Date_of_Meal\"] = pd.to_datetime(\n", " invoice_df[\"Date_of_Meal\"],\n", " format = \"%Y-%m-%d %H:%M:%S\"\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Convert Date_of_Meal to Datetime format\n", "invoice_df[\"Date_of_Meal\"] = pd.to_datetime(\n", " invoice_df[\"Date_of_Meal\"],\n", " format = \"%Y-%m-%d %H:%M:%S\"\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Function defining hour of the day with the time of day\n", "def time_of_day(x):\n", " day_hour = x.hour\n", " if (day_hour >= 5) and (day_hour <= 8): # 5am - 8am\n", " return \"Early Morning\"\n", " elif (day_hour > 8) and (day_hour <= 12): # 9am - 12pm\n", " return \"Late Morning\"\n", " elif (day_hour > 12) and (day_hour <= 15): # 1pm - 3pm\n", " return \"Early Afternoon\"\n", " elif (day_hour > 15) and (day_hour <= 19): # 4pm - 7pm\n", " return \"Evening\"\n", " elif (day_hour > 19) and (day_hour <= 23): # 8pm - 11pm\n", " return \"Night\"\n", " else: # 12am - 4am\n", " return \"Late Night\"" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Apply time_of_day function to Date_of_Meal column\n", "invoice_df[\"Part_of_Day\"] = invoice_df[\"Date_of_Meal\"].apply(time_of_day)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Order_IdDateMeal_IdCompany_IdDate_of_MealParticipantsMeal_PriceType_of_MealPart_of_DayNumber_of_Participants
0839FKFW2LLX4LMBB2016-05-27INBUX904GIHI8YBDLJKS5NK6788CYMUU2016-05-31 07:00:00['David Bishop']469BreakfastEarly Morning1
197OX39BGVMHODLJM2018-09-27J0MMOOPP709DIDIELJKS5NK6788CYMUU2018-10-01 20:00:00['David Bishop']22DinnerNight1
2041ORQM5OIHTIU6L2014-08-24E4UJLQNCI16UX5CSLJKS5NK6788CYMUU2014-08-23 14:00:00['Karen Stansell']314LunchEarly Afternoon1
3YT796QI18WNGZ7ZJ2014-04-12C9SDFHF7553BE247LJKS5NK6788CYMUU2014-04-07 21:00:00['Addie Patino']438DinnerNight1
46YLROQT27B6HRF4E2015-07-2848EQXS6IHYNZDDZ5LJKS5NK6788CYMUU2015-07-27 14:00:00['Addie Patino' 'Susan Guerrero']690LunchEarly Afternoon2
\n", "
" ], "text/plain": [ " Order_Id Date Meal_Id Company_Id \\\n", "0 839FKFW2LLX4LMBB 2016-05-27 INBUX904GIHI8YBD LJKS5NK6788CYMUU \n", "1 97OX39BGVMHODLJM 2018-09-27 J0MMOOPP709DIDIE LJKS5NK6788CYMUU \n", "2 041ORQM5OIHTIU6L 2014-08-24 E4UJLQNCI16UX5CS LJKS5NK6788CYMUU \n", "3 YT796QI18WNGZ7ZJ 2014-04-12 C9SDFHF7553BE247 LJKS5NK6788CYMUU \n", "4 6YLROQT27B6HRF4E 2015-07-28 48EQXS6IHYNZDDZ5 LJKS5NK6788CYMUU \n", "\n", " Date_of_Meal Participants Meal_Price \\\n", "0 2016-05-31 07:00:00 ['David Bishop'] 469 \n", "1 2018-10-01 20:00:00 ['David Bishop'] 22 \n", "2 2014-08-23 14:00:00 ['Karen Stansell'] 314 \n", "3 2014-04-07 21:00:00 ['Addie Patino'] 438 \n", "4 2015-07-27 14:00:00 ['Addie Patino' 'Susan Guerrero'] 690 \n", "\n", " Type_of_Meal Part_of_Day Number_of_Participants \n", "0 Breakfast Early Morning 1 \n", "1 Dinner Night 1 \n", "2 Lunch Early Afternoon 1 \n", "3 Dinner Night 1 \n", "4 Lunch Early Afternoon 2 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add a field to count the number of participants\n", "invoice_df['Number_of_Participants'] = invoice_df['Participants'].apply(lambda x: x.count(\"'\")/2)\n", "invoice_df['Number_of_Participants'] = invoice_df['Number_of_Participants'].astype(int)\n", "\n", "invoice_df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Customer-Order Table\n", "Connect the customer id to each order id the customer placed. This table will link the customer information to the invoice information." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Order_IdParticipant_NameCustomer_IdLast_Updated
0839FKFW2LLX4LMBBDavid Bishop12023-02-15
197OX39BGVMHODLJMDavid Bishop12023-02-15
2041ORQM5OIHTIU6LKaren Stansell22023-02-15
3YT796QI18WNGZ7ZJAddie Patino32023-02-15
46YLROQT27B6HRF4EAddie Patino32023-02-15
46YLROQT27B6HRF4ESusan Guerrero42023-02-15
5AT0R4DFYYAFOC88QDavid Bishop12023-02-15
5AT0R4DFYYAFOC88QSusan Guerrero42023-02-15
5AT0R4DFYYAFOC88QKaren Stansell22023-02-15
\n", "
" ], "text/plain": [ " Order_Id Participant_Name Customer_Id Last_Updated\n", "0 839FKFW2LLX4LMBB David Bishop 1 2023-02-15\n", "1 97OX39BGVMHODLJM David Bishop 1 2023-02-15\n", "2 041ORQM5OIHTIU6L Karen Stansell 2 2023-02-15\n", "3 YT796QI18WNGZ7ZJ Addie Patino 3 2023-02-15\n", "4 6YLROQT27B6HRF4E Addie Patino 3 2023-02-15\n", "4 6YLROQT27B6HRF4E Susan Guerrero 4 2023-02-15\n", "5 AT0R4DFYYAFOC88Q David Bishop 1 2023-02-15\n", "5 AT0R4DFYYAFOC88Q Susan Guerrero 4 2023-02-15\n", "5 AT0R4DFYYAFOC88Q Karen Stansell 2 2023-02-15" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Find all the occurrences of customer names then explode to convert values in lists to rows\n", "cust = invoice_df['Participants'].str.findall(r\"'(.*?)'\").explode()\n", "\n", "# Join with order id \n", "cust_order_df = invoice_df[['Order_Id']].join(cust)\n", "\n", "# Factorize to encode the unique values in participants\n", "cust_order_df['Customer_Id'] = cust_order_df['Participants'].factorize()[0] + 1\n", "cust_order_df[\"Customer_Id\"] = cust_order_df[\"Customer_Id\"].astype(str)\n", "\n", "# Rename Participants Column\n", "cust_order_df.columns = [\"Order_Id\", \"Participant_Name\", \"Customer_Id\"]\n", "\n", "# Add Last Updated Date\n", "cust_order_df[\"Last_Updated\"] = dt.date.today()\n", "\n", "cust_order_df.head(9)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Order Leads CSV\n", "* Converted Column - Whether or not a order was converted into a sale" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Order_IdCompany_IdCompany_NameDateOrder_ValueConverted
080EYLOKP9E762WKGLJKS5NK6788CYMUUChimera-Chasing Casbah18-02-201748751
1TLEXR1HZWTUTBHPBLJKS5NK6788CYMUUChimera-Chasing Casbah30-07-201584250
2839FKFW2LLX4LMBBLJKS5NK6788CYMUUChimera-Chasing Casbah27-05-201648370
\n", "
" ], "text/plain": [ " Order_Id Company_Id Company_Name Date \\\n", "0 80EYLOKP9E762WKG LJKS5NK6788CYMUU Chimera-Chasing Casbah 18-02-2017 \n", "1 TLEXR1HZWTUTBHPB LJKS5NK6788CYMUU Chimera-Chasing Casbah 30-07-2015 \n", "2 839FKFW2LLX4LMBB LJKS5NK6788CYMUU Chimera-Chasing Casbah 27-05-2016 \n", "\n", " Order_Value Converted \n", "0 4875 1 \n", "1 8425 0 \n", "2 4837 0 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orderleads_df.head(3)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "orderleads_df[\"Date\"] = pd.to_datetime(orderleads_df[\"Date\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sales Team CSV" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sales_RepSales_Rep_IdCompany_NameCompany_Id
0Jessie Mcallister97UNNAT790E0WM4NChimera-Chasing CasbahLJKS5NK6788CYMUU
1Jessie Mcallister97UNNAT790E0WM4NTangential Sheds36MFTZOYMTAJP1RK
2Jessie Mcallister97UNNAT790E0WM4NTwo-Mile GrabH3JRC7XX7WJAD4ZO
\n", "
" ], "text/plain": [ " Sales_Rep Sales_Rep_Id Company_Name \\\n", "0 Jessie Mcallister 97UNNAT790E0WM4N Chimera-Chasing Casbah \n", "1 Jessie Mcallister 97UNNAT790E0WM4N Tangential Sheds \n", "2 Jessie Mcallister 97UNNAT790E0WM4N Two-Mile Grab \n", "\n", " Company_Id \n", "0 LJKS5NK6788CYMUU \n", "1 36MFTZOYMTAJP1RK \n", "2 H3JRC7XX7WJAD4ZO " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "salesteam_df.head(3)" ] }, { "cell_type": "markdown", "id": "de343d32", "metadata": {}, "source": [ "***" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Connection to MySQL Server" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# Manually Login to MySQL\n", "mysql_username = str(input(\"Enter MySQL Username: \"))\n", "mysql_password = str(input(\"Enter MySQL Password: \"))\n", "\n", "mysql_conn = pymysql.connect(\n", " host = \"localhost\",\n", " port = int(3306),\n", " user = mysql_username,\n", " passwd = mysql_password\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create Supermarket Database\n", "* Tries to drop the database, if it previously existed\n", " - Otherwise, creates the database" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ADS-507 Supermarket Database Created\n" ] } ], "source": [ "# Create ADS-507_Supermarket MySQL Database\n", "## Drop the database to create an updated version if it exists\n", "try :\n", " mysql_conn.cursor().execute(\n", " \"\"\"\n", " DROP DATABASE ADS_507_Supermarket;\n", " \"\"\"\n", " )\n", " mysql_conn.cursor().execute(\n", " \"\"\"\n", " CREATE DATABASE IF NOT EXISTS ADS_507_Supermarket;\n", " \"\"\"\n", " )\n", "# Create the database if it has not done so before\n", "except: \n", " mysql_conn.cursor().execute(\n", " \"\"\"\n", " CREATE DATABASE IF NOT EXISTS ADS_507_Supermarket;\n", " \"\"\"\n", " )\n", "\n", "# Navigate to Supermarket Database\n", "mysql_conn.select_db(\"ADS_507_Supermarket\")\n", "\n", "print(\"ADS-507 Supermarket Database Created\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Tables_in_ads_507_supermarket
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Tables_in_ads_507_supermarket]\n", "Index: []" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_sql(\"SHOW TABLES\", mysql_conn)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Orders Table Created\n" ] } ], "source": [ "create_order_table = \"\"\"\n", " CREATE TABLE IF NOT EXISTS orders(\n", " Order_Id VARCHAR(100) NOT NULL,\n", " Company_Id VARCHAR(100) NOT NULL,\n", " Company_Name VARCHAR(255),\n", " Date DATE,\n", " Order_Value SMALLINT,\n", " Converted TINYINT UNSIGNED,\n", " PRIMARY KEY (Order_Id, Company_Id),\n", " INDEX (Company_Id),\n", " INDEX (Order_Id)\n", " )\n", ";\n", "\"\"\"\n", "\n", "mysql_conn.cursor().execute(create_order_table);\n", "print(\"Orders Table Created\")" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Invoice Table Created\n" ] } ], "source": [ "create_invoice_table = \"\"\"\n", "CREATE TABLE IF NOT EXISTS invoice (\n", " Order_Id VARCHAR(100) NOT NULL,\n", " Date DATE NOT NULL,\n", " Meal_Id VARCHAR(100) NOT NULL, \n", " Company_Id VARCHAR(100) NOT NULL,\n", " Date_of_Meal DATETIME NOT NULL,\n", " Participants VARCHAR(255),\n", " Meal_Price SMALLINT,\n", " Type_of_Meal ENUM('Breakfast', 'Lunch', 'Dinner'),\n", " Part_of_Day ENUM('Early Morning', 'Late Morning', 'Early Afternoon', 'Night', 'Late Night'),\n", " Number_of_Participants TINYINT,\n", " PRIMARY KEY (Order_Id, Date),\n", " FOREIGN KEY (Order_Id) REFERENCES orders(Order_Id),\n", " FOREIGN KEY (Company_Id) REFERENCES orders(Company_Id),\n", " INDEX (Date),\n", " UNIQUE INDEX unique_order_id (Order_Id)\n", " )\n", ";\n", "\"\"\"\n", "\n", "mysql_conn.cursor().execute(create_invoice_table);\n", "print(\"Invoice Table Created\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sales Table Created\n" ] } ], "source": [ "# Foreign key added on company_id to link the salesteam to the orders table\n", "create_salesteam_table = \"\"\"\n", " CREATE TABLE IF NOT EXISTS salesteam(\n", " Sales_Rep VARCHAR(255),\n", " Sales_Rep_Id VARCHAR(100),\n", " Company_Name VARCHAR(255),\n", " Company_Id VARCHAR(100),\n", " FOREIGN KEY (Company_Id) REFERENCES orders(Company_Id),\n", " INDEX (Sales_Rep),\n", " INDEX (Company_Name)\n", " )\n", "\"\"\"\n", "\n", "mysql_conn.cursor().execute(create_salesteam_table);\n", "print(\"Sales Table Created\")" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Customer_Order Table Created\n" ] } ], "source": [ "create_customerorder_table = \"\"\"\n", " CREATE TABLE IF NOT EXISTS customer_order(\n", " Order_Id VARCHAR(100),\n", " Participant_Name VARCHAR(255),\n", " Customer_Id VARCHAR(255),\n", " Last_Updated DATE,\n", " FOREIGN KEY (Order_Id) REFERENCES orders(Order_Id),\n", " INDEX (Participant_Name)\n", " )\n", "\"\"\"\n", "\n", "mysql_conn.cursor().execute(create_customerorder_table);\n", "print(\"Customer_Order Table Created\")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "# Create Engine to write to SQL table\n", "engine = create_engine(\n", " \"mysql+pymysql://{user}:{pw}@{host}/{db}\".format(\n", " host = \"localhost\", \n", " db = \"ADS_507_Supermarket\", \n", " user = mysql_username, \n", " pw = mysql_password))" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Load Dataframes as Tables into MySQL\n", "* Orders\n", "* Invoice\n", "* Sales Lead\n", "* Customer (cust_order_df)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Successfully added data to orders table\n" ] } ], "source": [ "# Load orders datafraome to SQL table\n", "for i, df_row in orderleads_df.iterrows():\n", " row_value = \"\"\"\n", " INSERT INTO ADS_507_Supermarket.orders VALUES (\n", " %s, %s, %s, %s, %s, %s)\n", " \"\"\"\n", " mysql_conn.cursor().execute(row_value, tuple(df_row))\n", "\n", "print(\"Successfully added data to orders table\")" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Successfully added data to invoice table\n" ] } ], "source": [ "# Load invoice dataframe to SQL table\n", "for i, df_row in invoice_df.iterrows():\n", " row_value = \"\"\"\n", " INSERT INTO ADS_507_Supermarket.invoice VALUES (\n", " %s, %s, %s, %s, %s,\n", " %s, %s, %s, %s, %s)\n", " \"\"\"\n", " mysql_conn.cursor().execute(row_value, tuple(df_row))\n", "\n", "print(\"Successfully added data to invoice table\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Successfully added data to salesteam table\n" ] } ], "source": [ "# Load salesteam datafraome to SQL table\n", "for i, df_row in salesteam_df.iterrows():\n", " row_value = \"\"\"\n", " INSERT INTO ADS_507_Supermarket.salesteam VALUES (\n", " %s, %s, %s, %s)\n", " \"\"\"\n", " mysql_conn.cursor().execute(row_value, tuple(df_row))\n", "\n", "print(\"Successfully added data to salesteam table\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Successfully added data to customer_order table\n" ] } ], "source": [ "# Load customer_order datafraome to SQL table\n", "for i, df_row in cust_order_df.iterrows():\n", " row_value = \"\"\"\n", " INSERT INTO ADS_507_Supermarket.customer_order VALUES (\n", " %s, %s, %s, %s)\n", " \"\"\"\n", " mysql_conn.cursor().execute(row_value, tuple(df_row))\n", "\n", "print(\"Successfully added data to customer_order table\")" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### MySQL Transformations\n", "* **Views** (listed alphabetically):\n", " - Average Meal Price: Average meal price by type of meal\n", " - Average Participants: Average number of participants by meal type\n", " - Company Metrics: For each company, the total amount and average amount of each invoice monthly are shown for each meal (and displaying their respective meal). In addition, the year-to-date amount collected and yearly total are presented.\n", " - Customer Purchases: Customer_Name, Part_of_Day, Company_Name, Number_of_Purchases, Total_Spent\n", " - Customer Stats: total number of orders by each customer, total amount each customer spent, and the average amount each spent\n", " - Difference Days: Difference in days between the date of meal and date the order was placed\n", " - Percent Converted: Shows the number of orders for every company and the total converted (as a sum and proportion) and not converted to an order, as a sum.\n", " - Sales by Year: Number of invoices each year\n", " - Sales Rep Performance: Sales_Rep, Sales_Rep_Id, Company_Name, Company_Id, Profit_by_Sales_Rep\n", " - Total Sales: Total sales by type of meal price for each year" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "customer_stats view created\n" ] } ], "source": [ "customer_stats = \"\"\"\n", " CREATE VIEW customer_stats\n", " (customer_name, number_of_order, total_spent, average_spent)\n", " AS\n", " SELECT co.Participant_Name , COUNT(*), SUM(i.Meal_Price), AVG(i.Meal_Price)\n", " FROM customer_order AS co\n", " INNER JOIN invoice AS i\n", " ON co.Order_Id = i.Order_Id\n", " GROUP BY co.Participant_Name;\n", "\"\"\"\n", "\n", "mysql_conn.cursor().execute(customer_stats)\n", "print(\"customer_stats view created\")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_namenumber_of_ordertotal_spentaverage_spent
0Matthew Jackson156801.0453.4
1Barbara Tanaka3189.063.0
2Amy Bufford103610.0361.0
\n", "
" ], "text/plain": [ " customer_name number_of_order total_spent average_spent\n", "0 Matthew Jackson 15 6801.0 453.4\n", "1 Barbara Tanaka 3 189.0 63.0\n", "2 Amy Bufford 10 3610.0 361.0" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_sql(\"SELECT * FROM customer_stats LIMIT 3;\", mysql_conn)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "company_metrics view created\n" ] } ], "source": [ "company_metrics = \"\"\"\n", " CREATE VIEW company_metrics\n", " (company_name, year, month, \n", " meal_typ_sale, monthly_total,\n", " monthly_average, number_of_sales, year_to_date, yearly_total)\n", " AS\n", " SELECT o.Company_Name, YEAR(i.Date) AS Year, MONTHNAME(i.Date) AS Month,\n", " i.Type_of_Meal,\n", " SUM(i.Meal_Price) AS Monthly_Total,\n", " AVG(i.Meal_Price) AS Monthly_Average,\n", " COUNT(*) AS Number_of_Sales,\n", " SUM(SUM(i.Meal_Price))\n", " OVER (PARTITION BY o.Company_Name, Year(i.Date)\n", " ORDER BY MONTH(STR_TO_DATE(month, '%M'))\n", " ROWS UNBOUNDED PRECEDING) AS Year_to_Date,\n", " SUM(i.Meal_Price)\n", " OVER (PARTITION BY o.Company_Name, YEAR(i.Date)) AS Yearly_Total\n", " FROM orders AS o\n", " INNER JOIN invoice AS i\n", " ON o.Order_Id = i.Order_Id\n", " GROUP BY 1, 2, 3\n", " ORDER BY 1, 2, MONTH(STR_TO_DATE(month, '%M'))\n", "\"\"\"\n", "\n", "mysql_conn.cursor().execute(company_metrics)\n", "print(\"company_metrics view created\")" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
company_nameyearmonthmeal_typ_salemonthly_totalmonthly_averagenumber_of_salesyear_to_dateyearly_total
0'48 Wills2014FebruaryDinner789.0789.01789.01173.0
1'48 Wills2014MarchDinner384.0384.011173.01173.0
2'48 Wills2017FebruaryDinner216.0216.01216.0216.0
\n", "
" ], "text/plain": [ " company_name year month meal_typ_sale monthly_total monthly_average \\\n", "0 '48 Wills 2014 February Dinner 789.0 789.0 \n", "1 '48 Wills 2014 March Dinner 384.0 384.0 \n", "2 '48 Wills 2017 February Dinner 216.0 216.0 \n", "\n", " number_of_sales year_to_date yearly_total \n", "0 1 789.0 1173.0 \n", "1 1 1173.0 1173.0 \n", "2 1 216.0 216.0 " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_sql(\"SELECT * FROM company_metrics LIMIT 3;\", mysql_conn)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sales_rep_performance view created\n" ] } ], "source": [ "sales_rep_performance = \"\"\"\n", " CREATE VIEW sales_rep_performance\n", " (Sales_Rep, Sales_Rep_Id, Company_Name,\n", " Company_Id, Profit_by_Sales_Rep,\n", " Min_Sale, Max_Sale)\n", " AS\n", " SELECT s.Sales_Rep AS Sales_Rep, \n", " s.Sales_Rep_Id AS Sales_Rep_Id,\n", " s.Company_Name AS Company_Name,\n", " s.Company_Id AS Company_Id,\n", " SUM(i.Meal_Price) AS Profit_by_Sales_Rep,\n", " MIN(i.Meal_Price) AS Min_Sale,\n", " MAX(i.Meal_Price) AS Max_Sale\n", " FROM salesteam AS s\n", " INNER JOIN orders AS o\n", " ON s.Company_Id = o.Company_Id\n", " INNER JOIN invoice AS i\n", " ON o.Order_Id = i.Order_Id\n", " GROUP BY s.Sales_Rep, s.Company_Name\n", "\"\"\"\n", "\n", "mysql_conn.cursor().execute(sales_rep_performance)\n", "print(\"sales_rep_performance view created\")" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sales_RepSales_Rep_IdCompany_NameCompany_IdProfit_by_Sales_RepMin_SaleMax_Sale
0Jessie Mcallister97UNNAT790E0WM4NChimera-Chasing CasbahLJKS5NK6788CYMUU2128.014690
1Jessie Mcallister97UNNAT790E0WM4NTangential Sheds36MFTZOYMTAJP1RK6378.027829
2Jessie Mcallister97UNNAT790E0WM4NTwo-Mile GrabH3JRC7XX7WJAD4ZO3818.098664
\n", "
" ], "text/plain": [ " Sales_Rep Sales_Rep_Id Company_Name \\\n", "0 Jessie Mcallister 97UNNAT790E0WM4N Chimera-Chasing Casbah \n", "1 Jessie Mcallister 97UNNAT790E0WM4N Tangential Sheds \n", "2 Jessie Mcallister 97UNNAT790E0WM4N Two-Mile Grab \n", "\n", " Company_Id Profit_by_Sales_Rep Min_Sale Max_Sale \n", "0 LJKS5NK6788CYMUU 2128.0 14 690 \n", "1 36MFTZOYMTAJP1RK 6378.0 27 829 \n", "2 H3JRC7XX7WJAD4ZO 3818.0 98 664 " ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_sql(\"SELECT * FROM sales_rep_performance LIMIT 3;\", mysql_conn)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "customer_purchases view created\n" ] } ], "source": [ "customer_purchases = \"\"\"\n", " CREATE VIEW customer_purchases\n", " (Customer_Name, Meal_Type, Part_of_Day, Company_Name,\n", " Number_of_Purchases, Total_Spent, Avg_Spent_per_Meal)\n", " AS\n", " SELECT c.Participant_Name, i.Type_of_Meal, i.Part_of_Day,\n", " s.Company_Name, COUNT(i.Order_Id) AS Num_Purchases,\n", " SUM(i.Meal_Price) AS Meal_Price, \n", " AVG(i.Meal_Price) AS Avg_Spent\n", " FROM invoice AS i\n", " INNER JOIN customer_order AS c\n", " ON i.Order_Id = c.Order_Id\n", " INNER JOIN salesteam AS s\n", " ON s.Company_Id = i.Company_Id\n", " GROUP BY 1,2\n", "\"\"\"\n", "\n", "mysql_conn.cursor().execute(customer_purchases)\n", "print(\"customer_purchases view created\")" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Customer_NameMeal_TypPart_of_DayCompany_NameNumber_of_PurchasesTotal_SpentAvg_Spent_per_Meal
0Karen StansellLunchEarly AfternoonChimera-Chasing Casbah1314.0314.0000
1Susan GuerreroDinnerNightChimera-Chasing Casbah2195.097.5000
2David BishopDinnerNightChimera-Chasing Casbah3217.072.3333
\n", "
" ], "text/plain": [ " Customer_Name Meal_Typ Part_of_Day Company_Name \\\n", "0 Karen Stansell Lunch Early Afternoon Chimera-Chasing Casbah \n", "1 Susan Guerrero Dinner Night Chimera-Chasing Casbah \n", "2 David Bishop Dinner Night Chimera-Chasing Casbah \n", "\n", " Number_of_Purchases Total_Spent Avg_Spent_per_Meal \n", "0 1 314.0 314.0000 \n", "1 2 195.0 97.5000 \n", "2 3 217.0 72.3333 " ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_sql(\"SELECT * FROM customer_purchases LIMIT 3;\", mysql_conn)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "sales_by_year = \"\"\"\n", " CREATE VIEW sales_by_year\n", " AS\n", " SELECT YEAR(Date) AS Year, COUNT(*) AS Total_Invoices \n", " FROM invoice \n", " GROUP BY Year;\n", "\"\"\"\n", "mysql_conn.cursor().execute(sales_by_year);" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
YearTotal_Invoices
0201410069
1201510039
220169857
320179937
4201810115
\n", "
" ], "text/plain": [ " Year Total_Invoices\n", "0 2014 10069\n", "1 2015 10039\n", "2 2016 9857\n", "3 2017 9937\n", "4 2018 10115" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_sql(\"SELECT * FROM sales_by_year\", mysql_conn)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "percent_converted = \"\"\"\n", " CREATE VIEW percent_converted\n", " AS\n", " SELECT Company_Name, \n", " COUNT(Converted) AS Converted_Total,\n", " SUM(IF(Converted = '1', Converted, 0)) AS Converted_to_Order,\n", " (COUNT(Converted) - SUM(IF(Converted = '1', Converted, 0))) AS Not_Converted,\n", " ROUND(((SUM(Converted)/Count(*))*100), 2) AS Percent_Converted\n", " FROM orders \n", " GROUP BY Company_Name;\n", "\"\"\"\n", "mysql_conn.cursor().execute(percent_converted);" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Company_NameConverted_TotalConverted_to_OrderNot_ConvertedPercent_Converted
0Flashlight-Type Escape137.06.053.85
1Nonparticulate Earthmoving137.06.053.85
2Uninterrupted Dining158.07.053.33
3Free-Burning Luke'S179.08.052.94
4Covert Hint126.06.050.00
\n", "
" ], "text/plain": [ " Company_Name Converted_Total Converted_to_Order \\\n", "0 Flashlight-Type Escape 13 7.0 \n", "1 Nonparticulate Earthmoving 13 7.0 \n", "2 Uninterrupted Dining 15 8.0 \n", "3 Free-Burning Luke'S 17 9.0 \n", "4 Covert Hint 12 6.0 \n", "\n", " Not_Converted Percent_Converted \n", "0 6.0 53.85 \n", "1 6.0 53.85 \n", "2 7.0 53.33 \n", "3 8.0 52.94 \n", "4 6.0 50.00 " ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_query = \"\"\"\n", " SELECT *\n", " FROM percent_converted\n", " ORDER BY Percent_Converted DESC\n", " LIMIT 5;\n", "\"\"\"\n", "\n", "pd.read_sql(test_query, mysql_conn)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "# avg_meal_price: Average meal price\n", "avg_meal_price = \"\"\"\n", " CREATE VIEW avg_meal_price \n", " AS\n", " SELECT Type_of_Meal,\n", " ROUND(AVG(Meal_Price),2) as Average_Meal_Price\n", " FROM invoice\n", " GROUP BY Type_of_Meal;\n", " \"\"\"\n", " \n", "mysql_conn.cursor().execute(avg_meal_price);" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Type_of_MealAverage_Meal_Price
0Lunch405.09
1Breakfast404.63
2Dinner400.07
\n", "
" ], "text/plain": [ " Type_of_Meal Average_Meal_Price\n", "0 Lunch 405.09\n", "1 Breakfast 404.63\n", "2 Dinner 400.07" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_query = \"\"\"\n", " SELECT * \n", " FROM avg_meal_price\n", " ORDER BY Average_Meal_Price DESC;\n", " \"\"\"\n", "\n", "pd.read_sql(test_query, mysql_conn)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "# total_sales: Total sales by type of meal price by year\n", "total_sales = \"\"\"\n", " CREATE VIEW total_sales \n", " AS\n", " SELECT YEAR(Date) AS Year, \n", " Type_of_Meal, \n", " SUM(Meal_Price) as Total_Sales \n", " FROM invoice\n", " GROUP BY Year, Type_of_Meal \n", " ORDER BY Year ASC, \n", " FIELD(Type_of_Meal,'Breakfast','Lunch','Dinner')\n", " ;\n", " \"\"\"\n", " \n", "mysql_conn.cursor().execute(total_sales);" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
YearType_of_MealTotal_Sales
02014Breakfast1331066.0
12014Lunch1365269.0
22014Dinner1338395.0
32015Breakfast1360415.0
42015Lunch1336229.0
52015Dinner1335497.0
62016Breakfast1342038.0
72016Lunch1353928.0
82016Dinner1318863.0
92017Breakfast1348430.0
102017Lunch1355628.0
112017Dinner1313810.0
122018Breakfast1360323.0
132018Lunch1322318.0
142018Dinner1387482.0
\n", "
" ], "text/plain": [ " Year Type_of_Meal Total_Sales\n", "0 2014 Breakfast 1331066.0\n", "1 2014 Lunch 1365269.0\n", "2 2014 Dinner 1338395.0\n", "3 2015 Breakfast 1360415.0\n", "4 2015 Lunch 1336229.0\n", "5 2015 Dinner 1335497.0\n", "6 2016 Breakfast 1342038.0\n", "7 2016 Lunch 1353928.0\n", "8 2016 Dinner 1318863.0\n", "9 2017 Breakfast 1348430.0\n", "10 2017 Lunch 1355628.0\n", "11 2017 Dinner 1313810.0\n", "12 2018 Breakfast 1360323.0\n", "13 2018 Lunch 1322318.0\n", "14 2018 Dinner 1387482.0" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_query = \"\"\"SELECT * FROM total_sales;\"\"\"\n", "pd.read_sql(test_query, mysql_conn)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Type_of_MealAverage_Participants_Per_Meal
0Lunch1.6678
1Breakfast1.6660
2Dinner1.6693
\n", "
" ], "text/plain": [ " Type_of_Meal Average_Participants_Per_Meal\n", "0 Lunch 1.6678\n", "1 Breakfast 1.6660\n", "2 Dinner 1.6693" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# avg_participants: Average amount of participants for each type of meal\n", "avg_participants = \"\"\"\n", " CREATE VIEW avg_participants \n", " AS\n", " SELECT Type_of_Meal, \n", " AVG(Number_of_Participants) as Average_Participants_Per_Meal\n", " FROM invoice\n", " GROUP BY Type_of_Meal\"\"\"\n", " \n", "mysql_conn.cursor().execute(avg_participants)\n", "\n", "test_query = \"\"\"SELECT * FROM avg_participants;\"\"\"\n", "pd.read_sql(test_query, mysql_conn)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "# difference_days: Difference in days between Date_of_Meal and Date\n", "difference_days = \"\"\"\n", " CREATE VIEW difference_days \n", " AS\n", " SELECT Date, \n", " Date_of_Meal, \n", " DATEDIFF(Date, Date_of_Meal) AS Days_Between\n", " FROM invoice\n", " ;\n", " \"\"\"\n", " \n", "mysql_conn.cursor().execute(difference_days);" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DateDate_of_MealDays_Between
02014-03-212014-03-17 12:00:004
12016-04-022016-03-28 12:00:005
\n", "
" ], "text/plain": [ " Date Date_of_Meal Days_Between\n", "0 2014-03-21 2014-03-17 12:00:00 4\n", "1 2016-04-02 2016-03-28 12:00:00 5" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_query = \"\"\"\n", " SELECT * \n", " FROM difference_days \n", " LIMIT 2;\n", " \"\"\"\n", "\n", "pd.read_sql(test_query, mysql_conn)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Tables_in_ads_507_supermarket
0avg_meal_price
1avg_participants
2company_metrics
3customer_order
4customer_purchases
5customer_stats
6difference_days
7invoice
8orders
9percent_converted
10sales_by_year
11sales_rep_performance
12salesteam
13total_sales
\n", "
" ], "text/plain": [ " Tables_in_ads_507_supermarket\n", "0 avg_meal_price\n", "1 avg_participants\n", "2 company_metrics\n", "3 customer_order\n", "4 customer_purchases\n", "5 customer_stats\n", "6 difference_days\n", "7 invoice\n", "8 orders\n", "9 percent_converted\n", "10 sales_by_year\n", "11 sales_rep_performance\n", "12 salesteam\n", "13 total_sales" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_query = \"\"\"\n", " show tables;\n", "\"\"\"\n", "\n", "pd.read_sql(test_query, mysql_conn)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline Completed!\n" ] } ], "source": [ "print(\"Pipeline Completed!\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.1" }, "vscode": { "interpreter": { "hash": "54be5dcaef3355b8195a69b65ef5b9c5e7ba5f5a12795eca27b7fa46b0f134bb" } } }, "nbformat": 4, "nbformat_minor": 2 }