{ "cells": [ { "cell_type": "markdown", "metadata": { "inputHidden": false, "outputHidden": false }, "source": [ "# Profiler performance\n", "\n", "We use part of the Instacart data set, which you can find here: https://www.instacart.com/datasets/grocery-shopping-2017\n", "\n", "Specifically, order_products__prior.csv, a CSV file with 4 columns and 33.2 million rows.\n", "\n", "Before 2.2.10\n", "It took 355.58 seconds to process the whole Instacart data set on a Windows 10 machine.\n", "\n", "After 2.2.10\n", "It took 78 seconds with infer == False.\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.append(\"..\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\Anaconda3\\lib\\site-packages\\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", " from collections import Callable\n", "\n", " You are using PySparkling of version 2.4.10, but your PySpark is of\n", " version 2.3.1. Please make sure Spark and PySparkling versions are compatible. 
\n", "INFO:optimus:Operative System:Windows\n", "INFO:optimus:Just check that Spark and all necessary environments vars are present...\n", "INFO:optimus:-----\n", "INFO:optimus:SPARK_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", "INFO:optimus:HADOOP_HOME=C:\\opt\\hadoop-2.7.7\n", "INFO:optimus:PYSPARK_PYTHON=C:\\Users\\argenisleon\\Anaconda3\\python.exe\n", "INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter\n", "INFO:optimus:PYSPARK_SUBMIT_ARGS=--jars \"file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar\" --driver-class-path \"C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar\" --conf \"spark.sql.catalogImplementation=hive\" pyspark-shell\n", "INFO:optimus:JAVA_HOME=C:\\java\n", "INFO:optimus:Pyarrow Installed\n", "INFO:optimus:-----\n", "INFO:optimus:Starting or getting SparkSession and SparkContext...\n", "INFO:optimus:Spark Version:2.3.1\n", "INFO:optimus:Setting checkpoint folder local. If you are in a cluster initialize Optimus with master='your_ip' as param\n", "INFO:optimus:Deleting previous folder if exists...\n", "INFO:optimus:Creating the checkpoint directory...\n", "INFO:optimus:\n", " ____ __ _ \n", " / __ \\____ / /_(_)___ ___ __ _______\n", " / / / / __ \\/ __/ / __ `__ \\/ / / / ___/\n", " / /_/ / /_/ / /_/ / / / / / / /_/ (__ ) \n", " \\____/ .___/\\__/_/_/ /_/ /_/\\__,_/____/ \n", " /_/ \n", " \n", "INFO:optimus:Transform and Roll out...\n", "INFO:optimus:Optimus successfully imported. 
Have fun :).\n", "INFO:optimus:Config.ini not found\n" ] } ], "source": [ "# Create optimus\n", "from optimus import Optimus\n", "op = Optimus(master=\"local[*]\", app_name = \"optimus\" ,verbose =True, checkpoint= True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Benchmark " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df = op.load.csv(\"C:\\\\Users\\\\argenisleon\\\\Desktop\\\\order_products__prior.csv\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
| \n",
" order_id \n",
" 1 (int) \n",
" \n",
" \n",
" nullable\n",
" \n",
" \n",
" | \n",
" \n",
" \n",
" product_id \n",
" 2 (int) \n",
" \n",
" \n",
" nullable\n",
" \n",
" \n",
" | \n",
" \n",
" \n",
" add_to_cart_order \n",
" 3 (int) \n",
" \n",
" \n",
" nullable\n",
" \n",
" \n",
" | \n",
" \n",
" \n",
" reordered \n",
" 4 (int) \n",
" \n",
" \n",
" nullable\n",
" \n",
" \n",
" | \n",
" \n",
"
|---|---|---|---|
| \n",
" 2\n",
" \n",
" | \n",
" \n",
" \n",
" 33120\n",
" \n",
" | \n",
" \n",
" \n",
" 1\n",
" \n",
" | \n",
" \n",
" \n",
" 1\n",
" \n",
" | \n",
" \n",
"
| \n",
" 2\n",
" \n",
" | \n",
" \n",
" \n",
" 28985\n",
" \n",
" | \n",
" \n",
" \n",
" 2\n",
" \n",
" | \n",
" \n",
" \n",
" 1\n",
" \n",
" | \n",
" \n",
"
| \n",
" 2\n",
" \n",
" | \n",
" \n",
" \n",
" 9327\n",
" \n",
" | \n",
" \n",
" \n",
" 3\n",
" \n",
" | \n",
" \n",
" \n",
" 0\n",
" \n",
" | \n",
" \n",
"
| \n",
" 2\n",
" \n",
" | \n",
" \n",
" \n",
" 45918\n",
" \n",
" | \n",
" \n",
" \n",
" 4\n",
" \n",
" | \n",
" \n",
" \n",
" 1\n",
" \n",
" | \n",
" \n",
"
| \n",
" 2\n",
" \n",
" | \n",
" \n",
" \n",
" 30035\n",
" \n",
" | \n",
" \n",
" \n",
" 5\n",
" \n",
" | \n",
" \n",
" \n",
" 0\n",
" \n",
" | \n",
" \n",
"
| \n",
" 2\n",
" \n",
" | \n",
" \n",
" \n",
" 17794\n",
" \n",
" | \n",
" \n",
" \n",
" 6\n",
" \n",
" | \n",
" \n",
" \n",
" 1\n",
" \n",
" | \n",
" \n",
"
| \n",
" 2\n",
" \n",
" | \n",
" \n",
" \n",
" 40141\n",
" \n",
" | \n",
" \n",
" \n",
" 7\n",
" \n",
" | \n",
" \n",
" \n",
" 1\n",
" \n",
" | \n",
" \n",
"
| \n",
" 2\n",
" \n",
" | \n",
" \n",
" \n",
" 1819\n",
" \n",
" | \n",
" \n",
" \n",
" 8\n",
" \n",
" | \n",
" \n",
" \n",
" 1\n",
" \n",
" | \n",
" \n",
"
| \n",
" 2\n",
" \n",
" | \n",
" \n",
" \n",
" 43668\n",
" \n",
" | \n",
" \n",
" \n",
" 9\n",
" \n",
" | \n",
" \n",
" \n",
" 0\n",
" \n",
" | \n",
" \n",
"
| \n",
" 3\n",
" \n",
" | \n",
" \n",
" \n",
" 33754\n",
" \n",
" | \n",
" \n",
" \n",
" 1\n",
" \n",
" | \n",
" \n",
" \n",
" 1\n",
" \n",
" | \n",
" \n",
"