{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## This will initialize Spark and will give you a link to open Bumblebee" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "..\\optimus\\helpers\\functions.py:170: DeprecationWarning: invalid escape sequence \\d\n", " pattern = '\\\"(\\d+\\.\\d+).*\\\"'\n", "C:\\Users\\argenisleon\\Anaconda3\\lib\\site-packages\\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", " from collections import Callable\n", "\n", " You are using PySparkling of version 2.4.10, but your PySpark is of\n", " version 2.3.1. Please make sure Spark and PySparkling versions are compatible. \n", "`formatargspec` is deprecated since Python 3.5. Use `signature` and the `Signature` object directly\n", "invalid escape sequence \\d\n" ] }, { "data": { "text/html": [ "Your connection keys are in bumblebee.ini. If you really care about privacy get your keys and put them here. If you are testing just call bumblebee" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# op= Optimus(master=\"local[*]\",comm=True)\n", "%load_ext autoreload\n", "%autoreload 2\n", "import sys\n", "sys.path.append(\"..\")\n", "from optimus import Optimus\n", "\n", "# Create optimus\n", "op = Optimus(master=\"local\", app_name= \"optimus\", comm=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "df_small= op.load.csv(\"data/foo.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### With send() the data will be displayed in Bumblebee" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'stddev' function in 'product' column is returning 'nan'. Is that what you expected?. 
Seems that 'product' has 'nan' values\n", "'kurtosis' function in 'product' column is returning 'nan'. Is that what you expected?. Seems that 'product' has 'nan' values\n", "'skewness' function in 'product' column is returning 'nan'. Is that what you expected?. Seems that 'product' has 'nan' values\n", "'variance' function in 'product' column is returning 'nan'. Is that what you expected?. Seems that 'product' has 'nan' values\n", "Send!\n" ] } ], "source": [ "df_small.send(\"small\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Send a bigger dataframe" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "df = op.load.csv(\"data/crime.csv\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'stddev' function in 'INCIDENT_NUMBER' column is returning 'nan'. Is that what you expected?. Seems that 'INCIDENT_NUMBER' has 'nan' values\n", "'kurtosis' function in 'INCIDENT_NUMBER' column is returning 'nan'. Is that what you expected?. Seems that 'INCIDENT_NUMBER' has 'nan' values\n", "'skewness' function in 'INCIDENT_NUMBER' column is returning 'nan'. Is that what you expected?. Seems that 'INCIDENT_NUMBER' has 'nan' values\n", "'variance' function in 'INCIDENT_NUMBER' column is returning 'nan'. Is that what you expected?. Seems that 'INCIDENT_NUMBER' has 'nan' values\n", "'kurtosis' function in 'STREET' column is returning 'nan'. Is that what you expected?. Seems that 'STREET' has 'nan' values\n", "'skewness' function in 'STREET' column is returning 'nan'. Is that what you expected?. 
Seems that 'STREET' has 'nan' values\n", "Send!\n", "Wall time: 2min 51s\n" ] } ], "source": [ "%%time\n", "df.send(\"crime\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get data from MySQL and send it to Bumblebee" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 0 ns\n" ] } ], "source": [ "%%time\n", "# Put your db credentials here\n", "db = op.connect(\n", " db_type=\"mysql\",\n", " host=\"165.227.196.70\", \n", " database= \"optimus\", \n", " user= \"test\", \n", " password = \"test\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check the tables in the database" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 1 of 1 rows / 2 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
TABLE_NAME
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
TABLE_ROWS
\n", "
2 (decimal(20,0))
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " test_data\n", " \n", "
\n", "
\n", "
\n", " \n", " 100\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 1 of 1 rows / 2 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "db.tables()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100 rows\n" ] } ], "source": [ "df_db = db.table_to_df(\"test_data\",limit=\"all\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Send!\n" ] } ], "source": [ "df_db.send(\"db\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 4 }