{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.append(\"/home/argenisleon/Optimus/\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from optimus import Optimus" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/argenisleon/.conda/envs/rapids_blazing/lib/python3.7/site-packages/numba/cuda/envvars.py:17: NumbaWarning: \u001b[1m\n", "Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.\n", "\n", "For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')\u001b[0m\n", " warnings.warn(errors.NumbaWarning(msg))\n", "/home/argenisleon/.conda/envs/rapids_blazing/lib/python3.7/site-packages/numba/cuda/envvars.py:17: NumbaWarning: \u001b[1m\n", "Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice.\n", "\n", "For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')\u001b[0m\n", " warnings.warn(errors.NumbaWarning(msg))\n" ] }, { "data": { "text/html": [ "Open Bumblebee: https://app.hi-bumblebee.com
If you really care about privacy get your keys in bumblebee.ini and put them here
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:Just check that Spark and all necessary environments vars are present...\n", "INFO:-----\n", "INFO:Starting or setting Dask Client...\n", "INFO:Config.ini not found\n" ] } ], "source": [ "op= Optimus(\"dask-cudf\", n_workers=4, threads_per_worker=2,\n", " processes=False, memory_limit=\"3G\", comm=True, verbose= True)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "latin1\n" ] } ], "source": [ "df = op.load.csv(\"data/crime.csv\", sep=\",\", header=True, infer_schema='false', null_value=\"None\", charset=\"latin1\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "df = df.repartition(npartitions=1)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "df = df.persist()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "INCIDENT_NUMBER object\n", "OFFENSE_CODE int64\n", "OFFENSE_CODE_GROUP object\n", "OFFENSE_DESCRIPTION object\n", "DISTRICT object\n", "REPORTING_AREA float64\n", "SHOOTING object\n", "OCCURRED_ON_DATE object\n", "YEAR int64\n", "MONTH int64\n", "DAY_OF_WEEK object\n", "HOUR int64\n", "UCR_PART object\n", "STREET object\n", "Lat float64\n", "Long float64\n", "Location object\n", "dtype: object" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "num_cols = [\"OFFENSE_CODE\",\"YEAR\", \"MONTH\", \"HOUR\"]\n", "# cc= [\"INCIDENT_NUMBER\",\"OFFENSE_CODE_GROUP\",\"OFFENSE_DESCRIPTION\",\"DISTRICT\",\"REPORTING_AREA\",\"REPORTING_AREA\",\"SHOOTING\",\"OCCURRED_ON_DATE\",\"DAY_OF_WEEK\",\"UCR_PART\",\"STREET\",\"Location\"]\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "bins = np.array([-30, 0, 3, 6, 9, 10, 19, 50])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using digitize" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 140 ms, sys: 36 ms, total: 176 ms\n", "Wall time: 1.77 s\n" ] }, { "data": { "text/plain": [ "((8 319073\n", " dtype: int32, array([-30, 0, 3, 6, 9, 10, 19, 50])), (8 319073\n", " dtype: int32, array([-30, 0, 3, 6, 9, 10, 19, 50])), (2 45263\n", " 3 74431\n", " 4 99947\n", " 5 26543\n", " 6 72889\n", " dtype: int32, array([-30, 0, 3, 6, 9, 10, 19, 50])), (2 32068\n", " 3 11308\n", " 4 26919\n", " 5 14740\n", " 6 162967\n", " 7 71071\n", " dtype: int32, array([-30, 0, 3, 6, 9, 10, 19, 50])))" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import dask\n", "import cudf\n", "\n", "# def histogram(ser, bins):\n", "# print(type(ser))\n", "# binned = ser.digitize(bins, right=False)\n", "# vc = binned.value_counts().sort_index()\n", "# return vc, cudf.Series(bins)\n", "\n", "def hist_serie(serie, buckets): \n", " def func(_serie):\n", " binned = _serie.digitize(bins, right=False)\n", " vc = binned.value_counts().sort_index()\n", " return vc, bins\n", " \n", " return dask.delayed(func)(serie)\n", "\n", "delayed_tasks =[]\n", "for num_col in num_cols:\n", " delayed_tasks.append(hist_serie(df[num_col],10))\n", "%time dask.compute(*delayed_tasks)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using Cupy" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import cupy as cp\n", "import dask\n", "\n", "\n", "def hist_serie(serie, buckets): \n", " def func(_serie):\n", " arr = cp.fromDlpack(_serie.to_dlpack())\n", " return cp.histogram(arr, buckets)\n", " \n", " return dask.delayed(func)(serie)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "((array([12600, 65983, 10820, 18794, 16752, 3015, 23563, 32552, 88665,\n", " 46329]), array([ 111., 483., 855., 1227., 1599., 1971., 2343., 2715., 3087.,\n", " 3459., 3831.])), (array([ 53388, 0, 0, 99114, 0, 0, 100886, 0,\n", " 0, 65685]), array([2015. , 2015.3, 2015.6, 2015.9, 2016.2, 2016.5, 2016.8, 2017.1,\n", " 2017.4, 2017.7, 2018. ])), (array([45263, 24146, 24086, 26199, 30568, 34556, 34823, 26543, 25737,\n", " 47152]), array([ 1. , 2.1, 3.2, 4.3, 5.4, 6.5, 7.6, 8.7, 9.8, 10.9, 12. ])), (array([32068, 7997, 8337, 36633, 32795, 35525, 53582, 41065, 33438,\n", " 37633]), array([ 0. , 2.3, 4.6, 6.9, 9.2, 11.5, 13.8, 16.1, 18.4, 20.7, 23. ])))\n", "CPU times: user 20 ms, sys: 0 ns, total: 20 ms\n", "Wall time: 43 ms\n" ] } ], "source": [ "delayed_tasks =[]\n", "for num_col in num_cols:\n", " delayed_tasks.append(hist_serie(df[num_col],10))\n", "%time print(dask.compute(*delayed_tasks))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using Optimus" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 28 ms, sys: 0 ns, total: 28 ms\n", "Wall time: 55.3 ms\n" ] }, { "data": { "text/plain": [ "{'OFFENSE_CODE': {'hist': [{'count': 169.0, 'lower': 111.0, 'upper': 297.0},\n", " {'count': 12431.0, 'lower': 297.0, 'upper': 483.0},\n", " {'count': 44188.0, 'lower': 483.0, 'upper': 669.0},\n", " {'count': 21795.0, 'lower': 669.0, 'upper': 855.0},\n", " {'count': 1548.0, 'lower': 855.0, 'upper': 1041.0},\n", " {'count': 9272.0, 'lower': 1041.0, 'upper': 1227.0},\n", " {'count': 16609.0, 'lower': 1227.0, 'upper': 1413.0},\n", " {'count': 2185.0, 'lower': 1413.0, 'upper': 1599.0},\n", " {'count': 216.0, 'lower': 1599.0, 'upper': 1785.0},\n", " {'count': 16536.0, 'lower': 1785.0, 'upper': 1971.0},\n", " {'count': 2759.0, 'lower': 1971.0, 'upper': 2157.0},\n", " {'count': 256.0, 'lower': 2157.0, 'upper': 2343.0},\n", " {'count': 2655.0, 'lower': 2343.0, 'upper': 2529.0},\n", " {'count': 20908.0, 'lower': 2529.0, 'upper': 2715.0},\n", " {'count': 2894.0, 'lower': 2715.0, 'upper': 2901.0},\n", " {'count': 29658.0, 'lower': 2901.0, 'upper': 3087.0},\n", " {'count': 63012.0, 'lower': 3087.0, 'upper': 3273.0},\n", " {'count': 25653.0, 'lower': 3273.0, 'upper': 3459.0},\n", " {'count': 9197.0, 'lower': 3459.0, 'upper': 3645.0},\n", " {'count': 37132.0, 'lower': 3645.0, 'upper': 3831.0}]},\n", " 'YEAR': {'hist': [{'count': 53388.0, 'lower': 2015.0, 'upper': 2015.15},\n", " {'count': 0.0, 'lower': 2015.15, 'upper': 2015.3},\n", " {'count': 0.0, 'lower': 2015.3, 'upper': 2015.45},\n", " {'count': 0.0, 'lower': 2015.45, 'upper': 2015.6},\n", " {'count': 0.0, 'lower': 2015.6, 'upper': 2015.75},\n", " {'count': 0.0, 'lower': 2015.75, 'upper': 2015.9},\n", " {'count': 99114.0, 'lower': 2015.9, 'upper': 2016.05},\n", " {'count': 0.0, 'lower': 2016.05, 'upper': 2016.2},\n", " {'count': 0.0, 'lower': 2016.2, 'upper': 2016.35},\n", " {'count': 0.0, 'lower': 2016.35, 'upper': 2016.5},\n", " {'count': 0.0, 'lower': 2016.5, 'upper': 2016.65},\n", " {'count': 0.0, 'lower': 2016.65, 'upper': 2016.8},\n", " {'count': 0.0, 'lower': 2016.8, 'upper': 2016.95},\n", " {'count': 100886.0, 'lower': 2016.95, 'upper': 2017.1},\n", " {'count': 0.0, 'lower': 2017.1, 'upper': 2017.25},\n", " {'count': 0.0, 'lower': 2017.25, 'upper': 2017.4},\n", " {'count': 0.0, 'lower': 2017.4, 'upper': 2017.55},\n", " {'count': 0.0, 'lower': 2017.55, 'upper': 2017.7},\n", " {'count': 0.0, 'lower': 2017.7, 'upper': 2017.85},\n", " {'count': 65685.0, 'lower': 2017.85, 'upper': 2018.0}]},\n", " 'MONTH': {'hist': [{'count': 23610.0, 'lower': 1.0, 'upper': 1.55},\n", " {'count': 21653.0, 'lower': 1.55, 'upper': 2.1},\n", " {'count': 0.0, 'lower': 2.1, 'upper': 2.6500000000000004},\n", " {'count': 24146.0, 'lower': 2.6500000000000004, 'upper': 3.2},\n", " {'count': 0.0, 'lower': 3.2, 'upper': 3.75},\n", " {'count': 24086.0, 'lower': 3.75, 'upper': 4.300000000000001},\n", " {'count': 0.0, 'lower': 4.300000000000001, 'upper': 4.8500000000000005},\n", " {'count': 26199.0, 'lower': 4.8500000000000005, 'upper': 5.4},\n", " {'count': 0.0, 'lower': 5.4, 'upper': 5.95},\n", " {'count': 30568.0, 'lower': 5.95, 'upper': 6.5},\n", " {'count': 34556.0, 'lower': 6.5, 'upper': 7.050000000000001},\n", " {'count': 0.0, 'lower': 7.050000000000001, 'upper': 7.6000000000000005},\n", " {'count': 34823.0, 'lower': 7.6000000000000005, 'upper': 8.15},\n", " {'count': 0.0, 'lower': 8.15, 'upper': 8.700000000000001},\n", " {'count': 26543.0, 'lower': 8.700000000000001, 'upper': 9.25},\n", " {'count': 0.0, 'lower': 9.25, 'upper': 9.8},\n", " {'count': 25737.0, 'lower': 9.8, 'upper': 10.350000000000001},\n", " {'count': 0.0, 'lower': 10.350000000000001, 'upper': 10.9},\n", " {'count': 23675.0, 'lower': 10.9, 'upper': 11.450000000000001},\n", " {'count': 23477.0, 'lower': 11.450000000000001, 'upper': 12.0}]},\n", " 'HOUR': {'hist': [{'count': 24375.0, 'lower': 0.0, 'upper': 1.15},\n", " {'count': 7693.0, 'lower': 1.15, 'upper': 2.3},\n", " {'count': 4589.0, 'lower': 2.3, 'upper': 3.4499999999999997},\n", " {'count': 3408.0, 'lower': 3.4499999999999997, 'upper': 4.6},\n", " {'count': 3311.0, 'lower': 4.6, 'upper': 5.75},\n", " {'count': 5026.0, 'lower': 5.75, 'upper': 6.8999999999999995},\n", " {'count': 21893.0, 'lower': 6.8999999999999995, 'upper': 8.049999999999999},\n", " {'count': 14740.0, 'lower': 8.049999999999999, 'upper': 9.2},\n", " {'count': 16347.0, 'lower': 9.2, 'upper': 10.35},\n", " {'count': 16448.0, 'lower': 10.35, 'upper': 11.5},\n", " {'count': 18679.0, 'lower': 11.5, 'upper': 12.649999999999999},\n", " {'count': 16846.0,\n", " 'lower': 12.649999999999999,\n", " 'upper': 13.799999999999999},\n", " {'count': 17189.0, 'lower': 13.799999999999999, 'upper': 14.95},\n", " {'count': 36393.0, 'lower': 14.95, 'upper': 16.099999999999998},\n", " {'count': 20763.0, 'lower': 16.099999999999998, 'upper': 17.25},\n", " {'count': 20302.0, 'lower': 17.25, 'upper': 18.4},\n", " {'count': 17588.0, 'lower': 18.4, 'upper': 19.549999999999997},\n", " {'count': 15850.0, 'lower': 19.549999999999997, 'upper': 20.7},\n", " {'count': 14111.0, 'lower': 20.7, 'upper': 21.849999999999998},\n", " {'count': 23522.0, 'lower': 21.849999999999998, 'upper': 23.0}]}}" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "df.cols.hist(num_cols)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "rapids_blazing", "language": "python", "name": "rapids_blazing" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 2 }