{ "cells": [
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Reload edited modules automatically before each cell runs.\n",
  "%load_ext autoreload\n",
  "%autoreload 2" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "import sys\n",
  "\n",
  "# Put the parent directory first on the import path.\n",
  "# NOTE(review): presumably so the local `optimus` checkout is the one imported - confirm.\n",
  "sys.path.insert(0, \"..\")" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [
  "from optimus import Optimus" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Start Optimus on the dask engine with the cluster settings given below.\n",
  "op = Optimus(\"dask\", n_workers=1, threads_per_worker=8, processes=False, memory_limit=\"3G\")" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Load the crime dataset (path is relative to the notebook) and cache it for the cells below.\n",
  "df = op.load.file(\"data/crime.csv\").cache()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Materialise the frame with .compute(); later cells index `pdf` by column name.\n",
  "pdf = df.compute()" ] },
 { "cell_type": "code",
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "def case1(self, df, col_name, conditions, default):\n", " \n", " _when = list(conditions.keys())\n", " _values = list(conditions.values())\n", " _else_ = default\n", "\n", " return np.select(_when, _values, default=_else_)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "case(df)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "ename": "SyntaxError", "evalue": "invalid syntax (, line 3)", "output_type": "error", "traceback": [ "\u001b[1;36m File \u001b[1;32m\"\"\u001b[1;36m, line \u001b[1;32m3\u001b[0m\n\u001b[1;33m x,y for i,j in a:\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n" ] } ], "source": [ "a = [(\"cond1\", \"value1\"), (\"cond2\", \"value2\")]\n", "\n", "x,y for i,j in a:\n", " " ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "class Case:\n", " def __init__(self, df):\n", " self.df = df\n", " self._when = []\n", " self._values = []\n", " self._else_ = []\n", " \n", " def case(value):\n", " return value\n", "\n", " def when(self, expr, value):\n", " self._when.append(expr)\n", " self._values.append(value)\n", " return self\n", "\n", " def else_(self, value):\n", " self._else_.append(value)\n", " return self\n", "\n", " def end(self):\n", " return np.select(self._when, self._values, default=self._else_)\n", "# return value\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for ()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# {\"when\":[cond1, cond2], \"values\":[\"Easy\",\"Medium\"], default:\"Unknown\"}\n", "\n", "\n", "condition = [\n", " (cond1, value1)\n", " \n", "]\n", "\n", "df.cols.case(\"name\",\n", " conditions, \n", " default=\"Unknown\")" ] 
}, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [], "source": [ "#case\n", "cond1 = pdf['INCIDENT_NUMBER']==\"I182070945\"\n", "cond2 = pdf['INCIDENT_NUMBER']==\"I182070943\"\n", "\n", "pdf[\"name\"] = pdf[\"name\"].case(pdf['INCIDENT_NUMBER']).\n", " .when(cond1, \"Easy\")\n", " .when(cond2, \"Medium\")\n", " .else_(\"Unknown\")\n", " .end()\n", " \n", "\n", "#if" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Easy', 'Medium', 'Unknown', ..., 'Unknown', 'Unknown', 'Unknown'],\n", " dtype='