{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.append(\"..\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from optimus import Optimus" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] C:\\Users\\argenisleon\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", "[nltk_data] date!\n" ] } ], "source": [ "op = Optimus(\"dask\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# df = op.create.dataframe({\"name\": [\"A1\", \"B2\"]*20})\n", "df = op.load.csv(\"store.csv\", dtype=\"str\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.cols.max()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': 5, 'transaction_date': 9, 'name': 10, 'price': 18, 'discount': 3}" ] }, "execution_count": 15, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "df.cols.len(\"*\").cols.max()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Delayed('list-fa556bb4-e62a-4827-a9d3-73c09fc9ff0e')" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.min(compute=False)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': '1',\n", " 'transaction_date': '1/1/2015',\n", " 'name': 'PANTS FG00',\n", " 'price': '10',\n", " 'discount': ''}" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.min(compute=True)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 92780 rows / 5 columns
\n", "
1 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
id
\n", "
1 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
transaction_date
\n", "
2 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
name
\n", "
3 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
price
\n", "
4 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
discount
\n", "
5 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " 1/7/2019\n", " \n", "
\n", "
\n", "
\n", " \n", " PANTS-rt38\n", " \n", "
\n", "
\n", "
\n", " \n", " 260\n", " \n", "
\n", "
\n", "
\n", " \n", " 50\n", " \n", "
\n", "
\n", "
\n", " \n", " 2\n", " \n", "
\n", "
\n", "
\n", " \n", " 1/4/2019\n", " \n", "
\n", "
\n", "
\n", " \n", " PANTS-FG63\n", " \n", "
\n", "
\n", "
\n", " \n", " 82.99\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 1/5/2015\n", " \n", "
\n", "
\n", "
\n", " \n", " Pants-SG08\n", " \n", "
\n", "
\n", "
\n", " \n", " 372.99\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 4\n", " \n", "
\n", "
\n", "
\n", " \n", " 1/10/2019\n", " \n", "
\n", "
\n", "
\n", " \n", " SHIRT-St05\n", " \n", "
\n", "
\n", "
\n", " \n", " 150\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 5\n", " \n", "
\n", "
\n", "
\n", " \n", " 1/4/2017\n", " \n", "
\n", "
\n", "
\n", " \n", " shirt-Ft67\n", " \n", "
\n", "
\n", "
\n", " \n", " 52.99\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 6\n", " \n", "
\n", "
\n", "
\n", " \n", " 1/10/2018\n", " \n", "
\n", "
\n", "
\n", " \n", " shoes-FT13\n", " \n", "
\n", "
\n", "
\n", " \n", " 62.99\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 1/10/2017\n", " \n", "
\n", "
\n", "
\n", " \n", " Pants-FT46\n", " \n", "
\n", "
\n", "
\n", " \n", " 375\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 1/5/2016\n", " \n", "
\n", "
\n", "
\n", " \n", " pants-ft75\n", " \n", "
\n", "
\n", "
\n", " \n", " 77.99\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " 1/8/2018\n", " \n", "
\n", "
\n", "
\n", " \n", " SHIRT-Fg07\n", " \n", "
\n", "
\n", "
\n", " \n", " 232.99\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 1/12/2015\n", " \n", "
\n", "
\n", "
\n", " \n", " PANTS-Rt74\n", " \n", "
\n", "
\n", "
\n", " \n", " 245\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 11\n", " \n", "
\n", "
\n", "
\n", " \n", " 1/11/2018\n", " \n", "
\n", "
\n", "
\n", " \n", " PANTS-Fg76\n", " \n", "
\n", "
\n", "
\n", " \n", " 85\n", " \n", "
\n", "
\n", "
\n", " \n", " 5\n", " \n", "
\n", "
\n", "\n", "
Viewing 10 of 92780 rows / 5 columns
\n", "
1 partition(s) <class 'optimus.engines.dask.dataframe.DaskDataFrame'>
\n", "\n" ], "text/plain": [ " id transaction_date name price discount\n", " (object) (object) (object) (object) (object)\n", "---------- ------------------ ---------- ---------- ----------\n", " 1 1/7/2019 PANTS-rt38 260 50\n", " 2 1/4/2019 PANTS-FG63 82.99\n", " 3 1/5/2015 Pants-SG08 372.99\n", " 4 1/10/2019 SHIRT-St05 150\n", " 5 1/4/2017 shirt-Ft67 52.99\n", " 6 1/10/2018 shoes-FT13 62.99\n", " 7 1/10/2017 Pants-FT46 375\n", " 8 1/5/2016 pants-ft75 77.99\n", " 9 1/8/2018 SHIRT-Fg07 232.99\n", " 10 1/12/2015 PANTS-Rt74 245\n", " 11 1/11/2018 PANTS-Fg76 85 5" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'hist': {'id': [{'lower': 1.0, 'upper': 2812.4848484848485, 'count': 2812},\n", " {'lower': 2812.4848484848485, 'upper': 5623.969696969697, 'count': 2811},\n", " {'lower': 5623.969696969697, 'upper': 8435.454545454546, 'count': 2812},\n", " {'lower': 8435.454545454546, 'upper': 11246.939393939394, 'count': 2811},\n", " {'lower': 11246.939393939394, 'upper': 14058.424242424242, 'count': 2812},\n", " {'lower': 14058.424242424242, 'upper': 16869.909090909092, 'count': 2811},\n", " {'lower': 16869.909090909092, 'upper': 19681.39393939394, 'count': 2812},\n", " {'lower': 19681.39393939394, 'upper': 22492.878787878788, 'count': 2811},\n", " {'lower': 22492.878787878788, 'upper': 25304.363636363636, 'count': 2812},\n", " {'lower': 25304.363636363636, 'upper': 28115.848484848484, 'count': 2811},\n", " {'lower': 28115.848484848484, 'upper': 30927.333333333332, 'count': 2812},\n", " {'lower': 30927.333333333332, 'upper': 33738.818181818184, 'count': 2811},\n", " {'lower': 33738.818181818184, 'upper': 36550.30303030303, 'count': 2812},\n", " {'lower': 36550.30303030303, 'upper': 39361.78787878788, 'count': 2811},\n", " {'lower': 39361.78787878788, 'upper': 42173.27272727273, 'count': 2812},\n", " {'lower': 
42173.27272727273, 'upper': 44984.757575757576, 'count': 2811},\n", " {'lower': 44984.757575757576, 'upper': 47796.242424242424, 'count': 2812},\n", " {'lower': 47796.242424242424, 'upper': 50607.72727272727, 'count': 2811},\n", " {'lower': 50607.72727272727, 'upper': 53419.21212121212, 'count': 2812},\n", " {'lower': 53419.21212121212, 'upper': 56230.69696969697, 'count': 2811},\n", " {'lower': 56230.69696969697, 'upper': 59042.181818181816, 'count': 2812},\n", " {'lower': 59042.181818181816, 'upper': 61853.666666666664, 'count': 2811},\n", " {'lower': 61853.666666666664, 'upper': 64665.15151515151, 'count': 2812},\n", " {'lower': 64665.15151515151, 'upper': 67476.63636363637, 'count': 2811},\n", " {'lower': 67476.63636363637, 'upper': 70288.12121212122, 'count': 2812},\n", " {'lower': 70288.12121212122, 'upper': 73099.60606060606, 'count': 2811},\n", " {'lower': 73099.60606060606, 'upper': 75911.09090909091, 'count': 2812},\n", " {'lower': 75911.09090909091, 'upper': 78722.57575757576, 'count': 2811},\n", " {'lower': 78722.57575757576, 'upper': 81534.06060606061, 'count': 2812},\n", " {'lower': 81534.06060606061, 'upper': 84345.54545454546, 'count': 2811},\n", " {'lower': 84345.54545454546, 'upper': 87157.0303030303, 'count': 2812},\n", " {'lower': 87157.0303030303, 'upper': 89968.51515151515, 'count': 2811},\n", " {'lower': 89968.51515151515, 'upper': 92780.0, 'count': 2812}]}}" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.hist(\"id\")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 288 ms\n" ] }, { "data": { "text/plain": [ "{'hist': {'id': [{'lower': 1.0, 'upper': 2812.4848484848485, 'count': 2812},\n", " {'lower': 2812.4848484848485, 'upper': 5623.969696969697, 'count': 2811},\n", " {'lower': 5623.969696969697, 'upper': 8435.454545454546, 'count': 2812},\n", " {'lower': 8435.454545454546, 'upper': 
11246.939393939394, 'count': 2811},\n", " {'lower': 11246.939393939394, 'upper': 14058.424242424242, 'count': 2812},\n", " {'lower': 14058.424242424242, 'upper': 16869.909090909092, 'count': 2811},\n", " {'lower': 16869.909090909092, 'upper': 19681.39393939394, 'count': 2812},\n", " {'lower': 19681.39393939394, 'upper': 22492.878787878788, 'count': 2811},\n", " {'lower': 22492.878787878788, 'upper': 25304.363636363636, 'count': 2812},\n", " {'lower': 25304.363636363636, 'upper': 28115.848484848484, 'count': 2811},\n", " {'lower': 28115.848484848484, 'upper': 30927.333333333332, 'count': 2812},\n", " {'lower': 30927.333333333332, 'upper': 33738.818181818184, 'count': 2811},\n", " {'lower': 33738.818181818184, 'upper': 36550.30303030303, 'count': 2812},\n", " {'lower': 36550.30303030303, 'upper': 39361.78787878788, 'count': 2811},\n", " {'lower': 39361.78787878788, 'upper': 42173.27272727273, 'count': 2812},\n", " {'lower': 42173.27272727273, 'upper': 44984.757575757576, 'count': 2811},\n", " {'lower': 44984.757575757576, 'upper': 47796.242424242424, 'count': 2812},\n", " {'lower': 47796.242424242424, 'upper': 50607.72727272727, 'count': 2811},\n", " {'lower': 50607.72727272727, 'upper': 53419.21212121212, 'count': 2812},\n", " {'lower': 53419.21212121212, 'upper': 56230.69696969697, 'count': 2811},\n", " {'lower': 56230.69696969697, 'upper': 59042.181818181816, 'count': 2812},\n", " {'lower': 59042.181818181816, 'upper': 61853.666666666664, 'count': 2811},\n", " {'lower': 61853.666666666664, 'upper': 64665.15151515151, 'count': 2812},\n", " {'lower': 64665.15151515151, 'upper': 67476.63636363637, 'count': 2811},\n", " {'lower': 67476.63636363637, 'upper': 70288.12121212122, 'count': 2812},\n", " {'lower': 70288.12121212122, 'upper': 73099.60606060606, 'count': 2811},\n", " {'lower': 73099.60606060606, 'upper': 75911.09090909091, 'count': 2812},\n", " {'lower': 75911.09090909091, 'upper': 78722.57575757576, 'count': 2811},\n", " {'lower': 78722.57575757576, 'upper': 
81534.06060606061, 'count': 2812},\n", " {'lower': 81534.06060606061, 'upper': 84345.54545454546, 'count': 2811},\n", " {'lower': 84345.54545454546, 'upper': 87157.0303030303, 'count': 2812},\n", " {'lower': 87157.0303030303, 'upper': 89968.51515151515, 'count': 2811},\n", " {'lower': 89968.51515151515, 'upper': 92780.0, 'count': 2812}]}}" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "df.cols.hist(\"id\")" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'frequency': {'id': {'values': [{'value': '1', 'count': 1},\n", " {'value': '65673', 'count': 1},\n", " {'value': '65671', 'count': 1},\n", " {'value': '65670', 'count': 1},\n", " {'value': '6567', 'count': 1},\n", " {'value': '65669', 'count': 1},\n", " {'value': '65668', 'count': 1},\n", " {'value': '65667', 'count': 1},\n", " {'value': '65666', 'count': 1},\n", " {'value': '65665', 'count': 1}],\n", " 'count_uniques': 92780},\n", " 'transaction_date': {'values': [{'value': '1/7/2017', 'count': 2267},\n", " {'value': '1/8/2016', 'count': 2219},\n", " {'value': '1/6/2016', 'count': 2208},\n", " {'value': '1/10/2016', 'count': 2194},\n", " {'value': '1/5/2016', 'count': 2167},\n", " {'value': '1/4/2017', 'count': 2147},\n", " {'value': '1/3/2016', 'count': 2142},\n", " {'value': '1/2/2017', 'count': 2135},\n", " {'value': '1/4/2016', 'count': 2128},\n", " {'value': '1/5/2017', 'count': 2128}],\n", " 'count_uniques': 60},\n", " 'name': {'values': [{'value': 'Pants-FG35', 'count': 38},\n", " {'value': 'Pants-SG42', 'count': 36},\n", " {'value': 'pants-SG82', 'count': 35},\n", " {'value': 'Pants-FG65', 'count': 34},\n", " {'value': 'PANTS-FG24', 'count': 32},\n", " {'value': 'pants-SG81', 'count': 31},\n", " {'value': 'PANTS-SG14', 'count': 30},\n", " {'value': 'PANTS-SG42', 'count': 30},\n", " {'value': 'pants-FG41', 'count': 30},\n", " {'value': 'pants-FG51', 'count': 29}],\n", " 'count_uniques': 
20527},\n", " 'price': {'values': [{'value': '45', 'count': 1132},\n", " {'value': '50', 'count': 917},\n", " {'value': '40', 'count': 879},\n", " {'value': '90', 'count': 774},\n", " {'value': '80', 'count': 735},\n", " {'value': '155', 'count': 726},\n", " {'value': '165', 'count': 724},\n", " {'value': '175', 'count': 717},\n", " {'value': '110', 'count': 716},\n", " {'value': '160', 'count': 713}],\n", " 'count_uniques': 320},\n", " 'discount': {'values': [{'value': '', 'count': 54051},\n", " {'value': '5', 'count': 10594},\n", " {'value': '20', 'count': 6984},\n", " {'value': '15', 'count': 6947},\n", " {'value': '5%', 'count': 3348},\n", " {'value': '15%', 'count': 2385},\n", " {'value': '20%', 'count': 2363},\n", " {'value': '50', 'count': 2312},\n", " {'value': '75', 'count': 2288},\n", " {'value': '75%', 'count': 760}],\n", " 'count_uniques': 11}}}" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.frequency(\"*\", n=10, count_uniques=True, compute=True)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['id', 'transaction_date', 'name', 'price', 'discount']" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.names()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "92780" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.rows.count()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "asdfasdf\n" ] }, { "data": { "text/plain": [ "{'hist': {'id': [{'lower': 1.0, 'upper': 2812.4848484848485, 'count': 2812},\n", " {'lower': 2812.4848484848485, 'upper': 5623.969696969697, 'count': 2811},\n", " {'lower': 5623.969696969697, 'upper': 8435.454545454546, 'count': 2812},\n", " {'lower': 
8435.454545454546, 'upper': 11246.939393939394, 'count': 2811},\n", " {'lower': 11246.939393939394, 'upper': 14058.424242424242, 'count': 2812},\n", " {'lower': 14058.424242424242, 'upper': 16869.909090909092, 'count': 2811},\n", " {'lower': 16869.909090909092, 'upper': 19681.39393939394, 'count': 2812},\n", " {'lower': 19681.39393939394, 'upper': 22492.878787878788, 'count': 2811},\n", " {'lower': 22492.878787878788, 'upper': 25304.363636363636, 'count': 2812},\n", " {'lower': 25304.363636363636, 'upper': 28115.848484848484, 'count': 2811},\n", " {'lower': 28115.848484848484, 'upper': 30927.333333333332, 'count': 2812},\n", " {'lower': 30927.333333333332, 'upper': 33738.818181818184, 'count': 2811},\n", " {'lower': 33738.818181818184, 'upper': 36550.30303030303, 'count': 2812},\n", " {'lower': 36550.30303030303, 'upper': 39361.78787878788, 'count': 2811},\n", " {'lower': 39361.78787878788, 'upper': 42173.27272727273, 'count': 2812},\n", " {'lower': 42173.27272727273, 'upper': 44984.757575757576, 'count': 2811},\n", " {'lower': 44984.757575757576, 'upper': 47796.242424242424, 'count': 2812},\n", " {'lower': 47796.242424242424, 'upper': 50607.72727272727, 'count': 2811},\n", " {'lower': 50607.72727272727, 'upper': 53419.21212121212, 'count': 2812},\n", " {'lower': 53419.21212121212, 'upper': 56230.69696969697, 'count': 2811},\n", " {'lower': 56230.69696969697, 'upper': 59042.181818181816, 'count': 2812},\n", " {'lower': 59042.181818181816, 'upper': 61853.666666666664, 'count': 2811},\n", " {'lower': 61853.666666666664, 'upper': 64665.15151515151, 'count': 2812},\n", " {'lower': 64665.15151515151, 'upper': 67476.63636363637, 'count': 2811},\n", " {'lower': 67476.63636363637, 'upper': 70288.12121212122, 'count': 2812},\n", " {'lower': 70288.12121212122, 'upper': 73099.60606060606, 'count': 2811},\n", " {'lower': 73099.60606060606, 'upper': 75911.09090909091, 'count': 2812},\n", " {'lower': 75911.09090909091, 'upper': 78722.57575757576, 'count': 2811},\n", " {'lower': 
78722.57575757576, 'upper': 81534.06060606061, 'count': 2812},\n", " {'lower': 81534.06060606061, 'upper': 84345.54545454546, 'count': 2811},\n", " {'lower': 84345.54545454546, 'upper': 87157.0303030303, 'count': 2812},\n", " {'lower': 87157.0303030303, 'upper': 89968.51515151515, 'count': 2811},\n", " {'lower': 89968.51515151515, 'upper': 92780.0, 'count': 2812}]}}" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.hist(\"id\")" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "asdfasdf\n" ] }, { "data": { "text/plain": [ "{'hist': {'price': [{'lower': 4.99,\n", " 'upper': 17.065151515151516,\n", " 'count': 1873},\n", " {'lower': 17.065151515151516, 'upper': 29.14030303030303, 'count': 2187},\n", " {'lower': 29.14030303030303, 'upper': 41.21545454545455, 'count': 3685},\n", " {'lower': 41.21545454545455, 'upper': 53.290606060606066, 'count': 4540},\n", " {'lower': 53.290606060606066, 'upper': 65.36575757575758, 'count': 3932},\n", " {'lower': 65.36575757575758, 'upper': 77.44090909090909, 'count': 2755},\n", " {'lower': 77.44090909090909, 'upper': 89.5160606060606, 'count': 3402},\n", " {'lower': 89.5160606060606, 'upper': 101.59121212121212, 'count': 3690},\n", " {'lower': 101.59121212121212, 'upper': 113.66636363636364, 'count': 3344},\n", " {'lower': 113.66636363636364, 'upper': 125.74151515151516, 'count': 3660},\n", " {'lower': 125.74151515151516, 'upper': 137.8166666666667, 'count': 2766},\n", " {'lower': 137.8166666666667, 'upper': 149.8918181818182, 'count': 3260},\n", " {'lower': 149.8918181818182, 'upper': 161.96696969696973, 'count': 3719},\n", " {'lower': 161.96696969696973, 'upper': 174.04212121212123, 'count': 3332},\n", " {'lower': 174.04212121212123, 'upper': 186.11727272727276, 'count': 3764},\n", " {'lower': 186.11727272727276, 'upper': 198.19242424242427, 'count': 3120},\n", " {'lower': 198.19242424242427, 
'upper': 210.26757575757577, 'count': 2965},\n", " {'lower': 210.26757575757577, 'upper': 222.3427272727273, 'count': 1947},\n", " {'lower': 222.3427272727273, 'upper': 234.4178787878788, 'count': 2299},\n", " {'lower': 234.4178787878788, 'upper': 246.49303030303034, 'count': 2587},\n", " {'lower': 246.49303030303034, 'upper': 258.5681818181818, 'count': 2417},\n", " {'lower': 258.5681818181818, 'upper': 270.6433333333334, 'count': 2537},\n", " {'lower': 270.6433333333334, 'upper': 282.7184848484849, 'count': 1919},\n", " {'lower': 282.7184848484849, 'upper': 294.7936363636364, 'count': 2234},\n", " {'lower': 294.7936363636364, 'upper': 306.8687878787879, 'count': 2539},\n", " {'lower': 306.8687878787879, 'upper': 318.94393939393944, 'count': 2340},\n", " {'lower': 318.94393939393944, 'upper': 331.01909090909095, 'count': 2555},\n", " {'lower': 331.01909090909095, 'upper': 343.09424242424245, 'count': 2167},\n", " {'lower': 343.09424242424245, 'upper': 355.16939393939396, 'count': 2652},\n", " {'lower': 355.16939393939396, 'upper': 367.2445454545455, 'count': 1948},\n", " {'lower': 367.2445454545455, 'upper': 379.319696969697, 'count': 2271},\n", " {'lower': 379.319696969697, 'upper': 391.3948484848485, 'count': 2534},\n", " {'lower': 391.3948484848485, 'upper': 403.47, 'count': 1840}]}}" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.hist(\"price\")" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "distributed.worker - WARNING - Compute Failed\n", "Function: execute_task\n", "args: ((, , [(, [(, , [(subgraph_callable, 'getitem-d88a0ae422f4c1a9d42629e3644fd1fa', (, , (, , 0, 64000000, b'\\n'), b'id,transaction_date,name,price,discount\\n', {'sep': ',', 'header': 0, 'encoding': 'utf-8', 'quoting': 0, 'error_bad_lines': False, 'keep_default_na': True, 'na_values': None, 'engine': 'c', 'na_filter': False, 'low_memory': False, 
'dtype': 'str'}, {'id': dtype('O'), 'transaction_date': dtype('O'), 'name': dtype('O'), 'price': dtype('O'), 'discount': dtype('O')}, ['id', 'transaction_date', 'name', 'price', 'discount'], False, False, None), 'transaction\n", "kwargs: {}\n", "Exception: ValueError(\"could not convert string to float: '1/7/2019'\")\n", "\n" ] }, { "ename": "ValueError", "evalue": "could not convert string to float: '1/7/2019'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n", "\u001b[1;32m~\\Documents\\Optimus\\optimus\\engines\\dask\\columns.py\u001b[0m in \u001b[0;36mhist\u001b[1;34m(self, columns, buckets, compute)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[0mdfd\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol_name\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"float\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 38\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 39\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdfd\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 40\u001b[0m \u001b[0m_count\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_bins\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdask\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcompute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mda\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhistogram\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdfd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbins\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbuckets\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mrange\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdfd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdfd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 41\u001b[0m result[col_name] = [\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36m__len__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 556\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 557\u001b[0m return self.reduction(\n\u001b[1;32m--> 558\u001b[1;33m \u001b[0mlen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtoken\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"len\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmeta\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mint\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplit_every\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 559\u001b[0m ).compute()\n\u001b[0;32m 560\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\dask\\base.py\u001b[0m in \u001b[0;36mcompute\u001b[1;34m(self, **kwargs)\u001b[0m\n\u001b[0;32m 281\u001b[0m \u001b[0mdask\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbase\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcompute\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 282\u001b[0m \"\"\"\n\u001b[1;32m--> 283\u001b[1;33m 
\u001b[1;33m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcompute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtraverse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 284\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 285\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\dask\\base.py\u001b[0m in \u001b[0;36mcompute\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 563\u001b[0m \u001b[0mpostcomputes\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__dask_postcompute__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 564\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 565\u001b[1;33m \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mschedule\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdsk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeys\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 566\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mrepack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0ma\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32min\u001b[0m 
\u001b[0mzip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpostcomputes\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 567\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\distributed\\client.py\u001b[0m in \u001b[0;36mget\u001b[1;34m(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)\u001b[0m\n\u001b[0;32m 2710\u001b[0m \u001b[0mshould_rejoin\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2711\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2712\u001b[1;33m \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgather\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpacked\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0masynchronous\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0masynchronous\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdirect\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdirect\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2713\u001b[0m \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2714\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mfutures\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\distributed\\client.py\u001b[0m in \u001b[0;36mgather\u001b[1;34m(self, futures, errors, direct, asynchronous)\u001b[0m\n\u001b[0;32m 1989\u001b[0m 
\u001b[0mdirect\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdirect\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1990\u001b[0m \u001b[0mlocal_worker\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlocal_worker\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1991\u001b[1;33m \u001b[0masynchronous\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0masynchronous\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1992\u001b[0m )\n\u001b[0;32m 1993\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\distributed\\client.py\u001b[0m in \u001b[0;36msync\u001b[1;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[0;32m 832\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 833\u001b[0m return sync(\n\u001b[1;32m--> 834\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloop\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcallback_timeout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 835\u001b[0m )\n\u001b[0;32m 836\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\distributed\\utils.py\u001b[0m in \u001b[0;36msync\u001b[1;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[0;32m 337\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 338\u001b[0m \u001b[0mtyp\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mexc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtb\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 339\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 340\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\distributed\\utils.py\u001b[0m in \u001b[0;36mf\u001b[1;34m()\u001b[0m\n\u001b[0;32m 321\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcallback_timeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 322\u001b[0m \u001b[0mfuture\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0masyncio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait_for\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfuture\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 323\u001b[1;33m \u001b[0mresult\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32myield\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 324\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 325\u001b[0m 
\u001b[0merror\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\tornado\\gen.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 733\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 734\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 735\u001b[1;33m \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 736\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 737\u001b[0m \u001b[0mexc_info\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\distributed\\client.py\u001b[0m in \u001b[0;36m_gather\u001b[1;34m(self, futures, errors, direct, local_worker)\u001b[0m\n\u001b[0;32m 1848\u001b[0m \u001b[0mexc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCancelledError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1849\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1850\u001b[1;33m \u001b[1;32mraise\u001b[0m 
\u001b[0mexception\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtraceback\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1851\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1852\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"skip\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\dask\\optimization.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 961\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minkeys\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 962\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Expected %d args, got %d\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minkeys\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 963\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdsk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moutkey\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mdict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mzip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minkeys\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 964\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 965\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__reduce__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\dask\\core.py\u001b[0m in \u001b[0;36mget\u001b[1;34m(dsk, out, cache)\u001b[0m\n\u001b[0;32m 149\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtoposort\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdsk\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 150\u001b[0m \u001b[0mtask\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdsk\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 151\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_execute_task\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 152\u001b[0m \u001b[0mcache\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 153\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_execute_task\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\dask\\core.py\u001b[0m in \u001b[0;36m_execute_task\u001b[1;34m(arg, cache, dsk)\u001b[0m\n\u001b[0;32m 119\u001b[0m \u001b[1;31m# temporaries by their reference count and can execute certain\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 120\u001b[0m \u001b[1;31m# operations in-place.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 121\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[1;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 122\u001b[0m \u001b[1;32melif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mishashable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 123\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0marg\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\dask\\utils.py\u001b[0m in \u001b[0;36mapply\u001b[1;34m(func, args, kwargs)\u001b[0m\n\u001b[0;32m 33\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 35\u001b[1;33m 
\u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 36\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\dask\\utils.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, obj, *args, **kwargs)\u001b[0m\n\u001b[0;32m 899\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 900\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 901\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 902\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 903\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__reduce__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36mastype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 5870\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5871\u001b[0m \u001b[1;31m# else, only a single dtype is given\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 5872\u001b[1;33m \u001b[0mnew_data\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_mgr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5873\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnew_data\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"astype\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5874\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mastype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 629\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m:\u001b[0m 
\u001b[0mstr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"raise\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 630\u001b[0m ) -> \"BlockManager\":\n\u001b[1;32m--> 631\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"astype\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 632\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 633\u001b[0m def convert(\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mapply\u001b[1;34m(self, f, align_keys, ignore_failures, **kwargs)\u001b[0m\n\u001b[0;32m 425\u001b[0m \u001b[0mapplied\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 426\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 427\u001b[1;33m \u001b[0mapplied\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 428\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mTypeError\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mNotImplementedError\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 429\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mignore_failures\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\internals\\blocks.py\u001b[0m in \u001b[0;36mastype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 671\u001b[0m \u001b[0mvals1d\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 672\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 673\u001b[1;33m \u001b[0mvalues\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mastype_nansafe\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvals1d\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 674\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mValueError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 675\u001b[0m \u001b[1;31m# e.g. 
astype_nansafe can fail on object-dtype of strings\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\dtypes\\cast.py\u001b[0m in \u001b[0;36mastype_nansafe\u001b[1;34m(arr, dtype, copy, skipna)\u001b[0m\n\u001b[0;32m 1095\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcopy\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mis_object_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mis_object_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1096\u001b[0m \u001b[1;31m# Explicit copy, or required since NumPy can't view from / to object.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1097\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1098\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1099\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mview\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mValueError\u001b[0m: could not convert string to float: '1/7/2019'" ] } ], "source": [ "%%time\n", "df.cols.hist()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# import sys,os,os.path\n", "# os.environ\n", "def xray(df, col_name):\n", " for i in df[col_name]:\n", " print(i, type(i))\n", " " ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ 
"Open Bumblebee: https://app.hi-bumblebee.com
If you really care about privacy get your keys in bumblebee.ini and put them here
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\Anaconda3\\lib\\site-packages\\distributed\\dashboard\\core.py:79: UserWarning: \n", "Port 8787 is already in use. \n", "Perhaps you already have a cluster running?\n", "Hosting the diagnostics dashboard on a random port instead.\n", " warnings.warn(\"\\n\" + msg)\n", "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 35 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] } ], "source": [ "op = Optimus(\"dask\", threads_per_worker=8, n_workers=1, comm=True)\n", "\n", "preview_df = op.load.file(\"http://159.65.217.17:5003/uploads/datasetFile-1591123129359.csv\", n_rows=35).ext.cache() " ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "float() argument must be a string or a number, not 'NoneType'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mfastnumbers\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mfastnumbers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfast_float\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mTypeError\u001b[0m: float() argument must be a string or a number, not 'NoneType'" ] } ], "source": [ "import fastnumbers\n", "fastnumbers.fast_float(None)" ] }, { "cell_type": "code", "execution_count": 27, 
"metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] } ], "source": [ "f = df.cols.set(value='df[\"ticket_price\"]*df[\"discount\"]', where='df[\"ticket_price\"]!=None', output_cols=\"ticket_price\").ext.cache()\n", "_output = df.ext.profile(columns=\"*\", output=\"json\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'sample': {'columns': [{'title': 'customer_id'}, {'title': 'transactoin_date'}, {'title': 'ticket_price'}, {'title': 'discount'}, {'title': 'product'}, {'title': 'info'}], 'value': [['0f345kjh345oiuy345', '2010/08/19', '29.99', nan, 'platinum', '1'], ['0fju234978rfjkhsdf', '2012/01/05', '29.99', nan, 'platinum', '2'], ['0f34ruiy23e78y2r', '2009/08/11', '29.99', nan, 'platinum', '3'], ['0fue298y2r23r23r2', '2010/08/19', '29.99', nan, 'platinum', '4'], ['0f345kjh345oiuy346', '2010/08/19', '29.99', nan, 'platinum', '5'], ['0fju234978rfjkhsdf', '2010/08/19', '9.99', '5%', 'basic', '6'], ['0f34ruiy23e78y2r', '2010/08/19', '9.99', nan, 'basic', '7'], ['0fue298y2r23r23r3', '2010/08/19', '9.99', nan, 'basic', '8'], ['0f345kjh345oiuy347', '2010/08/19', '9.99', nan, 'basic', '9'], ['0fju234978rfjkhsdf', '2011/08/11', '9.99', '5%', 'basic', '10'], ['0f34ruiy23e78y2r', '2015/08/09', '9.99', '5%', 
'basic', '10'], ['0fue298y2r23r23r4', nan, '9.91', nan, 'basic', '10'], ['0f345kjh345oiuy348', nan, '9.99', nan, 'basic', '10'], ['0fju234978rfjkhsdf', nan, '14.99', '5%', 'deluxe', '10'], ['0f34ruiy23e78y2r', nan, '14.99', nan, 'deluxe', '10'], ['0fue298y2r23r23r5', nan, '14.99', '5%', 'deluxe', '10'], ['0f345kjh345oiuy349', nan, '14.99', nan, 'deluxe', 'a']]}}\n", "pdf ticket_price\n", "0 29.99\n", "1 29.99\n", "2 29.99\n", "3 29.99\n", "4 29.99\n", "5 9.99\n", "6 9.99\n", "7 9.99\n", "8 9.99\n", "9 9.99\n", "10 9.99\n", "11 9.91\n", "12 9.99\n", "13 14.99\n", "14 14.99\n", "15 14.99\n", "16 14.99\n", "a 0 29.99\n", "1 29.99\n", "2 29.99\n", "3 29.99\n", "4 29.99\n", "5 9.99\n", "6 9.99\n", "7 9.99\n", "8 9.99\n", "9 9.99\n", "10 9.99\n", "11 9.91\n", "12 9.99\n", "13 14.99\n", "14 14.99\n", "15 14.99\n", "16 14.99\n", "Name: new ticket_price, dtype: float64\n", "pdf ticket_price\n", "0 29.99\n", "1 29.99\n", "2 29.99\n", "3 29.99\n", "4 29.99\n", "5 9.99\n", "6 9.99\n", "7 9.99\n", "8 9.99\n", "9 9.99\n", "10 9.99\n", "11 9.91\n", "12 9.99\n", "13 14.99\n", "14 14.99\n", "15 14.99\n", "16 14.99\n", "a 0 29.99\n", "1 29.99\n", "2 29.99\n", "3 29.99\n", "4 29.99\n", "5 9.99\n", "6 9.99\n", "7 9.99\n", "8 9.99\n", "9 9.99\n", "10 9.99\n", "11 9.91\n", "12 9.99\n", "13 14.99\n", "14 14.99\n", "15 14.99\n", "16 14.99\n", "Name: new ticket_price, dtype: float64\n", "dtype int\n", "{'new ticket_price': {'match': 17, 'missing': 0, 'mismatch': 17}}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 500000 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n", "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 
500000 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] } ], "source": [ "\n", "_output = {**preview_df.ext.to_json(\"*\"), \"meta\": preview_df.meta.get() if (preview_df.meta and preview_df.meta.get) else {} } \n", "\n", "_output = preview_df.ext.profile(columns=\"*\", output=\"json\")\n", "df = op.load.file(\"http://159.65.217.17:5003/uploads/datasetFile-1591123129359.csv\").ext.cache()\n", "df = df.ext.repartition(8).ext.cache()\n", "_output = df.ext.profile(columns=\"*\", output=\"json\")\n", "_output = df.ext.set_buffer(\"*\")\n", "_output = df.ext.buffer_window(\"*\", 0, 17).ext.to_json(\"*\")\n", "\n", "print(_output)\n", "\n", "_output = df.ext.set_buffer(\"*\")\n", "_output = df.ext.buffer_window(\"*\", 0, 17).cols.set(value='df[\"ticket_price\"]', where='df[\"ticket_price\"]!=None', output_cols=\"new ticket_price\").ext.to_json(\"*\")\n", "_df_profile = df.ext.buffer_window(\"*\").cols.set(value='df[\"ticket_price\"]', where='df[\"ticket_price\"]!=None', output_cols=\"new ticket_price\")\n", "_output = { \"profile\": _df_profile.ext.profile([\"new ticket_price\"], output=\"json\")}\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idtransactoin_dateticket_pricediscountproductinfo
00f345kjh345oiuy3452010/08/1929.99NaNplatinum1
10fju234978rfjkhsdf2012/01/0529.99NaNplatinum2
20f34ruiy23e78y2r2009/08/1129.99NaNplatinum3
30fue298y2r23r23r22010/08/1929.99NaNplatinum4
40f345kjh345oiuy3462010/08/1929.99NaNplatinum5
50fju234978rfjkhsdf2010/08/199.995%basic6
60f34ruiy23e78y2r2010/08/199.99NaNbasic7
70fue298y2r23r23r32010/08/199.99NaNbasic8
80f345kjh345oiuy3472010/08/199.99NaNbasic9
90fju234978rfjkhsdf2011/08/119.995%basic10
\n", "
" ], "text/plain": [ " customer_id transactoin_date ticket_price discount product info\n", "0 0f345kjh345oiuy345 2010/08/19 29.99 NaN platinum 1\n", "1 0fju234978rfjkhsdf 2012/01/05 29.99 NaN platinum 2\n", "2 0f34ruiy23e78y2r 2009/08/11 29.99 NaN platinum 3\n", "3 0fue298y2r23r23r2 2010/08/19 29.99 NaN platinum 4\n", "4 0f345kjh345oiuy346 2010/08/19 29.99 NaN platinum 5\n", "5 0fju234978rfjkhsdf 2010/08/19 9.99 5% basic 6\n", "6 0f34ruiy23e78y2r 2010/08/19 9.99 NaN basic 7\n", "7 0fue298y2r23r23r3 2010/08/19 9.99 NaN basic 8\n", "8 0f345kjh345oiuy347 2010/08/19 9.99 NaN basic 9\n", "9 0fju234978rfjkhsdf 2011/08/11 9.99 5% basic 10" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.compute().ext.head()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pdf ticket_price\n", "0 29.99\n", "1 29.99\n", "2 29.99\n", "3 29.99\n", "4 29.99\n", "5 9.99\n", "6 9.99\n", "7 9.99\n", "8 9.99\n", "9 9.99\n", "10 9.99\n", "11 9.91\n", "12 9.99\n", "13 14.99\n", "14 14.99\n", "15 14.99\n", "16 14.99\n", "a 0 899.4001\n", "1 899.4001\n", "2 899.4001\n", "3 899.4001\n", "4 899.4001\n", "5 99.8001\n", "6 99.8001\n", "7 99.8001\n", "8 99.8001\n", "9 99.8001\n", "10 99.8001\n", "11 98.2081\n", "12 99.8001\n", "13 224.7001\n", "14 224.7001\n", "15 224.7001\n", "16 224.7001\n", "Name: new ticket_price, dtype: float64\n" ] } ], "source": [ "_output = df.ext.buffer_window(\"*\", 0, 17).cols.set(value='df[\"ticket_price\"]*df[\"discount\"]', where='df[\"ticket_price\"]!=None', output_cols=\"new ticket_price\").ext.to_json(\"*\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.ext.buffer_window(\"*\", 0, 17).cols.set(value='df[\"ticket_price\"]*df[\"ticket_price\"]',output_cols=\"new 1\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": 
"code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Open Bumblebee: https://app.hi-bumblebee.com
If you really care about privacy get your keys in bumblebee.ini and put them here
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from optimus import Optimus\n", "op = Optimus(\"dask\", n_workers=1, threads_per_worker=8, processes=False, memory_limit=\"3G\", comm=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 17 of 17 rows / 6 columns
\n", "
1 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
customer_id
\n", "
1 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
transactoin_date
\n", "
2 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
ticket_price
\n", "
3 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
discount
\n", "
4 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
product
\n", "
5 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
info
\n", "
6 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy345\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012/01/05\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 2\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " 2009/08/11\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r2\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 4\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy346\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 5\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 6\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r3\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy347\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011/08/11\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015/08/09\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r4\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.91\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy348\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r5\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy349\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " a\n", " \n", "
\n", "
\n", "\n", "
Viewing 17 of 17 rows / 6 columns
\n", "
1 partition(s) <class 'dask.dataframe.core.DataFrame'>
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 163 ms\n" ] } ], "source": [ "%%time\n", "df = op.load.file(\"data/dataset-transactions.csv\").ext.cache()\n", "# df = df.ext.optimize()\n", "df = df.ext.repartition(1).ext.cache()\n", "df.ext.display(20)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# cols_and_inferred_dtype = df.cols.infer_profiler_dtypes(\"*\")\n", "# df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()\n", "# # result = df.ext.profile(columns=columns, bins=bins, output=output, flush=flush, size=size)\n", " \n", "# print(cols_and_inferred_dtype)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. 
Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "AAAA {'file_name': 'dataset-transactions.csv', 'mime_info': [{'mime': 'text/plain', 'encoding': 'us-ascii', 'file_ext': 'csv', 'file_type': 'csv', 'properties': {'delimiter': ',', 'doublequote': False, 'escapechar': None, 'lineterminator': '\\r\\n', 'quotechar': '\"', 'quoting': 0, 'skipinitialspace': False}}], 'transformations': {}, 'profile': {'columns': {'customer_id': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object'}, 'transactoin_date': {'stats': {'mismatch': 11, 'missing': 6, 'match': 0, 'frequency': [{'value': '2010/08/19', 'count': 7}, {'value': '2015/08/09', 'count': 1}, {'value': '2012/01/05', 'count': 1}, {'value': '2011/08/11', 'count': 1}, {'value': '2009/08/11', 'count': 1}], 'count_uniques': 5}, 'dtype': 'object'}, 'ticket_price': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'hist': [{'lower': 9.91, 'upper': 10.5375, 'count': 8}, {'lower': 10.5375, 'upper': 11.165, 'count': 0}, {'lower': 11.165, 'upper': 11.7925, 'count': 0}, {'lower': 11.7925, 'upper': 12.42, 'count': 0}, {'lower': 12.42, 'upper': 13.0475, 'count': 0}, {'lower': 13.0475, 'upper': 13.675, 'count': 0}, {'lower': 13.675, 'upper': 14.3025, 'count': 0}, {'lower': 14.3025, 'upper': 14.93, 'count': 0}, {'lower': 14.93, 'upper': 15.5575, 'count': 4}, {'lower': 15.5575, 'upper': 16.185, 'count': 
0}, {'lower': 16.185, 'upper': 16.8125, 'count': 0}, {'lower': 16.8125, 'upper': 17.439999999999998, 'count': 0}, {'lower': 17.439999999999998, 'upper': 18.0675, 'count': 0}, {'lower': 18.0675, 'upper': 18.695, 'count': 0}, {'lower': 18.695, 'upper': 19.322499999999998, 'count': 0}, {'lower': 19.322499999999998, 'upper': 19.95, 'count': 0}, {'lower': 19.95, 'upper': 20.5775, 'count': 0}, {'lower': 20.5775, 'upper': 21.205, 'count': 0}, {'lower': 21.205, 'upper': 21.8325, 'count': 0}, {'lower': 21.8325, 'upper': 22.46, 'count': 0}, {'lower': 22.46, 'upper': 23.0875, 'count': 0}, {'lower': 23.0875, 'upper': 23.715, 'count': 0}, {'lower': 23.715, 'upper': 24.3425, 'count': 0}, {'lower': 24.3425, 'upper': 24.97, 'count': 0}, {'lower': 24.97, 'upper': 25.597499999999997, 'count': 0}, {'lower': 25.597499999999997, 'upper': 26.224999999999998, 'count': 0}, {'lower': 26.224999999999998, 'upper': 26.8525, 'count': 0}, {'lower': 26.8525, 'upper': 27.48, 'count': 0}, {'lower': 27.48, 'upper': 28.107499999999998, 'count': 0}, {'lower': 28.107499999999998, 'upper': 28.735, 'count': 0}, {'lower': 28.735, 'upper': 29.362499999999997, 'count': 0}, {'lower': 29.362499999999997, 'upper': 29.99, 'count': 5}], 'count_uniques': 4}, 'dtype': 'object'}, 'discount': {'stats': {'mismatch': 0, 'missing': 12, 'match': 5, 'frequency': [{'value': '5%', 'count': 5}], 'count_uniques': 1}, 'dtype': 'object'}, 'product': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': 'basic', 'count': 8}, {'value': 'platinum', 'count': 5}, {'value': 'deluxe', 'count': 4}], 'count_uniques': 3}, 'dtype': 'object'}, 'info': {'stats': {'mismatch': 1, 'missing': 0, 'match': 16}, 'dtype': 'object'}}, 'name': None, 'file_name': 'dataset-transactions.csv', 'summary': {'cols_count': 6, 'rows_count': 17, 'dtypes_list': ['object'], 'total_count_dtypes': 1, 'missing_count': 0, 'p_missing': 0.0}}}\n", "BBBB {'file_name': 'dataset-transactions.csv', 'mime_info': [{'mime': 'text/plain', 'encoding': 
'us-ascii', 'file_ext': 'csv', 'file_type': 'csv', 'properties': {'delimiter': ',', 'doublequote': False, 'escapechar': None, 'lineterminator': '\\r\\n', 'quotechar': '\"', 'quoting': 0, 'skipinitialspace': False}}], 'transformations': {'actions': [{'profiler_dtype': 'customer_id'}, {'profiler_dtype': 'transactoin_date'}, {'profiler_dtype': 'ticket_price'}, {'profiler_dtype': 'discount'}, {'profiler_dtype': 'product'}, {'profiler_dtype': 'info'}]}, 'profile': {'columns': {'customer_id': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'transactoin_date': {'stats': {'mismatch': 11, 'missing': 6, 'match': 0, 'frequency': [{'value': '2010/08/19', 'count': 7}, {'value': '2015/08/09', 'count': 1}, {'value': '2012/01/05', 'count': 1}, {'value': '2011/08/11', 'count': 1}, {'value': '2009/08/11', 'count': 1}], 'count_uniques': 5}, 'dtype': 'object', 'profiler_dtype': 'date'}, 'ticket_price': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'hist': [{'lower': 9.91, 'upper': 10.5375, 'count': 8}, {'lower': 10.5375, 'upper': 11.165, 'count': 0}, {'lower': 11.165, 'upper': 11.7925, 'count': 0}, {'lower': 11.7925, 'upper': 12.42, 'count': 0}, {'lower': 12.42, 'upper': 13.0475, 'count': 0}, {'lower': 13.0475, 'upper': 13.675, 'count': 0}, {'lower': 13.675, 'upper': 14.3025, 'count': 0}, {'lower': 14.3025, 'upper': 14.93, 'count': 0}, {'lower': 14.93, 'upper': 15.5575, 'count': 4}, {'lower': 15.5575, 'upper': 
16.185, 'count': 0}, {'lower': 16.185, 'upper': 16.8125, 'count': 0}, {'lower': 16.8125, 'upper': 17.439999999999998, 'count': 0}, {'lower': 17.439999999999998, 'upper': 18.0675, 'count': 0}, {'lower': 18.0675, 'upper': 18.695, 'count': 0}, {'lower': 18.695, 'upper': 19.322499999999998, 'count': 0}, {'lower': 19.322499999999998, 'upper': 19.95, 'count': 0}, {'lower': 19.95, 'upper': 20.5775, 'count': 0}, {'lower': 20.5775, 'upper': 21.205, 'count': 0}, {'lower': 21.205, 'upper': 21.8325, 'count': 0}, {'lower': 21.8325, 'upper': 22.46, 'count': 0}, {'lower': 22.46, 'upper': 23.0875, 'count': 0}, {'lower': 23.0875, 'upper': 23.715, 'count': 0}, {'lower': 23.715, 'upper': 24.3425, 'count': 0}, {'lower': 24.3425, 'upper': 24.97, 'count': 0}, {'lower': 24.97, 'upper': 25.597499999999997, 'count': 0}, {'lower': 25.597499999999997, 'upper': 26.224999999999998, 'count': 0}, {'lower': 26.224999999999998, 'upper': 26.8525, 'count': 0}, {'lower': 26.8525, 'upper': 27.48, 'count': 0}, {'lower': 27.48, 'upper': 28.107499999999998, 'count': 0}, {'lower': 28.107499999999998, 'upper': 28.735, 'count': 0}, {'lower': 28.735, 'upper': 29.362499999999997, 'count': 0}, {'lower': 29.362499999999997, 'upper': 29.99, 'count': 5}], 'count_uniques': 4}, 'dtype': 'object', 'profiler_dtype': 'decimal'}, 'discount': {'stats': {'mismatch': 0, 'missing': 12, 'match': 5, 'frequency': [{'value': '5%', 'count': 5}], 'count_uniques': 1}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'product': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': 'basic', 'count': 8}, {'value': 'platinum', 'count': 5}, {'value': 'deluxe', 'count': 4}], 'count_uniques': 3}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'info': {'stats': {'mismatch': 1, 'missing': 0, 'match': 16}, 'dtype': 'object', 'profiler_dtype': 'int'}}, 'name': None, 'file_name': 'dataset-transactions.csv', 'summary': {'cols_count': 6, 'rows_count': 17, 'dtypes_list': ['object'], 'total_count_dtypes': 1, 
'missing_count': 0, 'p_missing': 0.0}}}\n" ] } ], "source": [ "# df.cols.infer_profiler_dtypes(\"*\")\n", "p = df.ext.profile(\"*\", flush=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'stats': {'mismatch': 0,\n", " 'missing': 0,\n", " 'match': 17,\n", " 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4},\n", " {'value': '0f34ruiy23e78y2r', 'count': 4},\n", " {'value': '0fue298y2r23r23r5', 'count': 1},\n", " {'value': '0fue298y2r23r23r4', 'count': 1},\n", " {'value': '0fue298y2r23r23r3', 'count': 1},\n", " {'value': '0fue298y2r23r23r2', 'count': 1},\n", " {'value': '0f345kjh345oiuy349', 'count': 1},\n", " {'value': '0f345kjh345oiuy348', 'count': 1},\n", " {'value': '0f345kjh345oiuy347', 'count': 1},\n", " {'value': '0f345kjh345oiuy346', 'count': 1},\n", " {'value': '0f345kjh345oiuy345', 'count': 1}],\n", " 'count_uniques': 11},\n", " 'dtype': 'object',\n", " 'profiler_dtype': 'string'}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.meta.get()[\"profile\"][\"columns\"][\"customer_id\"]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# df, p = df.ext.cast_and_profile(\"*\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "assert(p[\"columns\"][\"ticket_price\"][\"stats\"][\"match\"]==17)\n", "assert(p[\"columns\"][\"ticket_price\"][\"stats\"][\"mismatch\"]==0)\n", "assert(p[\"columns\"][\"ticket_price\"][\"stats\"][\"missing\"]==0)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. 
Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] }, { "data": { "text/plain": [ "{'customer_id': 'string',\n", " 'transactoin_date': 'date',\n", " 'ticket_price': 'decimal',\n", " 'discount': 'string',\n", " 'product': 'string',\n", " 'info': 'int'}" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.infer_profiler_dtypes(\"*\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "df = df.cols.replace(\"discount\", search=[\"%\"], replace_by=\"\", search_by=\"chars\", ignore_case=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idtransactoin_dateticket_pricediscountproductinfo
00f345kjh345oiuy3452010/08/1929.99NaNplatinum1
10fju234978rfjkhsdf2012/01/0529.99NaNplatinum2
20f34ruiy23e78y2r2009/08/1129.99NaNplatinum3
30fue298y2r23r23r22010/08/1929.99NaNplatinum4
40f345kjh345oiuy3462010/08/1929.99NaNplatinum5
50fju234978rfjkhsdf2010/08/199.995basic6
60f34ruiy23e78y2r2010/08/199.99NaNbasic7
70fue298y2r23r23r32010/08/199.99NaNbasic8
80f345kjh345oiuy3472010/08/199.99NaNbasic9
90fju234978rfjkhsdf2011/08/119.995basic10
100f34ruiy23e78y2r2015/08/099.995basic10
110fue298y2r23r23r4NaN9.91NaNbasic10
120f345kjh345oiuy348NaN9.99NaNbasic10
130fju234978rfjkhsdfNaN14.995deluxe10
140f34ruiy23e78y2rNaN14.99NaNdeluxe10
150fue298y2r23r23r5NaN14.995deluxe10
160f345kjh345oiuy349NaN14.99NaNdeluxea
\n", "
" ], "text/plain": [ " customer_id transactoin_date ticket_price discount product info\n", "0 0f345kjh345oiuy345 2010/08/19 29.99 NaN platinum 1\n", "1 0fju234978rfjkhsdf 2012/01/05 29.99 NaN platinum 2\n", "2 0f34ruiy23e78y2r 2009/08/11 29.99 NaN platinum 3\n", "3 0fue298y2r23r23r2 2010/08/19 29.99 NaN platinum 4\n", "4 0f345kjh345oiuy346 2010/08/19 29.99 NaN platinum 5\n", "5 0fju234978rfjkhsdf 2010/08/19 9.99 5 basic 6\n", "6 0f34ruiy23e78y2r 2010/08/19 9.99 NaN basic 7\n", "7 0fue298y2r23r23r3 2010/08/19 9.99 NaN basic 8\n", "8 0f345kjh345oiuy347 2010/08/19 9.99 NaN basic 9\n", "9 0fju234978rfjkhsdf 2011/08/11 9.99 5 basic 10\n", "10 0f34ruiy23e78y2r 2015/08/09 9.99 5 basic 10\n", "11 0fue298y2r23r23r4 NaN 9.91 NaN basic 10\n", "12 0f345kjh345oiuy348 NaN 9.99 NaN basic 10\n", "13 0fju234978rfjkhsdf NaN 14.99 5 deluxe 10\n", "14 0f34ruiy23e78y2r NaN 14.99 NaN deluxe 10\n", "15 0fue298y2r23r23r5 NaN 14.99 5 deluxe 10\n", "16 0f345kjh345oiuy349 NaN 14.99 NaN deluxe a" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.compute()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 50 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idtransactoin_dateticket_pricediscountproductinfo
0stringdatedecimalnullstringint
1stringdatedecimalnullstringint
2stringdatedecimalnullstringint
3stringdatedecimalnullstringint
4stringdatedecimalnullstringint
5stringdatedecimalintstringint
6stringdatedecimalnullstringint
7stringdatedecimalnullstringint
8stringdatedecimalnullstringint
9stringdatedecimalintstringint
10stringdatedecimalintstringint
11stringnulldecimalnullstringint
12stringnulldecimalnullstringint
13stringnulldecimalintstringint
14stringnulldecimalnullstringint
15stringnulldecimalintstringint
16stringnulldecimalnullstringstring
\n", "
" ], "text/plain": [ " customer_id transactoin_date ticket_price discount product info\n", "0 string date decimal null string int\n", "1 string date decimal null string int\n", "2 string date decimal null string int\n", "3 string date decimal null string int\n", "4 string date decimal null string int\n", "5 string date decimal int string int\n", "6 string date decimal null string int\n", "7 string date decimal null string int\n", "8 string date decimal null string int\n", "9 string date decimal int string int\n", "10 string date decimal int string int\n", "11 string null decimal null string int\n", "12 string null decimal null string int\n", "13 string null decimal int string int\n", "14 string null decimal null string int\n", "15 string null decimal int string int\n", "16 string null decimal null string string" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# df.cols.infer_profiler_dtypes(\"*\")\n", "from optimus.infer import Infer\n", "df.ext.head(\"*\",50).applymap(Infer.parse_pandas)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "nan \n", "nan \n", "nan \n", "nan \n", "nan \n", "5 \n", "nan \n", "nan \n", "nan \n", "5 \n", "5 \n", "nan \n", "nan \n", "5 \n", "nan \n", "5 \n", "nan \n" ] } ], "source": [ "xray(df, \"discount\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. 
Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] } ], "source": [ "p = df.ext.profile(\"discount\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] }, { "data": { "text/plain": [ "{'customer_id': 'string',\n", " 'transactoin_date': 'date',\n", " 'ticket_price': 'decimal',\n", " 'discount': 'int',\n", " 'product': 'string',\n", " 'info': 'int'}" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.infer_profiler_dtypes(\"*\")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# df[\"transactoin_date\"].compute()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idtransactoin_dateticket_pricediscountproductinfo
00f345kjh345oiuy3452010/08/1929.99NaNplatinum1
10fju234978rfjkhsdf2012/01/0529.99NaNplatinum2
20f34ruiy23e78y2r2009/08/1129.99NaNplatinum3
30fue298y2r23r23r22010/08/1929.99NaNplatinum4
40f345kjh345oiuy3462010/08/1929.99NaNplatinum5
50fju234978rfjkhsdf2010/08/199.995basic6
60f34ruiy23e78y2r2010/08/199.99NaNbasic7
70fue298y2r23r23r32010/08/199.99NaNbasic8
80f345kjh345oiuy3472010/08/199.99NaNbasic9
90fju234978rfjkhsdf2011/08/119.995basic10
100f34ruiy23e78y2r2015/08/099.995basic10
110fue298y2r23r23r4NaN9.91NaNbasic10
120f345kjh345oiuy348NaN9.99NaNbasic10
130fju234978rfjkhsdfNaN14.995deluxe10
140f34ruiy23e78y2rNaN14.99NaNdeluxe10
150fue298y2r23r23r5NaN14.995deluxe10
160f345kjh345oiuy349NaN14.99NaNdeluxea
\n", "
" ], "text/plain": [ " customer_id transactoin_date ticket_price discount product info\n", "0 0f345kjh345oiuy345 2010/08/19 29.99 NaN platinum 1\n", "1 0fju234978rfjkhsdf 2012/01/05 29.99 NaN platinum 2\n", "2 0f34ruiy23e78y2r 2009/08/11 29.99 NaN platinum 3\n", "3 0fue298y2r23r23r2 2010/08/19 29.99 NaN platinum 4\n", "4 0f345kjh345oiuy346 2010/08/19 29.99 NaN platinum 5\n", "5 0fju234978rfjkhsdf 2010/08/19 9.99 5 basic 6\n", "6 0f34ruiy23e78y2r 2010/08/19 9.99 NaN basic 7\n", "7 0fue298y2r23r23r3 2010/08/19 9.99 NaN basic 8\n", "8 0f345kjh345oiuy347 2010/08/19 9.99 NaN basic 9\n", "9 0fju234978rfjkhsdf 2011/08/11 9.99 5 basic 10\n", "10 0f34ruiy23e78y2r 2015/08/09 9.99 5 basic 10\n", "11 0fue298y2r23r23r4 NaN 9.91 NaN basic 10\n", "12 0f345kjh345oiuy348 NaN 9.99 NaN basic 10\n", "13 0fju234978rfjkhsdf NaN 14.99 5 deluxe 10\n", "14 0f34ruiy23e78y2r NaN 14.99 NaN deluxe 10\n", "15 0fue298y2r23r23r5 NaN 14.99 5 deluxe 10\n", "16 0f345kjh345oiuy349 NaN 14.99 NaN deluxe a" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.compute()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "nan \n", "nan \n", "nan \n", "nan \n", "nan \n", "5 \n", "nan \n", "nan \n", "nan \n", "5 \n", "5 \n", "nan \n", "nan \n", "5 \n", "nan \n", "5 \n", "nan \n" ] } ], "source": [ "xray(df, \"discount\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "df = df.cols.unnest(\"transactoin_date\", separator=\"/\", splits=2, output_cols=\"transactoin_date\").ext.cache()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. 
Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] } ], "source": [ "_output = df.ext.profile(columns=\"*\", output=\"json\", flush=True)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'stats': {'mismatch': 0,\n", " 'missing': 0,\n", " 'match': 17,\n", " 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4},\n", " {'value': '0f34ruiy23e78y2r', 'count': 4},\n", " {'value': '0fue298y2r23r23r5', 'count': 1},\n", " {'value': '0fue298y2r23r23r4', 'count': 1},\n", " {'value': '0fue298y2r23r23r3', 'count': 1},\n", " {'value': '0fue298y2r23r23r2', 'count': 1},\n", " {'value': '0f345kjh345oiuy349', 'count': 1},\n", " {'value': '0f345kjh345oiuy348', 'count': 1},\n", " {'value': '0f345kjh345oiuy347', 'count': 1},\n", " {'value': '0f345kjh345oiuy346', 'count': 1},\n", " {'value': '0f345kjh345oiuy345', 'count': 1}],\n", " 'count_uniques': 11},\n", " 'dtype': 'object',\n", " 'profiler_dtype': 'string'}" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.meta.get()[\"profile\"][\"columns\"][\"customer_id\"]" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. 
Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] }, { "data": { "text/plain": [ "{'customer_id': 'string',\n", " 'transactoin_date': 'date',\n", " 'transactoin_date_0': 'int',\n", " 'transactoin_date_1': 'int',\n", " 'ticket_price': 'decimal',\n", " 'discount': 'int',\n", " 'product': 'string',\n", " 'info': 'int'}" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.infer_profiler_dtypes(\"*\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idtransactoin_datetransactoin_date_0transactoin_date_1ticket_pricediscountproductinfo
00f345kjh345oiuy3452010/08/1920100829.99NaNplatinum1
10fju234978rfjkhsdf2012/01/0520120129.99NaNplatinum2
20f34ruiy23e78y2r2009/08/1120090829.99NaNplatinum3
30fue298y2r23r23r22010/08/1920100829.99NaNplatinum4
40f345kjh345oiuy3462010/08/1920100829.99NaNplatinum5
50fju234978rfjkhsdf2010/08/192010089.995basic6
60f34ruiy23e78y2r2010/08/192010089.99NaNbasic7
70fue298y2r23r23r32010/08/192010089.99NaNbasic8
80f345kjh345oiuy3472010/08/192010089.99NaNbasic9
90fju234978rfjkhsdf2011/08/112011089.995basic10
100f34ruiy23e78y2r2015/08/092015089.995basic10
110fue298y2r23r23r4NaNnanNone9.91NaNbasic10
120f345kjh345oiuy348NaNnanNone9.99NaNbasic10
130fju234978rfjkhsdfNaNnanNone14.995deluxe10
140f34ruiy23e78y2rNaNnanNone14.99NaNdeluxe10
150fue298y2r23r23r5NaNnanNone14.995deluxe10
160f345kjh345oiuy349NaNnanNone14.99NaNdeluxea
\n", "
" ], "text/plain": [ " customer_id transactoin_date transactoin_date_0 transactoin_date_1 \\\n", "0 0f345kjh345oiuy345 2010/08/19 2010 08 \n", "1 0fju234978rfjkhsdf 2012/01/05 2012 01 \n", "2 0f34ruiy23e78y2r 2009/08/11 2009 08 \n", "3 0fue298y2r23r23r2 2010/08/19 2010 08 \n", "4 0f345kjh345oiuy346 2010/08/19 2010 08 \n", "5 0fju234978rfjkhsdf 2010/08/19 2010 08 \n", "6 0f34ruiy23e78y2r 2010/08/19 2010 08 \n", "7 0fue298y2r23r23r3 2010/08/19 2010 08 \n", "8 0f345kjh345oiuy347 2010/08/19 2010 08 \n", "9 0fju234978rfjkhsdf 2011/08/11 2011 08 \n", "10 0f34ruiy23e78y2r 2015/08/09 2015 08 \n", "11 0fue298y2r23r23r4 NaN nan None \n", "12 0f345kjh345oiuy348 NaN nan None \n", "13 0fju234978rfjkhsdf NaN nan None \n", "14 0f34ruiy23e78y2r NaN nan None \n", "15 0fue298y2r23r23r5 NaN nan None \n", "16 0f345kjh345oiuy349 NaN nan None \n", "\n", " ticket_price discount product info \n", "0 29.99 NaN platinum 1 \n", "1 29.99 NaN platinum 2 \n", "2 29.99 NaN platinum 3 \n", "3 29.99 NaN platinum 4 \n", "4 29.99 NaN platinum 5 \n", "5 9.99 5 basic 6 \n", "6 9.99 NaN basic 7 \n", "7 9.99 NaN basic 8 \n", "8 9.99 NaN basic 9 \n", "9 9.99 5 basic 10 \n", "10 9.99 5 basic 10 \n", "11 9.91 NaN basic 10 \n", "12 9.99 NaN basic 10 \n", "13 14.99 5 deluxe 10 \n", "14 14.99 NaN deluxe 10 \n", "15 14.99 5 deluxe 10 \n", "16 14.99 NaN deluxe a " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.compute()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'stats': {'mismatch': 0,\n", " 'missing': 0,\n", " 'match': 17,\n", " 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4},\n", " {'value': '0f34ruiy23e78y2r', 'count': 4},\n", " {'value': '0fue298y2r23r23r5', 'count': 1},\n", " {'value': '0fue298y2r23r23r4', 'count': 1},\n", " {'value': '0fue298y2r23r23r3', 'count': 1},\n", " {'value': '0fue298y2r23r23r2', 'count': 1},\n", " {'value': '0f345kjh345oiuy349', 'count': 1},\n", " 
{'value': '0f345kjh345oiuy348', 'count': 1},\n", " {'value': '0f345kjh345oiuy347', 'count': 1},\n", " {'value': '0f345kjh345oiuy346', 'count': 1},\n", " {'value': '0f345kjh345oiuy345', 'count': 1}],\n", " 'count_uniques': 11},\n", " 'dtype': 'object',\n", " 'profiler_dtype': 'string'}" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.meta.get()[\"profile\"][\"columns\"][\"customer_id\"]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cols df[\"ticket_price\"]*df[\"discount\"]*10\n", "columns ['ticket_price', 'discount']\n", "cols None\n" ] } ], "source": [ "df1 = df.cols.set(value='df[\"ticket_price\"]*df[\"discount\"]*10', output_cols=\"new ticket_price\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['customer_id',\n", " 'transactoin_date',\n", " 'ticket_price',\n", " 'discount',\n", " 'product',\n", " 'info']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.names()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idtransactoin_dateticket_pricediscountproductinfonew ticket_price
00f345kjh345oiuy3452010/08/1929.99NaNplatinum1NaN
10fju234978rfjkhsdf2012/01/0529.99NaNplatinum2NaN
20f34ruiy23e78y2r2009/08/1129.99NaNplatinum3NaN
30fue298y2r23r23r22010/08/1929.99NaNplatinum4NaN
40f345kjh345oiuy3462010/08/1929.99NaNplatinum5NaN
50fju234978rfjkhsdf2010/08/199.995%basic6NaN
60f34ruiy23e78y2r2010/08/199.99NaNbasic7NaN
70fue298y2r23r23r32010/08/199.99NaNbasic8NaN
80f345kjh345oiuy3472010/08/199.99NaNbasic9NaN
90fju234978rfjkhsdf2011/08/119.995%basic10NaN
100f34ruiy23e78y2r2015/08/099.995%basic10NaN
110fue298y2r23r23r4NaN9.91NaNbasic10NaN
120f345kjh345oiuy348NaN9.99NaNbasic10NaN
130fju234978rfjkhsdfNaN14.995%deluxe10NaN
140f34ruiy23e78y2rNaN14.99NaNdeluxe10NaN
150fue298y2r23r23r5NaN14.995%deluxe10NaN
160f345kjh345oiuy349NaN14.99NaNdeluxeaNaN
\n", "
" ], "text/plain": [ " customer_id transactoin_date ticket_price discount product info \\\n", "0 0f345kjh345oiuy345 2010/08/19 29.99 NaN platinum 1 \n", "1 0fju234978rfjkhsdf 2012/01/05 29.99 NaN platinum 2 \n", "2 0f34ruiy23e78y2r 2009/08/11 29.99 NaN platinum 3 \n", "3 0fue298y2r23r23r2 2010/08/19 29.99 NaN platinum 4 \n", "4 0f345kjh345oiuy346 2010/08/19 29.99 NaN platinum 5 \n", "5 0fju234978rfjkhsdf 2010/08/19 9.99 5% basic 6 \n", "6 0f34ruiy23e78y2r 2010/08/19 9.99 NaN basic 7 \n", "7 0fue298y2r23r23r3 2010/08/19 9.99 NaN basic 8 \n", "8 0f345kjh345oiuy347 2010/08/19 9.99 NaN basic 9 \n", "9 0fju234978rfjkhsdf 2011/08/11 9.99 5% basic 10 \n", "10 0f34ruiy23e78y2r 2015/08/09 9.99 5% basic 10 \n", "11 0fue298y2r23r23r4 NaN 9.91 NaN basic 10 \n", "12 0f345kjh345oiuy348 NaN 9.99 NaN basic 10 \n", "13 0fju234978rfjkhsdf NaN 14.99 5% deluxe 10 \n", "14 0f34ruiy23e78y2r NaN 14.99 NaN deluxe 10 \n", "15 0fue298y2r23r23r5 NaN 14.99 5% deluxe 10 \n", "16 0f345kjh345oiuy349 NaN 14.99 NaN deluxe a \n", "\n", " new ticket_price \n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "5 NaN \n", "6 NaN \n", "7 NaN \n", "8 NaN \n", "9 NaN \n", "10 NaN \n", "11 NaN \n", "12 NaN \n", "13 NaN \n", "14 NaN \n", "15 NaN \n", "16 NaN " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.compute()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.cols.set(value='mask[\"ticket_price\"]+mask[\"transactoin_date\"]', where='df[\"ticket_price\"]!=None', output_cols=\"new ticket_price_2\").compute()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. 
Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "dtype string\n", "dtype date\n", "dtype decimal\n", "dtype string\n", "dtype string\n", "dtype int\n", "dtypedtype string\n", "dtype date\n", "dtype decimal\n", "dtype string\n", "dtype dtype string\n", "dtype string\n", "string\n", "dtype int\n", "dtype string\n", "dtype datedtype string\n", "\n", "dtype decimal\n", "dtype string\n", "dtype string date\n", "dtype decimal\n", "dtype string\n", "dtype date\n", "dtype decimaldtype string\n", "dtype datedtype\n", "dtypedtypedtype date\n", "dtype decimal\n", "dtype string\n", "dtypedtype date\n", "dtype decimal\n", " \n", "dtype string\n", "\n", "dtype decimal\n", "dtype string\n", " int\n", " stringdtypestring\n", "string\n", "dtype string\n", "dtype date\n", "\n", "dtype int\n", " string\n", "dtype int\n", "dtypedtypedtype string\n", "dtype string\n", "dtype int\n", "dtype decimal\n", "dtype string\n", "dtype string\n", "dtype int\n", " int\n", "string\n", "dtype int\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. 
Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] } ], "source": [ "df = op.load.file(\"http://159.65.217.17:5003/uploads/datasetFile-1590599684769.csv\").ext.cache()\n", "df = df.ext.repartition(8).ext.cache()\n", "_output = df.ext.cast_and_profile(columns=\"*\", output=\"json\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "unmasked = \"DD/MM/YY\"\n", "masked = unmasked[1:4]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'ABB'" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "masked" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"columns\": {\"customer_id\": {\"profiler_dtype\": \"string\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": \"0fju234978rfjkhsdf\", \"count\": 4}, {\"value\": \"0f34ruiy23e78y2r\", \"count\": 4}, {\"value\": \"0fue298y2r23r23r5\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r4\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r3\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r2\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy349\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy348\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy347\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy346\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy345\", \"count\": 1}], \"count_uniques\": 11}, \"dtype\": \"object\"}, \"transactoin_date\": {\"profiler_dtype\": \"date\", \"stats\": {\"mismatch\": 11, \"missing\": 6, \"match\": 0, \"frequency\": [{\"value\": \"2010/08/19\", \"count\": 7}, {\"value\": \"2015/08/09\", \"count\": 1}, {\"value\": \"2012/01/05\", \"count\": 1}, {\"value\": \"2011/08/11\", \"count\": 1}, {\"value\": \"2009/08/11\", \"count\": 1}], \"count_uniques\": 5}, \"dtype\": \"object\"}, \"ticket_price\": 
{\"profiler_dtype\": \"decimal\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"hist\": [{\"lower\": 9.91, \"upper\": 10.5375, \"count\": 0}, {\"lower\": 10.5375, \"upper\": 11.165, \"count\": 0}, {\"lower\": 11.165, \"upper\": 11.7925, \"count\": 0}, {\"lower\": 11.7925, \"upper\": 12.42, \"count\": 0}, {\"lower\": 12.42, \"upper\": 13.0475, \"count\": 0}, {\"lower\": 13.0475, \"upper\": 13.675, \"count\": 0}, {\"lower\": 13.675, \"upper\": 14.3025, \"count\": 0}, {\"lower\": 14.3025, \"upper\": 14.93, \"count\": 0}, {\"lower\": 14.93, \"upper\": 15.5575, \"count\": 3}, {\"lower\": 15.5575, \"upper\": 16.185, \"count\": 0}, {\"lower\": 16.185, \"upper\": 16.8125, \"count\": 0}, {\"lower\": 16.8125, \"upper\": 17.439999999999998, \"count\": 0}, {\"lower\": 17.439999999999998, \"upper\": 18.0675, \"count\": 0}, {\"lower\": 18.0675, \"upper\": 18.695, \"count\": 0}, {\"lower\": 18.695, \"upper\": 19.322499999999998, \"count\": 0}, {\"lower\": 19.322499999999998, \"upper\": 19.95, \"count\": 0}, {\"lower\": 19.95, \"upper\": 20.5775, \"count\": 0}, {\"lower\": 20.5775, \"upper\": 21.205, \"count\": 0}, {\"lower\": 21.205, \"upper\": 21.8325, \"count\": 0}, {\"lower\": 21.8325, \"upper\": 22.46, \"count\": 0}, {\"lower\": 22.46, \"upper\": 23.0875, \"count\": 0}, {\"lower\": 23.0875, \"upper\": 23.715, \"count\": 0}, {\"lower\": 23.715, \"upper\": 24.3425, \"count\": 0}, {\"lower\": 24.3425, \"upper\": 24.97, \"count\": 0}, {\"lower\": 24.97, \"upper\": 25.597499999999997, \"count\": 0}, {\"lower\": 25.597499999999997, \"upper\": 26.224999999999998, \"count\": 0}, {\"lower\": 26.224999999999998, \"upper\": 26.8525, \"count\": 0}, {\"lower\": 26.8525, \"upper\": 27.48, \"count\": 0}, {\"lower\": 27.48, \"upper\": 28.107499999999998, \"count\": 0}, {\"lower\": 28.107499999999998, \"upper\": 28.735, \"count\": 0}, {\"lower\": 28.735, \"upper\": 29.362499999999997, \"count\": 0}, {\"lower\": 29.362499999999997, \"upper\": 29.99, \"count\": 0}], 
\"count_uniques\": 4}, \"dtype\": \"object\"}, \"discount\": {\"profiler_dtype\": \"object\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": \"nan\", \"count\": 12}, {\"value\": \"5%\", \"count\": 5}], \"count_uniques\": 2}, \"dtype\": \"object\"}, \"product\": {\"profiler_dtype\": \"string\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": \"basic\", \"count\": 8}, {\"value\": \"platinum\", \"count\": 5}, {\"value\": \"deluxe\", \"count\": 4}], \"count_uniques\": 3}, \"dtype\": \"object\"}, \"info\": {\"profiler_dtype\": \"int\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"hist\": [{\"lower\": 1.0, \"upper\": 1.28125, \"count\": 0}, {\"lower\": 1.28125, \"upper\": 1.5625, \"count\": 0}, {\"lower\": 1.5625, \"upper\": 1.84375, \"count\": 0}, {\"lower\": 1.84375, \"upper\": 2.125, \"count\": 0}, {\"lower\": 2.125, \"upper\": 2.40625, \"count\": 0}, {\"lower\": 2.40625, \"upper\": 2.6875, \"count\": 0}, {\"lower\": 2.6875, \"upper\": 2.96875, \"count\": 0}, {\"lower\": 2.96875, \"upper\": 3.25, \"count\": 0}, {\"lower\": 3.25, \"upper\": 3.53125, \"count\": 0}, {\"lower\": 3.53125, \"upper\": 3.8125, \"count\": 0}, {\"lower\": 3.8125, \"upper\": 4.09375, \"count\": 0}, {\"lower\": 4.09375, \"upper\": 4.375, \"count\": 0}, {\"lower\": 4.375, \"upper\": 4.65625, \"count\": 0}, {\"lower\": 4.65625, \"upper\": 4.9375, \"count\": 0}, {\"lower\": 4.9375, \"upper\": 5.21875, \"count\": 0}, {\"lower\": 5.21875, \"upper\": 5.5, \"count\": 0}, {\"lower\": 5.5, \"upper\": 5.78125, \"count\": 0}, {\"lower\": 5.78125, \"upper\": 6.0625, \"count\": 0}, {\"lower\": 6.0625, \"upper\": 6.34375, \"count\": 0}, {\"lower\": 6.34375, \"upper\": 6.625, \"count\": 0}, {\"lower\": 6.625, \"upper\": 6.90625, \"count\": 0}, {\"lower\": 6.90625, \"upper\": 7.1875, \"count\": 0}, {\"lower\": 7.1875, \"upper\": 7.46875, \"count\": 0}, {\"lower\": 7.46875, \"upper\": 7.75, \"count\": 0}, {\"lower\": 
7.75, \"upper\": 8.03125, \"count\": 0}, {\"lower\": 8.03125, \"upper\": 8.3125, \"count\": 0}, {\"lower\": 8.3125, \"upper\": 8.59375, \"count\": 0}, {\"lower\": 8.59375, \"upper\": 8.875, \"count\": 0}, {\"lower\": 8.875, \"upper\": 9.15625, \"count\": 0}, {\"lower\": 9.15625, \"upper\": 9.4375, \"count\": 0}, {\"lower\": 9.4375, \"upper\": 9.71875, \"count\": 0}, {\"lower\": 9.71875, \"upper\": 10.0, \"count\": 3}], \"count_uniques\": 10}, \"dtype\": \"object\"}}, \"name\": null, \"file_name\": \"tmpk28b8e9o.csv\", \"summary\": {\"cols_count\": 6, \"rows_count\": 17, \"dtypes_list\": [\"object\"], \"total_count_dtypes\": 1, \"missing_count\": 0, \"p_missing\": 0.0}}\n" ] } ], "source": [ "print(_output)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# cols_and_inferred_dtype = df.cols.infer_profiler_dtypes(\"*\")\n", "# print(cols_and_inferred_dtype)\n", "# df = df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
INCIDENT_NUMBEROFFENSE_CODEOFFENSE_CODE_GROUPOFFENSE_DESCRIPTIONDISTRICTREPORTING_AREASHOOTINGOCCURRED_ON_DATEYEARMONTHDAY_OF_WEEKHOURUCR_PARTSTREETLatLongLocation
0I18207094500619LarcenyLARCENY ALL OTHERSD14808NaN2018-09-02 13:00:0020189Sunday13Part OneLINCOLN ST42.35779134-71.13937053(42.35779134, -71.13937053)
1I18207094301402VandalismVANDALISMC11347NaN2018-08-21 00:00:0020188Tuesday0Part TwoHECLA ST42.30682138-71.06030035(42.30682138, -71.06030035)
2I18207094103410TowedTOWED MOTOR VEHICLED4151NaN2018-09-03 19:27:0020189Monday19Part ThreeCAZENOVE ST42.34658879-71.07242943(42.34658879, -71.07242943)
3I18207094003114Investigate PropertyINVESTIGATE PROPERTYD4272NaN2018-09-03 21:16:0020189Monday21Part ThreeNEWCOMB ST42.33418175-71.07866441(42.33418175, -71.07866441)
4I18207093803114Investigate PropertyINVESTIGATE PROPERTYB3421NaN2018-09-03 21:05:0020189Monday21Part ThreeDELHI ST42.27536542-71.09036101(42.27536542, -71.09036101)
......................................................
319068I050310906-0003125Warrant ArrestsWARRANT ARRESTD4285NaN2016-06-05 17:25:0020166Sunday17Part ThreeCOVENTRY ST42.33695098-71.08574813(42.33695098, -71.08574813)
319069I030217815-0800111HomicideMURDER, NON-NEGLIGIENT MANSLAUGHTERE18520NaN2015-07-09 13:38:0020157Thursday13Part OneRIVER ST42.25592648-71.12317207(42.25592648, -71.12317207)
319070I030217815-0803125Warrant ArrestsWARRANT ARRESTE18520NaN2015-07-09 13:38:0020157Thursday13Part ThreeRIVER ST42.25592648-71.12317207(42.25592648, -71.12317207)
319071I010370257-0003125Warrant ArrestsWARRANT ARRESTE13569NaN2016-05-31 19:35:0020165Tuesday19Part ThreeNEW WASHINGTON ST42.30233307-71.11156487(42.30233307, -71.11156487)
31907214205255003125Warrant ArrestsWARRANT ARRESTD4903NaN2015-06-22 00:12:0020156Monday0Part ThreeWASHINGTON ST42.33383935-71.08029038(42.33383935, -71.08029038)
\n", "

319073 rows × 17 columns

\n", "
" ], "text/plain": [ " INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP \\\n", "0 I182070945 00619 Larceny \n", "1 I182070943 01402 Vandalism \n", "2 I182070941 03410 Towed \n", "3 I182070940 03114 Investigate Property \n", "4 I182070938 03114 Investigate Property \n", "... ... ... ... \n", "319068 I050310906-00 03125 Warrant Arrests \n", "319069 I030217815-08 00111 Homicide \n", "319070 I030217815-08 03125 Warrant Arrests \n", "319071 I010370257-00 03125 Warrant Arrests \n", "319072 142052550 03125 Warrant Arrests \n", "\n", " OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING \\\n", "0 LARCENY ALL OTHERS D14 808 NaN \n", "1 VANDALISM C11 347 NaN \n", "2 TOWED MOTOR VEHICLE D4 151 NaN \n", "3 INVESTIGATE PROPERTY D4 272 NaN \n", "4 INVESTIGATE PROPERTY B3 421 NaN \n", "... ... ... ... ... \n", "319068 WARRANT ARREST D4 285 NaN \n", "319069 MURDER, NON-NEGLIGIENT MANSLAUGHTER E18 520 NaN \n", "319070 WARRANT ARREST E18 520 NaN \n", "319071 WARRANT ARREST E13 569 NaN \n", "319072 WARRANT ARREST D4 903 NaN \n", "\n", " OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART \\\n", "0 2018-09-02 13:00:00 2018 9 Sunday 13 Part One \n", "1 2018-08-21 00:00:00 2018 8 Tuesday 0 Part Two \n", "2 2018-09-03 19:27:00 2018 9 Monday 19 Part Three \n", "3 2018-09-03 21:16:00 2018 9 Monday 21 Part Three \n", "4 2018-09-03 21:05:00 2018 9 Monday 21 Part Three \n", "... ... ... ... ... ... ... \n", "319068 2016-06-05 17:25:00 2016 6 Sunday 17 Part Three \n", "319069 2015-07-09 13:38:00 2015 7 Thursday 13 Part One \n", "319070 2015-07-09 13:38:00 2015 7 Thursday 13 Part Three \n", "319071 2016-05-31 19:35:00 2016 5 Tuesday 19 Part Three \n", "319072 2015-06-22 00:12:00 2015 6 Monday 0 Part Three \n", "\n", " STREET Lat Long \\\n", "0 LINCOLN ST 42.35779134 -71.13937053 \n", "1 HECLA ST 42.30682138 -71.06030035 \n", "2 CAZENOVE ST 42.34658879 -71.07242943 \n", "3 NEWCOMB ST 42.33418175 -71.07866441 \n", "4 DELHI ST 42.27536542 -71.09036101 \n", "... ... ... ... 
\n", "319068 COVENTRY ST 42.33695098 -71.08574813 \n", "319069 RIVER ST 42.25592648 -71.12317207 \n", "319070 RIVER ST 42.25592648 -71.12317207 \n", "319071 NEW WASHINGTON ST 42.30233307 -71.11156487 \n", "319072 WASHINGTON ST 42.33383935 -71.08029038 \n", "\n", " Location \n", "0 (42.35779134, -71.13937053) \n", "1 (42.30682138, -71.06030035) \n", "2 (42.34658879, -71.07242943) \n", "3 (42.33418175, -71.07866441) \n", "4 (42.27536542, -71.09036101) \n", "... ... \n", "319068 (42.33695098, -71.08574813) \n", "319069 (42.25592648, -71.12317207) \n", "319070 (42.25592648, -71.12317207) \n", "319071 (42.30233307, -71.11156487) \n", "319072 (42.33383935, -71.08029038) \n", "\n", "[319073 rows x 17 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.compute()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 589 ms\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
INCIDENT_NUMBEROFFENSE_CODEOFFENSE_CODE_GROUPOFFENSE_DESCRIPTIONDISTRICTREPORTING_AREASHOOTINGOCCURRED_ON_DATEYEARMONTHDAY_OF_WEEKHOURUCR_PARTSTREETLatLongLocation
0NaN00619LarcenyLARCENY ALL OTHERSD14808NaN2018-09-02 13:00:0020189Sunday13Part OneLINCOLN ST42.35779134-71.13937053(42.35779134, -71.13937053)
1NaN01402VandalismVANDALISMC11347NaN2018-08-21 00:00:0020188Tuesday0Part TwoHECLA ST42.30682138-71.06030035(42.30682138, -71.06030035)
2NaN03410TowedTOWED MOTOR VEHICLED4151NaN2018-09-03 19:27:0020189Monday19Part ThreeCAZENOVE ST42.34658879-71.07242943(42.34658879, -71.07242943)
3NaN03114Investigate PropertyINVESTIGATE PROPERTYD4272NaN2018-09-03 21:16:0020189Monday21Part ThreeNEWCOMB ST42.33418175-71.07866441(42.33418175, -71.07866441)
4NaN03114Investigate PropertyINVESTIGATE PROPERTYB3421NaN2018-09-03 21:05:0020189Monday21Part ThreeDELHI ST42.27536542-71.09036101(42.27536542, -71.09036101)
......................................................
319068NaN03125Warrant ArrestsWARRANT ARRESTD4285NaN2016-06-05 17:25:0020166Sunday17Part ThreeCOVENTRY ST42.33695098-71.08574813(42.33695098, -71.08574813)
319069NaN00111HomicideMURDER, NON-NEGLIGIENT MANSLAUGHTERE18520NaN2015-07-09 13:38:0020157Thursday13Part OneRIVER ST42.25592648-71.12317207(42.25592648, -71.12317207)
319070NaN03125Warrant ArrestsWARRANT ARRESTE18520NaN2015-07-09 13:38:0020157Thursday13Part ThreeRIVER ST42.25592648-71.12317207(42.25592648, -71.12317207)
319071NaN03125Warrant ArrestsWARRANT ARRESTE13569NaN2016-05-31 19:35:0020165Tuesday19Part ThreeNEW WASHINGTON ST42.30233307-71.11156487(42.30233307, -71.11156487)
31907214205255003125Warrant ArrestsWARRANT ARRESTD4903NaN2015-06-22 00:12:0020156Monday0Part ThreeWASHINGTON ST42.33383935-71.08029038(42.33383935, -71.08029038)
\n", "

319073 rows × 17 columns

\n", "
" ], "text/plain": [ " INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP \\\n", "0 NaN 00619 Larceny \n", "1 NaN 01402 Vandalism \n", "2 NaN 03410 Towed \n", "3 NaN 03114 Investigate Property \n", "4 NaN 03114 Investigate Property \n", "... ... ... ... \n", "319068 NaN 03125 Warrant Arrests \n", "319069 NaN 00111 Homicide \n", "319070 NaN 03125 Warrant Arrests \n", "319071 NaN 03125 Warrant Arrests \n", "319072 142052550 03125 Warrant Arrests \n", "\n", " OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING \\\n", "0 LARCENY ALL OTHERS D14 808 NaN \n", "1 VANDALISM C11 347 NaN \n", "2 TOWED MOTOR VEHICLE D4 151 NaN \n", "3 INVESTIGATE PROPERTY D4 272 NaN \n", "4 INVESTIGATE PROPERTY B3 421 NaN \n", "... ... ... ... ... \n", "319068 WARRANT ARREST D4 285 NaN \n", "319069 MURDER, NON-NEGLIGIENT MANSLAUGHTER E18 520 NaN \n", "319070 WARRANT ARREST E18 520 NaN \n", "319071 WARRANT ARREST E13 569 NaN \n", "319072 WARRANT ARREST D4 903 NaN \n", "\n", " OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART \\\n", "0 2018-09-02 13:00:00 2018 9 Sunday 13 Part One \n", "1 2018-08-21 00:00:00 2018 8 Tuesday 0 Part Two \n", "2 2018-09-03 19:27:00 2018 9 Monday 19 Part Three \n", "3 2018-09-03 21:16:00 2018 9 Monday 21 Part Three \n", "4 2018-09-03 21:05:00 2018 9 Monday 21 Part Three \n", "... ... ... ... ... ... ... \n", "319068 2016-06-05 17:25:00 2016 6 Sunday 17 Part Three \n", "319069 2015-07-09 13:38:00 2015 7 Thursday 13 Part One \n", "319070 2015-07-09 13:38:00 2015 7 Thursday 13 Part Three \n", "319071 2016-05-31 19:35:00 2016 5 Tuesday 19 Part Three \n", "319072 2015-06-22 00:12:00 2015 6 Monday 0 Part Three \n", "\n", " STREET Lat Long \\\n", "0 LINCOLN ST 42.35779134 -71.13937053 \n", "1 HECLA ST 42.30682138 -71.06030035 \n", "2 CAZENOVE ST 42.34658879 -71.07242943 \n", "3 NEWCOMB ST 42.33418175 -71.07866441 \n", "4 DELHI ST 42.27536542 -71.09036101 \n", "... ... ... ... 
\n", "319068 COVENTRY ST 42.33695098 -71.08574813 \n", "319069 RIVER ST 42.25592648 -71.12317207 \n", "319070 RIVER ST 42.25592648 -71.12317207 \n", "319071 NEW WASHINGTON ST 42.30233307 -71.11156487 \n", "319072 WASHINGTON ST 42.33383935 -71.08029038 \n", "\n", " Location \n", "0 (42.35779134, -71.13937053) \n", "1 (42.30682138, -71.06030035) \n", "2 (42.34658879, -71.07242943) \n", "3 (42.33418175, -71.07866441) \n", "4 (42.27536542, -71.09036101) \n", "... ... \n", "319068 (42.33695098, -71.08574813) \n", "319069 (42.25592648, -71.12317207) \n", "319070 (42.25592648, -71.12317207) \n", "319071 (42.30233307, -71.11156487) \n", "319072 (42.33383935, -71.08029038) \n", "\n", "[319073 rows x 17 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "df.cols.cast(columns = {\"INCIDENT_NUMBER\":\"int\"}).compute()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 9.43 s\n" ] }, { "data": { "text/plain": [ "'{\"columns\": {\"INCIDENT_NUMBER\": {\"profiler_dtype\": \"string\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 319073, \"frequency\": [{\"value\": \"I162030584\", \"count\": 13}, {\"value\": \"I152080623\", \"count\": 11}, {\"value\": \"I172013170\", \"count\": 10}, {\"value\": \"I182065208\", \"count\": 10}, {\"value\": \"I172096394\", \"count\": 10}, {\"value\": \"I162071327\", \"count\": 9}, {\"value\": \"I162001871\", \"count\": 9}, {\"value\": \"I172056883\", \"count\": 9}, {\"value\": \"I172022524\", \"count\": 9}, {\"value\": \"I172054429\", \"count\": 9}, {\"value\": \"I162098170\", \"count\": 9}, {\"value\": \"I162078338\", \"count\": 8}, {\"value\": \"I162074826\", \"count\": 8}, {\"value\": \"I162090278\", \"count\": 8}, {\"value\": \"I172069723\", \"count\": 8}, {\"value\": \"I162022140\", \"count\": 8}, {\"value\": \"I130041200-00\", \"count\": 8}, {\"value\": \"I162087224\", 
\"count\": 8}, {\"value\": \"I152076465\", \"count\": 8}, {\"value\": \"I162056703\", \"count\": 8}, {\"value\": \"I162064331\", \"count\": 8}, {\"value\": \"I162082917\", \"count\": 8}, {\"value\": \"I152105431\", \"count\": 8}, {\"value\": \"I172053616\", \"count\": 8}, {\"value\": \"I152101399\", \"count\": 7}, {\"value\": \"I152095733\", \"count\": 7}, {\"value\": \"I162054378\", \"count\": 7}, {\"value\": \"I152067057\", \"count\": 7}, {\"value\": \"I152096998\", \"count\": 7}, {\"value\": \"I172018004\", \"count\": 7}, {\"value\": \"I162083089\", \"count\": 7}, {\"value\": \"I162045680\", \"count\": 7}, {\"value\": \"I152071480\", \"count\": 7}], \"count_uniques\": 282517}, \"dtype\": \"object\"}, \"OFFENSE_CODE\": {\"profiler_dtype\": \"zip_code\", \"stats\": {\"mismatch\": 251, \"missing\": 0, \"match\": 318822, \"frequency\": [{\"value\": \"03006\", \"count\": 18783}, {\"value\": \"03115\", \"count\": 18746}, {\"value\": \"03831\", \"count\": 16323}, {\"value\": \"01402\", \"count\": 15152}, {\"value\": \"00802\", \"count\": 14791}, {\"value\": \"03301\", \"count\": 13099}, {\"value\": \"03410\", \"count\": 11287}, {\"value\": \"03114\", \"count\": 11123}, {\"value\": \"00617\", \"count\": 9069}, {\"value\": \"02647\", \"count\": 9039}, {\"value\": \"00614\", \"count\": 8893}, {\"value\": \"03201\", \"count\": 8892}, {\"value\": \"03125\", \"count\": 8389}, {\"value\": \"00613\", \"count\": 7949}, {\"value\": \"03802\", \"count\": 6557}, {\"value\": \"00619\", \"count\": 5963}, {\"value\": \"03803\", \"count\": 5131}, {\"value\": \"00413\", \"count\": 4886}, {\"value\": \"01102\", \"count\": 4413}, {\"value\": \"03502\", \"count\": 4365}, {\"value\": \"02629\", \"count\": 4007}, {\"value\": \"03501\", \"count\": 3766}, {\"value\": \"03207\", \"count\": 3698}, {\"value\": \"00724\", \"count\": 3629}, {\"value\": \"02610\", \"count\": 3245}, {\"value\": \"01106\", \"count\": 3147}, {\"value\": \"00301\", \"count\": 3056}, {\"value\": \"03801\", \"count\": 
2925}, {\"value\": \"00423\", \"count\": 2910}, {\"value\": \"02900\", \"count\": 2894}, {\"value\": \"02907\", \"count\": 2616}, {\"value\": \"00520\", \"count\": 2585}, {\"value\": \"01849\", \"count\": 2584}], \"count_uniques\": 263}, \"dtype\": \"object\"}, \"OFFENSE_CODE_GROUP\": {\"profiler_dtype\": \"string\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 319073, \"frequency\": [{\"value\": \"Motor Vehicle Accident Response\", \"count\": 37132}, {\"value\": \"Larceny\", \"count\": 25935}, {\"value\": \"Medical Assistance\", \"count\": 23540}, {\"value\": \"Investigate Person\", \"count\": 18750}, {\"value\": \"Other\", \"count\": 18075}, {\"value\": \"Drug Violation\", \"count\": 16548}, {\"value\": \"Simple Assault\", \"count\": 15826}, {\"value\": \"Vandalism\", \"count\": 15415}, {\"value\": \"Verbal Disputes\", \"count\": 13099}, {\"value\": \"Towed\", \"count\": 11287}, {\"value\": \"Investigate Property\", \"count\": 11124}, {\"value\": \"Larceny From Motor Vehicle\", \"count\": 10847}, {\"value\": \"Property Lost\", \"count\": 9751}, {\"value\": \"Warrant Arrests\", \"count\": 8407}, {\"value\": \"Aggravated Assault\", \"count\": 7807}, {\"value\": \"Violations\", \"count\": 6095}, {\"value\": \"Fraud\", \"count\": 5829}, {\"value\": \"Residential Burglary\", \"count\": 5606}, {\"value\": \"Missing Person Located\", \"count\": 4958}, {\"value\": \"Auto Theft\", \"count\": 4851}, {\"value\": \"Robbery\", \"count\": 4624}, {\"value\": \"Harassment\", \"count\": 4007}, {\"value\": \"Property Found\", \"count\": 3925}, {\"value\": \"Missing Person Reported\", \"count\": 3797}, {\"value\": \"Confidence Games\", \"count\": 3147}, {\"value\": \"Police Service Incidents\", \"count\": 2781}, {\"value\": \"Disorderly Conduct\", \"count\": 2611}, {\"value\": \"Fire Related Reports\", \"count\": 1920}, {\"value\": \"Firearm Violations\", \"count\": 1777}, {\"value\": \"License Violation\", \"count\": 1701}, {\"value\": \"Restraining Order Violations\", 
\"count\": 1607}, {\"value\": \"Recovered Stolen Property\", \"count\": 1455}, {\"value\": \"Counterfeiting\", \"count\": 1454}], \"count_uniques\": 67}, \"dtype\": \"object\"}, \"OFFENSE_DESCRIPTION\": {\"profiler_dtype\": \"string\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 319073, \"frequency\": [{\"value\": \"SICK/INJURED/MEDICAL - PERSON\", \"count\": 18783}, {\"value\": \"INVESTIGATE PERSON\", \"count\": 18754}, {\"value\": \"M/V - LEAVING SCENE - PROPERTY DAMAGE\", \"count\": 16323}, {\"value\": \"VANDALISM\", \"count\": 15154}, {\"value\": \"ASSAULT SIMPLE - BATTERY\", \"count\": 14791}, {\"value\": \"VERBAL DISPUTE\", \"count\": 13099}, {\"value\": \"TOWED MOTOR VEHICLE\", \"count\": 11287}, {\"value\": \"INVESTIGATE PROPERTY\", \"count\": 11124}, {\"value\": \"LARCENY THEFT FROM BUILDING\", \"count\": 9069}, {\"value\": \"THREATS TO DO BODILY HARM\", \"count\": 9042}, {\"value\": \"LARCENY THEFT FROM MV - NON-ACCESSORY\", \"count\": 8893}, {\"value\": \"PROPERTY - LOST\", \"count\": 8893}, {\"value\": \"WARRANT ARREST\", \"count\": 8407}, {\"value\": \"LARCENY SHOPLIFTING\", \"count\": 7949}, {\"value\": \"M/V ACCIDENT - PROPERTY \\\\u00a0DAMAGE\", \"count\": 6557}, {\"value\": \"LARCENY ALL OTHERS\", \"count\": 5963}, {\"value\": \"M/V ACCIDENT - PERSONAL INJURY\", \"count\": 5131}, {\"value\": \"ASSAULT - AGGRAVATED - BATTERY\", \"count\": 4886}, {\"value\": \"FRAUD - FALSE PRETENSE / SCHEME\", \"count\": 4413}, {\"value\": \"MISSING PERSON - LOCATED\", \"count\": 4365}, {\"value\": \"HARASSMENT\", \"count\": 4007}, {\"value\": \"MISSING PERSON\", \"count\": 3766}, {\"value\": \"PROPERTY - FOUND\", \"count\": 3698}, {\"value\": \"AUTO THEFT\", \"count\": 3630}, {\"value\": \"TRESPASSING\", \"count\": 3254}, {\"value\": \"FRAUD - CREDIT CARD / ATM FRAUD\", \"count\": 3147}, {\"value\": \"ROBBERY - STREET\", \"count\": 3056}, {\"value\": \"M/V ACCIDENT - OTHER\", \"count\": 2925}, {\"value\": \"ASSAULT - AGGRAVATED\", \"count\": 2910}, 
{\"value\": \"VAL - VIOLATION OF AUTO LAW - OTHER\", \"count\": 2894}, {\"value\": \"VAL - OPERATING AFTER REV/SUSP.\", \"count\": 2618}, {\"value\": \"DRUGS - POSS CLASS B - COCAINE, ETC.\", \"count\": 2591}, {\"value\": \"BURGLARY - RESIDENTIAL - FORCE\", \"count\": 2585}], \"count_uniques\": 244}, \"dtype\": \"object\"}, \"DISTRICT\": {\"profiler_dtype\": \"string\", \"stats\": {\"mismatch\": 0, \"missing\": 1765, \"match\": 317308, \"frequency\": [{\"value\": \"B2\", \"count\": 49945}, {\"value\": \"C11\", \"count\": 42530}, {\"value\": \"D4\", \"count\": 41915}, {\"value\": \"A1\", \"count\": 35717}, {\"value\": \"B3\", \"count\": 35442}, {\"value\": \"C6\", \"count\": 23460}, {\"value\": \"D14\", \"count\": 20127}, {\"value\": \"E13\", \"count\": 17536}, {\"value\": \"E18\", \"count\": 17348}, {\"value\": \"A7\", \"count\": 13544}, {\"value\": \"E5\", \"count\": 13239}, {\"value\": \"A15\", \"count\": 6505}], \"count_uniques\": 12}, \"dtype\": \"object\"}, \"REPORTING_AREA\": {\"profiler_dtype\": \"int\", \"stats\": {\"mismatch\": 0, \"missing\": 20250, \"match\": 298823, \"hist\": [{\"lower\": 0.0, \"upper\": 30.0625, \"count\": 1502}, {\"lower\": 30.0625, \"upper\": 60.125, \"count\": 740}, {\"lower\": 60.125, \"upper\": 90.1875, \"count\": 1376}, {\"lower\": 90.1875, \"upper\": 120.25, \"count\": 2618}, {\"lower\": 120.25, \"upper\": 150.3125, \"count\": 2175}, {\"lower\": 150.3125, \"upper\": 180.375, \"count\": 2034}, {\"lower\": 180.375, \"upper\": 210.4375, \"count\": 1349}, {\"lower\": 210.4375, \"upper\": 240.5, \"count\": 1116}, {\"lower\": 240.5, \"upper\": 270.5625, \"count\": 1501}, {\"lower\": 270.5625, \"upper\": 300.625, \"count\": 1968}, {\"lower\": 300.625, \"upper\": 330.6875, \"count\": 2684}, {\"lower\": 330.6875, \"upper\": 360.75, \"count\": 2336}, {\"lower\": 360.75, \"upper\": 390.8125, \"count\": 939}, {\"lower\": 390.8125, \"upper\": 420.875, \"count\": 1051}, {\"lower\": 420.875, \"upper\": 450.9375, \"count\": 2021}, {\"lower\": 
450.9375, \"upper\": 481.0, \"count\": 1848}, {\"lower\": 481.0, \"upper\": 511.0625, \"count\": 996}, {\"lower\": 511.0625, \"upper\": 541.125, \"count\": 804}, {\"lower\": 541.125, \"upper\": 571.1875, \"count\": 897}, {\"lower\": 571.1875, \"upper\": 601.25, \"count\": 1146}, {\"lower\": 601.25, \"upper\": 631.3125, \"count\": 1610}, {\"lower\": 631.3125, \"upper\": 661.375, \"count\": 537}, {\"lower\": 661.375, \"upper\": 691.4375, \"count\": 423}, {\"lower\": 691.4375, \"upper\": 721.5, \"count\": 277}, {\"lower\": 721.5, \"upper\": 751.5625, \"count\": 337}, {\"lower\": 751.5625, \"upper\": 781.625, \"count\": 678}, {\"lower\": 781.625, \"upper\": 811.6875, \"count\": 1466}, {\"lower\": 811.6875, \"upper\": 841.75, \"count\": 455}, {\"lower\": 841.75, \"upper\": 871.8125, \"count\": 0}, {\"lower\": 871.8125, \"upper\": 901.875, \"count\": 245}, {\"lower\": 901.875, \"upper\": 931.9375, \"count\": 1546}, {\"lower\": 931.9375, \"upper\": 962.0, \"count\": 1210}], \"count_uniques\": 878}, \"dtype\": \"object\"}, \"SHOOTING\": {\"profiler_dtype\": \"string\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 319073, \"frequency\": [{\"value\": \"nan\", \"count\": 318054}, {\"value\": \"Y\", \"count\": 1019}], \"count_uniques\": 2}, \"dtype\": \"object\"}, \"OCCURRED_ON_DATE\": {\"profiler_dtype\": \"date\", \"stats\": {\"mismatch\": 319073, \"missing\": 0, \"match\": 0, \"frequency\": [{\"value\": \"2017-06-01 00:00:00+00:00\", \"count\": 29}, {\"value\": \"2015-07-01 00:00:00+00:00\", \"count\": 27}, {\"value\": \"2016-08-01 00:00:00+00:00\", \"count\": 27}, {\"value\": \"2015-06-18 05:00:00+00:00\", \"count\": 22}, {\"value\": \"2017-08-01 00:00:00+00:00\", \"count\": 22}, {\"value\": \"2017-01-01 00:00:00+00:00\", \"count\": 21}, {\"value\": \"2016-04-01 00:00:00+00:00\", \"count\": 20}, {\"value\": \"2017-05-01 00:00:00+00:00\", \"count\": 20}, {\"value\": \"2015-12-07 11:38:00+00:00\", \"count\": 20}, {\"value\": \"2017-04-01 00:00:00+00:00\", 
\"count\": 19}, {\"value\": \"2016-09-01 00:00:00+00:00\", \"count\": 19}, {\"value\": \"2018-01-01 00:00:00+00:00\", \"count\": 18}, {\"value\": \"2016-11-01 00:00:00+00:00\", \"count\": 18}, {\"value\": \"2018-06-04 12:40:00+00:00\", \"count\": 18}, {\"value\": \"2015-12-01 00:00:00+00:00\", \"count\": 18}, {\"value\": \"2017-07-05 00:00:00+00:00\", \"count\": 18}, {\"value\": \"2017-11-01 00:00:00+00:00\", \"count\": 17}, {\"value\": \"2018-02-01 00:00:00+00:00\", \"count\": 16}, {\"value\": \"2016-02-01 00:00:00+00:00\", \"count\": 16}, {\"value\": \"2018-07-19 00:00:00+00:00\", \"count\": 16}, {\"value\": \"2017-03-01 00:00:00+00:00\", \"count\": 15}, {\"value\": \"2018-03-07 06:00:00+00:00\", \"count\": 15}, {\"value\": \"2018-08-18 08:45:00+00:00\", \"count\": 15}, {\"value\": \"2017-10-01 00:00:00+00:00\", \"count\": 15}, {\"value\": \"2016-12-15 06:00:00+00:00\", \"count\": 14}, {\"value\": \"2015-06-20 00:00:00+00:00\", \"count\": 14}, {\"value\": \"2016-06-24 17:30:00+00:00\", \"count\": 14}, {\"value\": \"2016-11-10 16:00:00+00:00\", \"count\": 13}, {\"value\": \"2018-04-02 00:00:00+00:00\", \"count\": 13}, {\"value\": \"2015-12-22 20:25:00+00:00\", \"count\": 13}, {\"value\": \"2018-01-03 18:00:00+00:00\", \"count\": 13}, {\"value\": \"2015-07-17 00:00:00+00:00\", \"count\": 13}, {\"value\": \"2016-04-20 11:07:00+00:00\", \"count\": 13}], \"count_uniques\": 233229}, \"dtype\": \"object\"}, \"YEAR\": {\"profiler_dtype\": \"int\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 319073, \"hist\": [{\"lower\": 2015.0, \"upper\": 2015.09375, \"count\": 39649}, {\"lower\": 2015.09375, \"upper\": 2015.1875, \"count\": 0}, {\"lower\": 2015.1875, \"upper\": 2015.28125, \"count\": 0}, {\"lower\": 2015.28125, \"upper\": 2015.375, \"count\": 0}, {\"lower\": 2015.375, \"upper\": 2015.46875, \"count\": 0}, {\"lower\": 2015.46875, \"upper\": 2015.5625, \"count\": 0}, {\"lower\": 2015.5625, \"upper\": 2015.65625, \"count\": 0}, {\"lower\": 2015.65625, 
\"upper\": 2015.75, \"count\": 0}, {\"lower\": 2015.75, \"upper\": 2015.84375, \"count\": 0}, {\"lower\": 2015.84375, \"upper\": 2015.9375, \"count\": 0}, {\"lower\": 2015.9375, \"upper\": 2016.03125, \"count\": 184}, {\"lower\": 2016.03125, \"upper\": 2016.125, \"count\": 0}, {\"lower\": 2016.125, \"upper\": 2016.21875, \"count\": 0}, {\"lower\": 2016.21875, \"upper\": 2016.3125, \"count\": 0}, {\"lower\": 2016.3125, \"upper\": 2016.40625, \"count\": 0}, {\"lower\": 2016.40625, \"upper\": 2016.5, \"count\": 0}, {\"lower\": 2016.5, \"upper\": 2016.59375, \"count\": 0}, {\"lower\": 2016.59375, \"upper\": 2016.6875, \"count\": 0}, {\"lower\": 2016.6875, \"upper\": 2016.78125, \"count\": 0}, {\"lower\": 2016.78125, \"upper\": 2016.875, \"count\": 0}, {\"lower\": 2016.875, \"upper\": 2016.96875, \"count\": 0}, {\"lower\": 2016.96875, \"upper\": 2017.0625, \"count\": 39}, {\"lower\": 2017.0625, \"upper\": 2017.15625, \"count\": 0}, {\"lower\": 2017.15625, \"upper\": 2017.25, \"count\": 0}, {\"lower\": 2017.25, \"upper\": 2017.34375, \"count\": 0}, {\"lower\": 2017.34375, \"upper\": 2017.4375, \"count\": 0}, {\"lower\": 2017.4375, \"upper\": 2017.53125, \"count\": 0}, {\"lower\": 2017.53125, \"upper\": 2017.625, \"count\": 0}, {\"lower\": 2017.625, \"upper\": 2017.71875, \"count\": 0}, {\"lower\": 2017.71875, \"upper\": 2017.8125, \"count\": 0}, {\"lower\": 2017.8125, \"upper\": 2017.90625, \"count\": 0}, {\"lower\": 2017.90625, \"upper\": 2018.0, \"count\": 13}], \"count_uniques\": 4}, \"dtype\": \"object\"}, \"MONTH\": {\"profiler_dtype\": \"int\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 319073, \"hist\": [{\"lower\": 1.0, \"upper\": 1.34375, \"count\": 55}, {\"lower\": 1.34375, \"upper\": 1.6875, \"count\": 0}, {\"lower\": 1.6875, \"upper\": 2.03125, \"count\": 28}, {\"lower\": 2.03125, \"upper\": 2.375, \"count\": 0}, {\"lower\": 2.375, \"upper\": 2.71875, \"count\": 0}, {\"lower\": 2.71875, \"upper\": 3.0625, \"count\": 31}, {\"lower\": 3.0625, 
\"upper\": 3.40625, \"count\": 0}, {\"lower\": 3.40625, \"upper\": 3.75, \"count\": 0}, {\"lower\": 3.75, \"upper\": 4.09375, \"count\": 30}, {\"lower\": 4.09375, \"upper\": 4.4375, \"count\": 0}, {\"lower\": 4.4375, \"upper\": 4.78125, \"count\": 0}, {\"lower\": 4.78125, \"upper\": 5.125, \"count\": 16}, {\"lower\": 5.125, \"upper\": 5.46875, \"count\": 0}, {\"lower\": 5.46875, \"upper\": 5.8125, \"count\": 0}, {\"lower\": 5.8125, \"upper\": 6.15625, \"count\": 4184}, {\"lower\": 6.15625, \"upper\": 6.5, \"count\": 0}, {\"lower\": 6.5, \"upper\": 6.84375, \"count\": 0}, {\"lower\": 6.84375, \"upper\": 7.1875, \"count\": 8293}, {\"lower\": 7.1875, \"upper\": 7.53125, \"count\": 0}, {\"lower\": 7.53125, \"upper\": 7.875, \"count\": 0}, {\"lower\": 7.875, \"upper\": 8.21875, \"count\": 8298}, {\"lower\": 8.21875, \"upper\": 8.5625, \"count\": 0}, {\"lower\": 8.5625, \"upper\": 8.90625, \"count\": 0}, {\"lower\": 8.90625, \"upper\": 9.25, \"count\": 8334}, {\"lower\": 9.25, \"upper\": 9.59375, \"count\": 0}, {\"lower\": 9.59375, \"upper\": 9.9375, \"count\": 0}, {\"lower\": 9.9375, \"upper\": 10.28125, \"count\": 8151}, {\"lower\": 10.28125, \"upper\": 10.625, \"count\": 0}, {\"lower\": 10.625, \"upper\": 10.96875, \"count\": 0}, {\"lower\": 10.96875, \"upper\": 11.3125, \"count\": 2392}, {\"lower\": 11.3125, \"upper\": 11.65625, \"count\": 0}, {\"lower\": 11.65625, \"upper\": 12.0, \"count\": 73}], \"count_uniques\": 12}, \"dtype\": \"object\"}, \"DAY_OF_WEEK\": {\"profiler_dtype\": \"date\", \"stats\": {\"mismatch\": 319073, \"missing\": 0, \"match\": 0, \"frequency\": [{\"value\": \"Friday\", \"count\": 48495}, {\"value\": \"Wednesday\", \"count\": 46729}, {\"value\": \"Thursday\", \"count\": 46656}, {\"value\": \"Tuesday\", \"count\": 46383}, {\"value\": \"Monday\", \"count\": 45679}, {\"value\": \"Saturday\", \"count\": 44818}, {\"value\": \"Sunday\", \"count\": 40313}], \"count_uniques\": 7}, \"dtype\": \"object\"}, \"HOUR\": {\"profiler_dtype\": \"int\", 
\"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 319073, \"hist\": [{\"lower\": 0.0, \"upper\": 0.71875, \"count\": 2069}, {\"lower\": 0.71875, \"upper\": 1.4375, \"count\": 1262}, {\"lower\": 1.4375, \"upper\": 2.15625, \"count\": 1040}, {\"lower\": 2.15625, \"upper\": 2.875, \"count\": 0}, {\"lower\": 2.875, \"upper\": 3.59375, \"count\": 611}, {\"lower\": 3.59375, \"upper\": 4.3125, \"count\": 402}, {\"lower\": 4.3125, \"upper\": 5.03125, \"count\": 461}, {\"lower\": 5.03125, \"upper\": 5.75, \"count\": 0}, {\"lower\": 5.75, \"upper\": 6.46875, \"count\": 606}, {\"lower\": 6.46875, \"upper\": 7.1875, \"count\": 1114}, {\"lower\": 7.1875, \"upper\": 7.90625, \"count\": 0}, {\"lower\": 7.90625, \"upper\": 8.625, \"count\": 1562}, {\"lower\": 8.625, \"upper\": 9.34375, \"count\": 1833}, {\"lower\": 9.34375, \"upper\": 10.0625, \"count\": 1951}, {\"lower\": 10.0625, \"upper\": 10.78125, \"count\": 0}, {\"lower\": 10.78125, \"upper\": 11.5, \"count\": 2028}, {\"lower\": 11.5, \"upper\": 12.21875, \"count\": 2330}, {\"lower\": 12.21875, \"upper\": 12.9375, \"count\": 0}, {\"lower\": 12.9375, \"upper\": 13.65625, \"count\": 2020}, {\"lower\": 13.65625, \"upper\": 14.375, \"count\": 2116}, {\"lower\": 14.375, \"upper\": 15.09375, \"count\": 1988}, {\"lower\": 15.09375, \"upper\": 15.8125, \"count\": 0}, {\"lower\": 15.8125, \"upper\": 16.53125, \"count\": 2415}, {\"lower\": 16.53125, \"upper\": 17.25, \"count\": 2625}, {\"lower\": 17.25, \"upper\": 17.96875, \"count\": 0}, {\"lower\": 17.96875, \"upper\": 18.6875, \"count\": 2584}, {\"lower\": 18.6875, \"upper\": 19.40625, \"count\": 2159}, {\"lower\": 19.40625, \"upper\": 20.125, \"count\": 2041}, {\"lower\": 20.125, \"upper\": 20.84375, \"count\": 0}, {\"lower\": 20.84375, \"upper\": 21.5625, \"count\": 1698}, {\"lower\": 21.5625, \"upper\": 22.28125, \"count\": 1604}, {\"lower\": 22.28125, \"upper\": 23.0, \"count\": 1366}], \"count_uniques\": 24}, \"dtype\": \"object\"}, \"UCR_PART\": {\"profiler_dtype\": 
\"string\", \"stats\": {\"mismatch\": 0, \"missing\": 90, \"match\": 318983, \"frequency\": [{\"value\": \"Part Three\", \"count\": 158553}, {\"value\": \"Part Two\", \"count\": 97569}, {\"value\": \"Part One\", \"count\": 61629}, {\"value\": \"Other\", \"count\": 1232}], \"count_uniques\": 4}, \"dtype\": \"object\"}, \"STREET\": {\"profiler_dtype\": \"string\", \"stats\": {\"mismatch\": 0, \"missing\": 10871, \"match\": 308202, \"frequency\": [{\"value\": \"WASHINGTON ST\", \"count\": 14194}, {\"value\": \"BLUE HILL AVE\", \"count\": 7794}, {\"value\": \"BOYLSTON ST\", \"count\": 7221}, {\"value\": \"DORCHESTER AVE\", \"count\": 5149}, {\"value\": \"TREMONT ST\", \"count\": 4796}, {\"value\": \"MASSACHUSETTS AVE\", \"count\": 4708}, {\"value\": \"HARRISON AVE\", \"count\": 4609}, {\"value\": \"CENTRE ST\", \"count\": 4383}, {\"value\": \"COMMONWEALTH AVE\", \"count\": 4134}, {\"value\": \"HYDE PARK AVE\", \"count\": 3470}, {\"value\": \"COLUMBIA RD\", \"count\": 3195}, {\"value\": \"HUNTINGTON AVE\", \"count\": 2989}, {\"value\": \"RIVER ST\", \"count\": 2831}, {\"value\": \"DUDLEY ST\", \"count\": 2352}, {\"value\": \"WARREN ST\", \"count\": 2336}, {\"value\": \"COLUMBUS AVE\", \"count\": 2304}, {\"value\": \"ADAMS ST\", \"count\": 2042}, {\"value\": \"BEACON ST\", \"count\": 1966}, {\"value\": \"NEWBURY ST\", \"count\": 1889}, {\"value\": \"CAMBRIDGE ST\", \"count\": 1670}, {\"value\": \"BOWDOIN ST\", \"count\": 1656}, {\"value\": \"AMERICAN LEGION HWY\", \"count\": 1631}, {\"value\": \"W BROADWAY\", \"count\": 1630}, {\"value\": \"GENEVA AVE\", \"count\": 1628}, {\"value\": \"ALBANY ST\", \"count\": 1429}, {\"value\": \"ALLSTATE RD\", \"count\": 1296}, {\"value\": \"SUMMER ST\", \"count\": 1250}, {\"value\": \"HANCOCK ST\", \"count\": 1246}, {\"value\": \"BORDER ST\", \"count\": 1213}, {\"value\": \"NEW SUDBURY ST\", \"count\": 1213}, {\"value\": \"CUMMINS HWY\", \"count\": 1151}, {\"value\": \"NORFOLK ST\", \"count\": 1115}, {\"value\": \"BENNINGTON ST\", 
\"count\": 1084}], \"count_uniques\": 4657}, \"dtype\": \"object\"}, \"Lat\": {\"profiler_dtype\": \"decimal\", \"stats\": {\"mismatch\": 299074, \"missing\": 19999, \"match\": 0, \"hist\": [{\"lower\": -1.0, \"upper\": 0.3560950493749999, \"count\": 33}, {\"lower\": 0.3560950493749999, \"upper\": 1.7121900987499998, \"count\": 0}, {\"lower\": 1.7121900987499998, \"upper\": 3.0682851481249998, \"count\": 0}, {\"lower\": 3.0682851481249998, \"upper\": 4.4243801975, \"count\": 0}, {\"lower\": 4.4243801975, \"upper\": 5.780475246875, \"count\": 0}, {\"lower\": 5.780475246875, \"upper\": 7.1365702962499995, \"count\": 0}, {\"lower\": 7.1365702962499995, \"upper\": 8.492665345625, \"count\": 0}, {\"lower\": 8.492665345625, \"upper\": 9.848760395, \"count\": 0}, {\"lower\": 9.848760395, \"upper\": 11.204855444375, \"count\": 0}, {\"lower\": 11.204855444375, \"upper\": 12.56095049375, \"count\": 0}, {\"lower\": 12.56095049375, \"upper\": 13.917045543124999, \"count\": 0}, {\"lower\": 13.917045543124999, \"upper\": 15.273140592499999, \"count\": 0}, {\"lower\": 15.273140592499999, \"upper\": 16.629235641875, \"count\": 0}, {\"lower\": 16.629235641875, \"upper\": 17.98533069125, \"count\": 0}, {\"lower\": 17.98533069125, \"upper\": 19.341425740625, \"count\": 0}, {\"lower\": 19.341425740625, \"upper\": 20.69752079, \"count\": 0}, {\"lower\": 20.69752079, \"upper\": 22.053615839375, \"count\": 0}, {\"lower\": 22.053615839375, \"upper\": 23.40971088875, \"count\": 0}, {\"lower\": 23.40971088875, \"upper\": 24.765805938125, \"count\": 0}, {\"lower\": 24.765805938125, \"upper\": 26.1219009875, \"count\": 0}, {\"lower\": 26.1219009875, \"upper\": 27.477996036875, \"count\": 0}, {\"lower\": 27.477996036875, \"upper\": 28.834091086249998, \"count\": 0}, {\"lower\": 28.834091086249998, \"upper\": 30.190186135624998, \"count\": 0}, {\"lower\": 30.190186135624998, \"upper\": 31.546281184999998, \"count\": 0}, {\"lower\": 31.546281184999998, \"upper\": 32.902376234375, \"count\": 0}, 
{\"lower\": 32.902376234375, \"upper\": 34.25847128375, \"count\": 0}, {\"lower\": 34.25847128375, \"upper\": 35.614566333125, \"count\": 0}, {\"lower\": 35.614566333125, \"upper\": 36.9706613825, \"count\": 0}, {\"lower\": 36.9706613825, \"upper\": 38.326756431875, \"count\": 0}, {\"lower\": 38.326756431875, \"upper\": 39.68285148125, \"count\": 0}, {\"lower\": 39.68285148125, \"upper\": 41.038946530625, \"count\": 0}, {\"lower\": 41.038946530625, \"upper\": 42.39504158, \"count\": 39852}], \"count_uniques\": 18178}, \"dtype\": \"object\"}, \"Long\": {\"profiler_dtype\": \"decimal\", \"stats\": {\"mismatch\": 299074, \"missing\": 19999, \"match\": 0, \"hist\": [{\"lower\": -71.17867378, \"upper\": -68.985590224375, \"count\": 39852}, {\"lower\": -68.985590224375, \"upper\": -66.79250666875, \"count\": 0}, {\"lower\": -66.79250666875, \"upper\": -64.599423113125, \"count\": 0}, {\"lower\": -64.599423113125, \"upper\": -62.4063395575, \"count\": 0}, {\"lower\": -62.4063395575, \"upper\": -60.213256001874996, \"count\": 0}, {\"lower\": -60.213256001874996, \"upper\": -58.02017244625, \"count\": 0}, {\"lower\": -58.02017244625, \"upper\": -55.827088890625, \"count\": 0}, {\"lower\": -55.827088890625, \"upper\": -53.634005335, \"count\": 0}, {\"lower\": -53.634005335, \"upper\": -51.440921779374996, \"count\": 0}, {\"lower\": -51.440921779374996, \"upper\": -49.247838223749994, \"count\": 0}, {\"lower\": -49.247838223749994, \"upper\": -47.054754668125, \"count\": 0}, {\"lower\": -47.054754668125, \"upper\": -44.8616711125, \"count\": 0}, {\"lower\": -44.8616711125, \"upper\": -42.668587556874996, \"count\": 0}, {\"lower\": -42.668587556874996, \"upper\": -40.47550400125, \"count\": 0}, {\"lower\": -40.47550400125, \"upper\": -38.282420445625, \"count\": 0}, {\"lower\": -38.282420445625, \"upper\": -36.08933689, \"count\": 0}, {\"lower\": -36.08933689, \"upper\": -33.896253334375, \"count\": 0}, {\"lower\": -33.896253334375, \"upper\": -31.703169778749995, \"count\": 
0}, {\"lower\": -31.703169778749995, \"upper\": -29.510086223125, \"count\": 0}, {\"lower\": -29.510086223125, \"upper\": -27.3170026675, \"count\": 0}, {\"lower\": -27.3170026675, \"upper\": -25.123919111874997, \"count\": 0}, {\"lower\": -25.123919111874997, \"upper\": -22.930835556250003, \"count\": 0}, {\"lower\": -22.930835556250003, \"upper\": -20.737752000625, \"count\": 0}, {\"lower\": -20.737752000625, \"upper\": -18.544668445, \"count\": 0}, {\"lower\": -18.544668445, \"upper\": -16.351584889374998, \"count\": 0}, {\"lower\": -16.351584889374998, \"upper\": -14.158501333749996, \"count\": 0}, {\"lower\": -14.158501333749996, \"upper\": -11.965417778125001, \"count\": 0}, {\"lower\": -11.965417778125001, \"upper\": -9.7723342225, \"count\": 0}, {\"lower\": -9.7723342225, \"upper\": -7.579250666874998, \"count\": 0}, {\"lower\": -7.579250666874998, \"upper\": -5.386167111250003, \"count\": 0}, {\"lower\": -5.386167111250003, \"upper\": -3.1930835556249946, \"count\": 0}, {\"lower\": -3.1930835556249946, \"upper\": -1.0, \"count\": 33}], \"count_uniques\": 18178}, \"dtype\": \"object\"}, \"Location\": {\"profiler_dtype\": \"string\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 319073, \"frequency\": [{\"value\": \"(0.00000000, 0.00000000)\", \"count\": 19999}, {\"value\": \"(42.34862382, -71.08277637)\", \"count\": 1243}, {\"value\": \"(42.36183857, -71.05976489)\", \"count\": 1208}, {\"value\": \"(42.28482577, -71.09137369)\", \"count\": 1121}, {\"value\": \"(42.32866284, -71.08563401)\", \"count\": 1042}, {\"value\": \"(42.25621592, -71.12401947)\", \"count\": 898}, {\"value\": \"(42.29755533, -71.05970910)\", \"count\": 783}, {\"value\": \"(42.34128751, -71.05467933)\", \"count\": 773}, {\"value\": \"(-1.00000000, -1.00000000)\", \"count\": 745}, {\"value\": \"(42.33152148, -71.07085307)\", \"count\": 735}, {\"value\": \"(42.35231190, -71.06370510)\", \"count\": 688}, {\"value\": \"(42.33954199, -71.06940877)\", \"count\": 655}, {\"value\": 
\"(42.32696647, -71.06198607)\", \"count\": 652}, {\"value\": \"(42.35512339, -71.06087980)\", \"count\": 584}, {\"value\": \"(42.30971857, -71.10429432)\", \"count\": 573}, {\"value\": \"(42.29848866, -71.06313294)\", \"count\": 562}, {\"value\": \"(42.33401829, -71.07638124)\", \"count\": 561}, {\"value\": \"(42.33367922, -71.09187755)\", \"count\": 550}, {\"value\": \"(42.31043400, -71.06134010)\", \"count\": 523}, {\"value\": \"(42.35095909, -71.07412780)\", \"count\": 523}, {\"value\": \"(42.35241815, -71.06525499)\", \"count\": 515}, {\"value\": \"(42.37081805, -71.03929078)\", \"count\": 507}, {\"value\": \"(42.33428841, -71.07239518)\", \"count\": 504}, {\"value\": \"(42.34980175, -71.07840978)\", \"count\": 472}, {\"value\": \"(42.32696802, -71.08051941)\", \"count\": 472}, {\"value\": \"(42.33511904, -71.07491710)\", \"count\": 455}, {\"value\": \"(42.35037870, -71.07626098)\", \"count\": 445}, {\"value\": \"(42.34653820, -71.09880598)\", \"count\": 444}, {\"value\": \"(42.36643546, -71.06135413)\", \"count\": 440}, {\"value\": \"(42.28709355, -71.14822128)\", \"count\": 438}, {\"value\": \"(42.35602373, -71.06177615)\", \"count\": 436}, {\"value\": \"(42.34840576, -71.08688339)\", \"count\": 432}, {\"value\": \"(42.34905600, -71.15049850)\", \"count\": 427}], \"count_uniques\": 18194}, \"dtype\": \"object\"}}, \"name\": null, \"file_name\": \"crime.csv\", \"summary\": {\"cols_count\": 17, \"rows_count\": 319073, \"dtypes_list\": [\"object\"], \"total_count_dtypes\": 1, \"missing_count\": 0, \"p_missing\": 0.0}}'" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "df.ext.cast_and_profile(columns=\"*\", flush=True, output=\"json\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['INCIDENT_NUMBER',\n", " 'OFFENSE_CODE',\n", " 
'OFFENSE_CODE_GROUP',\n", " 'OFFENSE_DESCRIPTION',\n", " 'DISTRICT',\n", " 'REPORTING_AREA',\n", " 'SHOOTING',\n", " 'OCCURRED_ON_DATE',\n", " 'YEAR',\n", " 'MONTH',\n", " 'DAY_OF_WEEK',\n", " 'HOUR',\n", " 'UCR_PART',\n", " 'STREET',\n", " 'Lat',\n", " 'Long',\n", " 'Location']" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.names()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# astype() requires a target dtype. datetime64[ns] is assumed here because the profiler\n", "# infers this column as a date; adjust if another dtype was intended.\n", "df.OCCURRED_ON_DATE.astype(\"datetime64[ns]\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 3.49 s\n" ] } ], "source": [ "%%time\n", "import pandas as pd\n", "from fastnumbers import fast_int\n", "import pendulum\n", "def func(value):\n", "    \"\"\"Parse a single date/time string with pendulum.parse and return the result.\"\"\"\n", "    return pendulum.parse(value)\n", "df.OCCURRED_ON_DATE.apply(func, meta=\"object\").compute()" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
INCIDENT_NUMBEROFFENSE_CODEOFFENSE_CODE_GROUPOFFENSE_DESCRIPTIONDISTRICTREPORTING_AREASHOOTINGOCCURRED_ON_DATEYEARMONTHDAY_OF_WEEKHOURUCR_PARTSTREETLatLongLocation
0nan00619LarcenyLARCENY ALL OTHERSD14808nan2018-09-02 13:00:00+00:00201892020-05-3113Part OneLINCOLN ST42.3578-71.1394(42.35779134, -71.13937053)
1nan01402VandalismVANDALISMC11347nan2018-08-21 00:00:00+00:00201882020-05-260Part TwoHECLA ST42.3068-71.0603(42.30682138, -71.06030035)
2nan03410TowedTOWED MOTOR VEHICLED4151nan2018-09-03 19:27:00+00:00201892020-06-0119Part ThreeCAZENOVE ST42.3466-71.0724(42.34658879, -71.07242943)
3nan03114Investigate PropertyINVESTIGATE PROPERTYD4272nan2018-09-03 21:16:00+00:00201892020-06-0121Part ThreeNEWCOMB ST42.3342-71.0787(42.33418175, -71.07866441)
4nan03114Investigate PropertyINVESTIGATE PROPERTYB3421nan2018-09-03 21:05:00+00:00201892020-06-0121Part ThreeDELHI ST42.2754-71.0904(42.27536542, -71.09036101)
......................................................
319068nan03125Warrant ArrestsWARRANT ARRESTD4285nan2016-06-05 17:25:00+00:00201662020-05-3117Part ThreeCOVENTRY ST42.337-71.0857(42.33695098, -71.08574813)
319069nan00111HomicideMURDER, NON-NEGLIGIENT MANSLAUGHTERE18520nan2015-07-09 13:38:00+00:00201572020-05-2813Part OneRIVER ST42.2559-71.1232(42.25592648, -71.12317207)
319070nan03125Warrant ArrestsWARRANT ARRESTE18520nan2015-07-09 13:38:00+00:00201572020-05-2813Part ThreeRIVER ST42.2559-71.1232(42.25592648, -71.12317207)
319071nan03125Warrant ArrestsWARRANT ARRESTE13569nan2016-05-31 19:35:00+00:00201652020-05-2619Part ThreeNEW WASHINGTON ST42.3023-71.1116(42.30233307, -71.11156487)
31907214205255003125Warrant ArrestsWARRANT ARRESTD4903nan2015-06-22 00:12:00+00:00201562020-06-010Part ThreeWASHINGTON ST42.3338-71.0803(42.33383935, -71.08029038)
\n", "

319073 rows × 17 columns

\n", "
" ], "text/plain": [ " INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP \\\n", "0 nan 00619 Larceny \n", "1 nan 01402 Vandalism \n", "2 nan 03410 Towed \n", "3 nan 03114 Investigate Property \n", "4 nan 03114 Investigate Property \n", "... ... ... ... \n", "319068 nan 03125 Warrant Arrests \n", "319069 nan 00111 Homicide \n", "319070 nan 03125 Warrant Arrests \n", "319071 nan 03125 Warrant Arrests \n", "319072 142052550 03125 Warrant Arrests \n", "\n", " OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING \\\n", "0 LARCENY ALL OTHERS D14 808 nan \n", "1 VANDALISM C11 347 nan \n", "2 TOWED MOTOR VEHICLE D4 151 nan \n", "3 INVESTIGATE PROPERTY D4 272 nan \n", "4 INVESTIGATE PROPERTY B3 421 nan \n", "... ... ... ... ... \n", "319068 WARRANT ARREST D4 285 nan \n", "319069 MURDER, NON-NEGLIGIENT MANSLAUGHTER E18 520 nan \n", "319070 WARRANT ARREST E18 520 nan \n", "319071 WARRANT ARREST E13 569 nan \n", "319072 WARRANT ARREST D4 903 nan \n", "\n", " OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART \\\n", "0 2018-09-02 13:00:00+00:00 2018 9 2020-05-31 13 Part One \n", "1 2018-08-21 00:00:00+00:00 2018 8 2020-05-26 0 Part Two \n", "2 2018-09-03 19:27:00+00:00 2018 9 2020-06-01 19 Part Three \n", "3 2018-09-03 21:16:00+00:00 2018 9 2020-06-01 21 Part Three \n", "4 2018-09-03 21:05:00+00:00 2018 9 2020-06-01 21 Part Three \n", "... ... ... ... ... ... ... 
\n", "319068 2016-06-05 17:25:00+00:00 2016 6 2020-05-31 17 Part Three \n", "319069 2015-07-09 13:38:00+00:00 2015 7 2020-05-28 13 Part One \n", "319070 2015-07-09 13:38:00+00:00 2015 7 2020-05-28 13 Part Three \n", "319071 2016-05-31 19:35:00+00:00 2016 5 2020-05-26 19 Part Three \n", "319072 2015-06-22 00:12:00+00:00 2015 6 2020-06-01 0 Part Three \n", "\n", " STREET Lat Long Location \n", "0 LINCOLN ST 42.3578 -71.1394 (42.35779134, -71.13937053) \n", "1 HECLA ST 42.3068 -71.0603 (42.30682138, -71.06030035) \n", "2 CAZENOVE ST 42.3466 -71.0724 (42.34658879, -71.07242943) \n", "3 NEWCOMB ST 42.3342 -71.0787 (42.33418175, -71.07866441) \n", "4 DELHI ST 42.2754 -71.0904 (42.27536542, -71.09036101) \n", "... ... ... ... ... \n", "319068 COVENTRY ST 42.337 -71.0857 (42.33695098, -71.08574813) \n", "319069 RIVER ST 42.2559 -71.1232 (42.25592648, -71.12317207) \n", "319070 RIVER ST 42.2559 -71.1232 (42.25592648, -71.12317207) \n", "319071 NEW WASHINGTON ST 42.3023 -71.1116 (42.30233307, -71.11156487) \n", "319072 WASHINGTON ST 42.3338 -71.0803 (42.33383935, -71.08029038) \n", "\n", "[319073 rows x 17 columns]" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.compute()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0 I18207094500619\n", " 1 I18207094301402\n", " 2 I18207094103410\n", " 3 I18207094003114\n", " 4 I18207093803114\n", " ... 
\n", " 319068 I050310906-0003125\n", " 319069 I030217815-0800111\n", " 319070 I030217815-0803125\n", " 319071 I010370257-0003125\n", " 319072 14205255003125\n", " Length: 319073, dtype: object,)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a = df[\"INCIDENT_NUMBER\"].compute() \n", "b = df[\"OFFENSE_CODE\"].compute()\n", "from dask import dataframe as dd\n", "dd.compute(a+b)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "ename": "NotImplementedError", "evalue": "dd.DataFrame.apply only supports axis=1\n Try: df.apply(func, axis=1)", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mNotImplementedError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36mapply\u001b[1;34m(self, func, axis, broadcast, raw, reduce, args, meta, **kwds)\u001b[0m\n\u001b[0;32m 4145\u001b[0m \u001b[1;34m\" Try: df.apply(func, axis=1)\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4146\u001b[0m )\n\u001b[1;32m-> 4147\u001b[1;33m \u001b[1;32mraise\u001b[0m 
\u001b[0mNotImplementedError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4148\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4149\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mmeta\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mno_default\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mNotImplementedError\u001b[0m: dd.DataFrame.apply only supports axis=1\n Try: df.apply(func, axis=1)" ] } ], "source": [ "def func(value):\n", " return value\n", "df.apply(func)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 0 ns\n" ] } ], "source": [ "%%time\n", "# _output = df.ext.cast_and_profile(columns=\"*\", flush=True, output=\"json\")\n", " # df = df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 319073 rows / 17 columns
\n", "
8 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
INCIDENT_NUMBER
\n", "
1 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_CODE
\n", "
2 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_CODE_GROUP
\n", "
3 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_DESCRIPTION
\n", "
4 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
DISTRICT
\n", "
5 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
REPORTING_AREA
\n", "
6 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
SHOOTING
\n", "
7 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OCCURRED_ON_DATE
\n", "
8 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
YEAR
\n", "
9 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
MONTH
\n", "
10 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
DAY_OF_WEEK
\n", "
11 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
HOUR
\n", "
12 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
UCR_PART
\n", "
13 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
STREET
\n", "
14 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Lat
\n", "
15 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Long
\n", "
16 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Location
\n", "
17 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070945\n", " \n", "
\n", "
\n", "
\n", " \n", " 619\n", " \n", "
\n", "
\n", "
\n", " \n", " Larceny\n", " \n", "
\n", "
\n", "
\n", " \n", " LARCENY⋅ALL⋅OTHERS\n", " \n", "
\n", "
\n", "
\n", " \n", " D14\n", " \n", "
\n", "
\n", "
\n", " \n", " 808\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-02⋅13:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Sunday\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " LINCOLN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.35779134\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.13937053\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.35779134,⋅-71.13937053)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070943\n", " \n", "
\n", "
\n", "
\n", " \n", " 1402\n", " \n", "
\n", "
\n", "
\n", " \n", " Vandalism\n", " \n", "
\n", "
\n", "
\n", " \n", " VANDALISM\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 347\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-08-21⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " Tuesday\n", " \n", "
\n", "
\n", "
\n", " \n", " 0\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Two\n", " \n", "
\n", "
\n", "
\n", " \n", " HECLA⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30682138\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.06030035\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.30682138,⋅-71.06030035)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070941\n", " \n", "
\n", "
\n", "
\n", " \n", " 3410\n", " \n", "
\n", "
\n", "
\n", " \n", " Towed\n", " \n", "
\n", "
\n", "
\n", " \n", " TOWED⋅MOTOR⋅VEHICLE\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 151\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅19:27:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 19\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " CAZENOVE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.34658879\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07242943\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.34658879,⋅-71.07242943)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070940\n", " \n", "
\n", "
\n", "
\n", " \n", " 3114\n", " \n", "
\n", "
\n", "
\n", " \n", " Investigate⋅Property\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 272\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:16:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " NEWCOMB⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33418175\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07866441\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.33418175,⋅-71.07866441)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070938\n", " \n", "
\n", "
\n", "
\n", " \n", " 3114\n", " \n", "
\n", "
\n", "
\n", " \n", " Investigate⋅Property\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " B3\n", " \n", "
\n", "
\n", "
\n", " \n", " 421\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:05:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " DELHI⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.27536542\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.09036101\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.27536542,⋅-71.09036101)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070936\n", " \n", "
\n", "
\n", "
\n", " \n", " 3820\n", " \n", "
\n", "
\n", "
\n", " \n", " Motor⋅Vehicle⋅Accident⋅Response\n", " \n", "
\n", "
\n", "
\n", " \n", " M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 398\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:09:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " TALBOT⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29019621\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07159012\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.29019621,⋅-71.07159012)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070933\n", " \n", "
\n", "
\n", "
\n", " \n", " 724\n", " \n", "
\n", "
\n", "
\n", " \n", " Auto⋅Theft\n", " \n", "
\n", "
\n", "
\n", " \n", " AUTO⋅THEFT\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 330\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:25:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " NORMANDY⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30607218\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.0827326\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.30607218,⋅-71.08273260)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070932\n", " \n", "
\n", "
\n", "
\n", " \n", " 3301\n", " \n", "
\n", "
\n", "
\n", " \n", " Verbal⋅Disputes\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 584\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:39:37\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " LAWN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.32701648\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.10555088\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.32701648,⋅-71.10555088)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070931\n", " \n", "
\n", "
\n", "
\n", " \n", " 301\n", " \n", "
\n", "
\n", "
\n", " \n", " Robbery\n", " \n", "
\n", "
\n", "
\n", " \n", " ROBBERY⋅-⋅STREET\n", " \n", "
\n", "
\n", "
\n", " \n", " C6\n", " \n", "
\n", "
\n", "
\n", " \n", " 177\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:48:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " MASSACHUSETTS⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33152148\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07085307\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.33152148,⋅-71.07085307)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070929\n", " \n", "
\n", "
\n", "
\n", " \n", " 3301\n", " \n", "
\n", "
\n", "
\n", " \n", " Verbal⋅Disputes\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 364\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:38:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " LESLIE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29514664\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.05860832\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.29514664,⋅-71.05860832)\n", " \n", "
\n", "
\n", "\n", "
Viewing 10 of 319073 rows / 17 columns
\n", "
8 partition(s) <class 'dask.dataframe.core.DataFrame'>
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.ext.display()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.ext.partitions()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
INCIDENT_NUMBEROFFENSE_CODEOFFENSE_CODE_GROUPOFFENSE_DESCRIPTIONDISTRICTREPORTING_AREASHOOTINGOCCURRED_ON_DATEYEARMONTHDAY_OF_WEEKHOURUCR_PARTSTREETLatLongLocation
0I18207094500619LarcenyLARCENY ALL OTHERSD14808NaN2018-09-02 13:00:0020189Sunday13Part OneLINCOLN ST42.35779134-71.13937053(42.35779134, -71.13937053)
1I18207094301402VandalismVANDALISMC11347NaN2018-08-21 00:00:0020188Tuesday0Part TwoHECLA ST42.30682138-71.06030035(42.30682138, -71.06030035)
2I18207094103410TowedTOWED MOTOR VEHICLED4151NaN2018-09-03 19:27:0020189Monday19Part ThreeCAZENOVE ST42.34658879-71.07242943(42.34658879, -71.07242943)
3I18207094003114Investigate PropertyINVESTIGATE PROPERTYD4272NaN2018-09-03 21:16:0020189Monday21Part ThreeNEWCOMB ST42.33418175-71.07866441(42.33418175, -71.07866441)
4I18207093803114Investigate PropertyINVESTIGATE PROPERTYB3421NaN2018-09-03 21:05:0020189Monday21Part ThreeDELHI ST42.27536542-71.09036101(42.27536542, -71.09036101)
......................................................
319068I050310906-0003125Warrant ArrestsWARRANT ARRESTD4285NaN2016-06-05 17:25:0020166Sunday17Part ThreeCOVENTRY ST42.33695098-71.08574813(42.33695098, -71.08574813)
319069I030217815-0800111HomicideMURDER, NON-NEGLIGIENT MANSLAUGHTERE18520NaN2015-07-09 13:38:0020157Thursday13Part OneRIVER ST42.25592648-71.12317207(42.25592648, -71.12317207)
319070I030217815-0803125Warrant ArrestsWARRANT ARRESTE18520NaN2015-07-09 13:38:0020157Thursday13Part ThreeRIVER ST42.25592648-71.12317207(42.25592648, -71.12317207)
319071I010370257-0003125Warrant ArrestsWARRANT ARRESTE13569NaN2016-05-31 19:35:0020165Tuesday19Part ThreeNEW WASHINGTON ST42.30233307-71.11156487(42.30233307, -71.11156487)
31907214205255003125Warrant ArrestsWARRANT ARRESTD4903NaN2015-06-22 00:12:0020156Monday0Part ThreeWASHINGTON ST42.33383935-71.08029038(42.33383935, -71.08029038)
\n", "

319073 rows × 17 columns

\n", "
" ], "text/plain": [ " INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP \\\n", "0 I182070945 00619 Larceny \n", "1 I182070943 01402 Vandalism \n", "2 I182070941 03410 Towed \n", "3 I182070940 03114 Investigate Property \n", "4 I182070938 03114 Investigate Property \n", "... ... ... ... \n", "319068 I050310906-00 03125 Warrant Arrests \n", "319069 I030217815-08 00111 Homicide \n", "319070 I030217815-08 03125 Warrant Arrests \n", "319071 I010370257-00 03125 Warrant Arrests \n", "319072 142052550 03125 Warrant Arrests \n", "\n", " OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING \\\n", "0 LARCENY ALL OTHERS D14 808 NaN \n", "1 VANDALISM C11 347 NaN \n", "2 TOWED MOTOR VEHICLE D4 151 NaN \n", "3 INVESTIGATE PROPERTY D4 272 NaN \n", "4 INVESTIGATE PROPERTY B3 421 NaN \n", "... ... ... ... ... \n", "319068 WARRANT ARREST D4 285 NaN \n", "319069 MURDER, NON-NEGLIGIENT MANSLAUGHTER E18 520 NaN \n", "319070 WARRANT ARREST E18 520 NaN \n", "319071 WARRANT ARREST E13 569 NaN \n", "319072 WARRANT ARREST D4 903 NaN \n", "\n", " OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART \\\n", "0 2018-09-02 13:00:00 2018 9 Sunday 13 Part One \n", "1 2018-08-21 00:00:00 2018 8 Tuesday 0 Part Two \n", "2 2018-09-03 19:27:00 2018 9 Monday 19 Part Three \n", "3 2018-09-03 21:16:00 2018 9 Monday 21 Part Three \n", "4 2018-09-03 21:05:00 2018 9 Monday 21 Part Three \n", "... ... ... ... ... ... ... \n", "319068 2016-06-05 17:25:00 2016 6 Sunday 17 Part Three \n", "319069 2015-07-09 13:38:00 2015 7 Thursday 13 Part One \n", "319070 2015-07-09 13:38:00 2015 7 Thursday 13 Part Three \n", "319071 2016-05-31 19:35:00 2016 5 Tuesday 19 Part Three \n", "319072 2015-06-22 00:12:00 2015 6 Monday 0 Part Three \n", "\n", " STREET Lat Long \\\n", "0 LINCOLN ST 42.35779134 -71.13937053 \n", "1 HECLA ST 42.30682138 -71.06030035 \n", "2 CAZENOVE ST 42.34658879 -71.07242943 \n", "3 NEWCOMB ST 42.33418175 -71.07866441 \n", "4 DELHI ST 42.27536542 -71.09036101 \n", "... ... ... ... 
\n", "319068 COVENTRY ST 42.33695098 -71.08574813 \n", "319069 RIVER ST 42.25592648 -71.12317207 \n", "319070 RIVER ST 42.25592648 -71.12317207 \n", "319071 NEW WASHINGTON ST 42.30233307 -71.11156487 \n", "319072 WASHINGTON ST 42.33383935 -71.08029038 \n", "\n", " Location \n", "0 (42.35779134, -71.13937053) \n", "1 (42.30682138, -71.06030035) \n", "2 (42.34658879, -71.07242943) \n", "3 (42.33418175, -71.07866441) \n", "4 (42.27536542, -71.09036101) \n", "... ... \n", "319068 (42.33695098, -71.08574813) \n", "319069 (42.25592648, -71.12317207) \n", "319070 (42.25592648, -71.12317207) \n", "319071 (42.30233307, -71.11156487) \n", "319072 (42.33383935, -71.08029038) \n", "\n", "[319073 rows x 17 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.compute()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "Can only use .dt accessor with datetimelike values", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0myear\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"birth\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32m~\\Documents\\Optimus\\optimus\\engines\\base\\dask\\columns.py\u001b[0m in \u001b[0;36myear\u001b[1;34m(self, input_cols, output_cols)\u001b[0m\n\u001b[0;32m 573\u001b[0m \u001b[0moutput_cols\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_output_cols\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minput_cols\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0moutput_cols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 574\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0minput_col\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_col\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minput_cols\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_cols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 575\u001b[1;33m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0massign\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[1;33m{\u001b[0m\u001b[0moutput_col\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0minput_col\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0myear\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 576\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 577\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mpandas\\_libs\\properties.pyx\u001b[0m in \u001b[0;36mpandas._libs.properties.CachedProperty.__get__\u001b[1;34m()\u001b[0m\n", "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36mdt\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 2616\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdt\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2617\u001b[0m \u001b[1;34m\"\"\" Namespace of datetime methods \"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2618\u001b[1;33m \u001b[1;32mreturn\u001b[0m 
\u001b[0mDatetimeAccessor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2619\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2620\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mcache_readonly\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\accessor.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, series)\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseries_meta\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"to_series\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# is index-like\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[0mseries_meta\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mseries_meta\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_series\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 36\u001b[1;33m \u001b[0mmeta\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseries_meta\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_accessor_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 37\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 38\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_meta\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmeta\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 5268\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32min\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_accessors\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5269\u001b[0m ):\n\u001b[1;32m-> 5270\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5271\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5272\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\pandas\\core\\accessor.py\u001b[0m in \u001b[0;36m__get__\u001b[1;34m(self, obj, cls)\u001b[0m\n\u001b[0;32m 185\u001b[0m \u001b[1;31m# we're accessing the attribute of the class, i.e., Dataset.geo\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 186\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_accessor\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 187\u001b[1;33m \u001b[0maccessor_obj\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_accessor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 188\u001b[0m \u001b[1;31m# Replace the property with the accessor object. 
Inspired by:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 189\u001b[0m \u001b[1;31m# http://www.pydanny.com/cached-property.html\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\pandas\\core\\indexes\\accessors.py\u001b[0m in \u001b[0;36m__new__\u001b[1;34m(cls, data)\u001b[0m\n\u001b[0;32m 336\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mDatetimeProperties\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morig\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 337\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 338\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Can only use .dt accessor with datetimelike values\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m: Can only use .dt accessor with datetimelike values" ] } ], "source": [ "df.cols.year(\"\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"columns\": {\"customer_id\": {\"profiler_dtype\": \"string\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": \"0fju234978rfjkhsdf\", \"count\": 4}, {\"value\": \"0f34ruiy23e78y2r\", \"count\": 4}, {\"value\": \"0fue298y2r23r23r5\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r4\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r3\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r2\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy349\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy348\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy347\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy346\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy345\", \"count\": 1}], 
\"count_uniques\": 11}, \"dtype\": \"object\"}, \"transactoin_date\": {\"profiler_dtype\": \"date\", \"profiler_dtype_fotmat\": \"date\", \"stats\": {\"mismatch\": 11, \"missing\": 6, \"match\": 0}, \"dtype\": \"datetime64[ns]\"}, \"ticket_price\": {\"profiler_dtype\": \"decimal\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": 9.99, \"count\": 7}, {\"value\": 29.99, \"count\": 5}, {\"value\": 14.99, \"count\": 4}, {\"value\": 9.91, \"count\": 1}], \"count_uniques\": 4}, \"dtype\": \"object\"}, \"discount\": {\"profiler_dtype\": \"object\", \"stats\": {\"mismatch\": 5, \"missing\": 12, \"match\": 0, \"frequency\": [{\"value\": \"5%\", \"count\": 5}], \"count_uniques\": 1}, \"dtype\": \"object\"}, \"product\": {\"profiler_dtype\": \"string\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": \"basic\", \"count\": 8}, {\"value\": \"platinum\", \"count\": 5}, {\"value\": \"deluxe\", \"count\": 4}], \"count_uniques\": 3}, \"dtype\": \"object\"}, \"info\": {\"profiler_dtype\": \"int\", \"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"hist\": [{\"lower\": 1.0, \"upper\": 1.28125, \"count\": 0}, {\"lower\": 1.28125, \"upper\": 1.5625, \"count\": 0}, {\"lower\": 1.5625, \"upper\": 1.84375, \"count\": 0}, {\"lower\": 1.84375, \"upper\": 2.125, \"count\": 0}, {\"lower\": 2.125, \"upper\": 2.40625, \"count\": 0}, {\"lower\": 2.40625, \"upper\": 2.6875, \"count\": 0}, {\"lower\": 2.6875, \"upper\": 2.96875, \"count\": 0}, {\"lower\": 2.96875, \"upper\": 3.25, \"count\": 0}, {\"lower\": 3.25, \"upper\": 3.53125, \"count\": 0}, {\"lower\": 3.53125, \"upper\": 3.8125, \"count\": 0}, {\"lower\": 3.8125, \"upper\": 4.09375, \"count\": 0}, {\"lower\": 4.09375, \"upper\": 4.375, \"count\": 0}, {\"lower\": 4.375, \"upper\": 4.65625, \"count\": 0}, {\"lower\": 4.65625, \"upper\": 4.9375, \"count\": 0}, {\"lower\": 4.9375, \"upper\": 5.21875, \"count\": 0}, {\"lower\": 5.21875, \"upper\": 5.5, 
\"count\": 0}, {\"lower\": 5.5, \"upper\": 5.78125, \"count\": 0}, {\"lower\": 5.78125, \"upper\": 6.0625, \"count\": 0}, {\"lower\": 6.0625, \"upper\": 6.34375, \"count\": 0}, {\"lower\": 6.34375, \"upper\": 6.625, \"count\": 0}, {\"lower\": 6.625, \"upper\": 6.90625, \"count\": 0}, {\"lower\": 6.90625, \"upper\": 7.1875, \"count\": 0}, {\"lower\": 7.1875, \"upper\": 7.46875, \"count\": 0}, {\"lower\": 7.46875, \"upper\": 7.75, \"count\": 0}, {\"lower\": 7.75, \"upper\": 8.03125, \"count\": 0}, {\"lower\": 8.03125, \"upper\": 8.3125, \"count\": 0}, {\"lower\": 8.3125, \"upper\": 8.59375, \"count\": 0}, {\"lower\": 8.59375, \"upper\": 8.875, \"count\": 0}, {\"lower\": 8.875, \"upper\": 9.15625, \"count\": 0}, {\"lower\": 9.15625, \"upper\": 9.4375, \"count\": 0}, {\"lower\": 9.4375, \"upper\": 9.71875, \"count\": 0}, {\"lower\": 9.71875, \"upper\": 10.0, \"count\": 3}], \"count_uniques\": 10}, \"dtype\": \"int64\"}}, \"name\": null, \"file_name\": \"tmp71_odjvg.csv\", \"summary\": {\"cols_count\": 6, \"rows_count\": 17, \"dtypes_list\": [\"object\", \"datetime64[ns]\", \"int64\"], \"total_count_dtypes\": 3, \"missing_count\": 0, \"p_missing\": 0.0}}\n" ] } ], "source": [ "print(_output)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DateTime(1980, 11, 11, 0, 0, 0, tzinfo=Timezone('UTC'))" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pendulum\n", "pendulum.from_format(\"11 11 1980\", \"DD MM YYYY\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'customer_id': 'string', 'transactoin_date': 'date', 'ticket_price': 'decimal', 'discount': 'string', 'product': 'string', 'info': 'int'}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: 
Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] } ], "source": [ "# df = op.load.file(\"data/test-types.csv\").ext.cache()\n", "df = op.load.file(\"data/dataset-transactions.csv\").ext.cache()\n", "a = df.cols.infer_profiler_dtypes(\"*\")\n", "print(a)\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "df = df.cols.cast_to_profiler_dtypes(columns={'transactoin_date': 'date'})" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# df[\"a\"] = df.transactoin_date.astype('M8[us]')\n", "# b[\"transactoin_date\"].dt.year" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "customer_id object\n", "transactoin_date datetime64[ns]\n", "ticket_price object\n", "discount object\n", "product object\n", "info object\n", "dtype: object" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idtransactoin_dateticket_pricediscountproductinfonew1
00f345kjh345oiuy3452010-08-1929.99NaNplatinum119.0
10fju234978rfjkhsdf2012-01-0529.99NaNplatinum25.0
20f34ruiy23e78y2r2009-08-1129.99NaNplatinum311.0
30fue298y2r23r23r22010-08-1929.99NaNplatinum419.0
40f345kjh345oiuy3462010-08-1929.99NaNplatinum519.0
50fju234978rfjkhsdf2010-08-199.995%basic619.0
60f34ruiy23e78y2r2010-08-199.99NaNbasic719.0
70fue298y2r23r23r32010-08-199.99NaNbasic819.0
80f345kjh345oiuy3472010-08-199.99NaNbasic919.0
90fju234978rfjkhsdf2011-08-119.995%basic1011.0
100f34ruiy23e78y2r2015-08-099.995%basic109.0
110fue298y2r23r23r4NaT9.91NaNbasic10NaN
120f345kjh345oiuy348NaT9.99NaNbasic10NaN
130fju234978rfjkhsdfNaT14.995%deluxe10NaN
140f34ruiy23e78y2rNaT14.99NaNdeluxe10NaN
150fue298y2r23r23r5NaT14.995%deluxe10NaN
160f345kjh345oiuy349NaT14.99NaNdeluxe10NaN
\n", "
" ], "text/plain": [ " customer_id transactoin_date ticket_price discount product info \\\n", "0 0f345kjh345oiuy345 2010-08-19 29.99 NaN platinum 1 \n", "1 0fju234978rfjkhsdf 2012-01-05 29.99 NaN platinum 2 \n", "2 0f34ruiy23e78y2r 2009-08-11 29.99 NaN platinum 3 \n", "3 0fue298y2r23r23r2 2010-08-19 29.99 NaN platinum 4 \n", "4 0f345kjh345oiuy346 2010-08-19 29.99 NaN platinum 5 \n", "5 0fju234978rfjkhsdf 2010-08-19 9.99 5% basic 6 \n", "6 0f34ruiy23e78y2r 2010-08-19 9.99 NaN basic 7 \n", "7 0fue298y2r23r23r3 2010-08-19 9.99 NaN basic 8 \n", "8 0f345kjh345oiuy347 2010-08-19 9.99 NaN basic 9 \n", "9 0fju234978rfjkhsdf 2011-08-11 9.99 5% basic 10 \n", "10 0f34ruiy23e78y2r 2015-08-09 9.99 5% basic 10 \n", "11 0fue298y2r23r23r4 NaT 9.91 NaN basic 10 \n", "12 0f345kjh345oiuy348 NaT 9.99 NaN basic 10 \n", "13 0fju234978rfjkhsdf NaT 14.99 5% deluxe 10 \n", "14 0f34ruiy23e78y2r NaT 14.99 NaN deluxe 10 \n", "15 0fue298y2r23r23r5 NaT 14.99 5% deluxe 10 \n", "16 0f345kjh345oiuy349 NaT 14.99 NaN deluxe 10 \n", "\n", " new1 \n", "0 19.0 \n", "1 5.0 \n", "2 11.0 \n", "3 19.0 \n", "4 19.0 \n", "5 19.0 \n", "6 19.0 \n", "7 19.0 \n", "8 19.0 \n", "9 11.0 \n", "10 9.0 \n", "11 NaN \n", "12 NaN \n", "13 NaN \n", "14 NaN \n", "15 NaN \n", "16 NaN " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.year(\"transactoin_date\", \"new1\").compute()\n", "df.cols.month(\"transactoin_date\", \"new1\").compute()\n", "df.cols.day(\"transactoin_date\", \"new1\").compute()" ] }, { "cell_type": "code", "execution_count": 382, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. 
Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 17 of 17 rows / 6 columns
\n", "
1 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
customer_id
\n", "
1 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
transactoin_date
\n", "
2 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
ticket_price
\n", "
3 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
discount
\n", "
4 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
product
\n", "
5 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
info
\n", "
6 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy345\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012/01/05\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 2\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " 2009/08/11\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r2\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 4\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy346\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 5\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 6\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r3\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy347\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011/08/11\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015/08/09\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r4\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.91\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy348\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r5\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy349\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "\n", "
Viewing 17 of 17 rows / 6 columns
\n", "
1 partition(s) <class 'dask.dataframe.core.DataFrame'>
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.ext.display(20)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df= df.cols.unnest(\"transactoin_date\", separator=\"/\", splits=3, output_cols=\"transactoin_date\").ext.cache()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.ext.profile(columns=\"*\", infer=True, output=\"json\", flush=True, bins=10)" ] }, { "cell_type": "code", "execution_count": 293, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 17 rows / 6 columns
\n", "
1 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
customer_id
\n", "
1 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
transactoin_date
\n", "
2 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
ticket_price
\n", "
3 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
discount
\n", "
4 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
product
\n", "
5 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
info
\n", "
6 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy345\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012/01/05\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 2\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " 2009/08/11\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r2\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 4\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy346\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 5\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 6\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r3\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy347\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011/08/11\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "\n", "
Viewing 10 of 17 rows / 6 columns
\n", "
1 partition(s) <class 'dask.dataframe.core.DataFrame'>
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.ext.display()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 13 elements available. Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] }, { "data": { "text/plain": [ "{'A': 'string', 'B': 'string'}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.infer_profiler_dtypes(\"*\")" ] }, { "cell_type": "code", "execution_count": 363, "metadata": {}, "outputs": [], "source": [ "df = op.load.file(\"http://159.65.217.17:5003/uploads/datasetFile-1590426328940.csv\").ext.cache()\n", "df = df.ext.optimize()\n", "df = df.ext.repartition(1).ext.cache()" ] }, { "cell_type": "code", "execution_count": 364, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0},\n", " 'transactoin_date': {'mismatch': 11, 'missing': 6, 'match': 0}}" ] }, "execution_count": 364, "metadata": {}, "output_type": "execute_result" } ], "source": [ "col_name = \"customer_id\"\n", "df.cols.count_mismatch({col_name:\"int\",\"transactoin_date\":\"date\"})" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "df = op.load.file(\"data/crime.csv\").ext.cache()" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 447 ms\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
INCIDENT_NUMBEROFFENSE_CODEOFFENSE_CODE_GROUPOFFENSE_DESCRIPTIONDISTRICTREPORTING_AREASHOOTINGOCCURRED_ON_DATEYEARMONTHDAY_OF_WEEKHOURUCR_PARTSTREETLatLongLocation
0I18207094500619LarcenyLARCENY ALL OTHERSD14808NaN2018-09-02 13:00:0020189Sunday13Part OneLINCOLN ST42.35779134-71.13937053(42.35779134, -71.13937053)
1I18207094301402VandalismVANDALISMC11347NaN2018-08-21 00:00:0020188Tuesday0Part TwoHECLA ST42.30682138-71.06030035(42.30682138, -71.06030035)
2I18207094103410TowedTOWED MOTOR VEHICLED4151NaN2018-09-03 19:27:0020189Monday19Part ThreeCAZENOVE ST42.34658879-71.07242943(42.34658879, -71.07242943)
3I18207094003114Investigate PropertyINVESTIGATE PROPERTYD4272NaN2018-09-03 21:16:0020189Monday21Part ThreeNEWCOMB ST42.33418175-71.07866441(42.33418175, -71.07866441)
4I18207093803114Investigate PropertyINVESTIGATE PROPERTYB3421NaN2018-09-03 21:05:0020189Monday21Part ThreeDELHI ST42.27536542-71.09036101(42.27536542, -71.09036101)
......................................................
319068I050310906-0003125Warrant ArrestsWARRANT ARRESTD4285NaN2016-06-05 17:25:0020166Sunday17Part ThreeCOVENTRY ST42.33695098-71.08574813(42.33695098, -71.08574813)
319069I030217815-0800111HomicideMURDER, NON-NEGLIGIENT MANSLAUGHTERE18520NaN2015-07-09 13:38:0020157Thursday13Part OneRIVER ST42.25592648-71.12317207(42.25592648, -71.12317207)
319070I030217815-0803125Warrant ArrestsWARRANT ARRESTE18520NaN2015-07-09 13:38:0020157Thursday13Part ThreeRIVER ST42.25592648-71.12317207(42.25592648, -71.12317207)
319071I010370257-0003125Warrant ArrestsWARRANT ARRESTE13569NaN2016-05-31 19:35:0020165Tuesday19Part ThreeNEW WASHINGTON ST42.30233307-71.11156487(42.30233307, -71.11156487)
31907214205255003125Warrant ArrestsWARRANT ARRESTD4903NaN2015-06-22 00:12:0020156Monday0Part ThreeWASHINGTON ST42.33383935-71.08029038(42.33383935, -71.08029038)
\n", "

319073 rows × 17 columns

\n", "
" ], "text/plain": [ " INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP \\\n", "0 I182070945 00619 Larceny \n", "1 I182070943 01402 Vandalism \n", "2 I182070941 03410 Towed \n", "3 I182070940 03114 Investigate Property \n", "4 I182070938 03114 Investigate Property \n", "... ... ... ... \n", "319068 I050310906-00 03125 Warrant Arrests \n", "319069 I030217815-08 00111 Homicide \n", "319070 I030217815-08 03125 Warrant Arrests \n", "319071 I010370257-00 03125 Warrant Arrests \n", "319072 142052550 03125 Warrant Arrests \n", "\n", " OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING \\\n", "0 LARCENY ALL OTHERS D14 808 NaN \n", "1 VANDALISM C11 347 NaN \n", "2 TOWED MOTOR VEHICLE D4 151 NaN \n", "3 INVESTIGATE PROPERTY D4 272 NaN \n", "4 INVESTIGATE PROPERTY B3 421 NaN \n", "... ... ... ... ... \n", "319068 WARRANT ARREST D4 285 NaN \n", "319069 MURDER, NON-NEGLIGIENT MANSLAUGHTER E18 520 NaN \n", "319070 WARRANT ARREST E18 520 NaN \n", "319071 WARRANT ARREST E13 569 NaN \n", "319072 WARRANT ARREST D4 903 NaN \n", "\n", " OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART \\\n", "0 2018-09-02 13:00:00 2018 9 Sunday 13 Part One \n", "1 2018-08-21 00:00:00 2018 8 Tuesday 0 Part Two \n", "2 2018-09-03 19:27:00 2018 9 Monday 19 Part Three \n", "3 2018-09-03 21:16:00 2018 9 Monday 21 Part Three \n", "4 2018-09-03 21:05:00 2018 9 Monday 21 Part Three \n", "... ... ... ... ... ... ... \n", "319068 2016-06-05 17:25:00 2016 6 Sunday 17 Part Three \n", "319069 2015-07-09 13:38:00 2015 7 Thursday 13 Part One \n", "319070 2015-07-09 13:38:00 2015 7 Thursday 13 Part Three \n", "319071 2016-05-31 19:35:00 2016 5 Tuesday 19 Part Three \n", "319072 2015-06-22 00:12:00 2015 6 Monday 0 Part Three \n", "\n", " STREET Lat Long \\\n", "0 LINCOLN ST 42.35779134 -71.13937053 \n", "1 HECLA ST 42.30682138 -71.06030035 \n", "2 CAZENOVE ST 42.34658879 -71.07242943 \n", "3 NEWCOMB ST 42.33418175 -71.07866441 \n", "4 DELHI ST 42.27536542 -71.09036101 \n", "... ... ... ... 
\n", "319068 COVENTRY ST 42.33695098 -71.08574813 \n", "319069 RIVER ST 42.25592648 -71.12317207 \n", "319070 RIVER ST 42.25592648 -71.12317207 \n", "319071 NEW WASHINGTON ST 42.30233307 -71.11156487 \n", "319072 WASHINGTON ST 42.33383935 -71.08029038 \n", "\n", " Location \n", "0 (42.35779134, -71.13937053) \n", "1 (42.30682138, -71.06030035) \n", "2 (42.34658879, -71.07242943) \n", "3 (42.33418175, -71.07866441) \n", "4 (42.27536542, -71.09036101) \n", "... ... \n", "319068 (42.33695098, -71.08574813) \n", "319069 (42.25592648, -71.12317207) \n", "319070 (42.25592648, -71.12317207) \n", "319071 (42.30233307, -71.11156487) \n", "319072 (42.33383935, -71.08029038) \n", "\n", "[319073 rows x 17 columns]" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "df.cols.cast(\"INCIDENT_NUMBER\",\"int\").compute()" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 319073 rows / 17 columns
\n", "
8 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
INCIDENT_NUMBER
\n", "
1 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_CODE
\n", "
2 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_CODE_GROUP
\n", "
3 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_DESCRIPTION
\n", "
4 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
DISTRICT
\n", "
5 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
REPORTING_AREA
\n", "
6 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
SHOOTING
\n", "
7 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OCCURRED_ON_DATE
\n", "
8 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
YEAR
\n", "
9 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
MONTH
\n", "
10 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
DAY_OF_WEEK
\n", "
11 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
HOUR
\n", "
12 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
UCR_PART
\n", "
13 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
STREET
\n", "
14 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Lat
\n", "
15 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Long
\n", "
16 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Location
\n", "
17 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070945\n", " \n", "
\n", "
\n", "
\n", " \n", " 00619\n", " \n", "
\n", "
\n", "
\n", " \n", " Larceny\n", " \n", "
\n", "
\n", "
\n", " \n", " LARCENY⋅ALL⋅OTHERS\n", " \n", "
\n", "
\n", "
\n", " \n", " D14\n", " \n", "
\n", "
\n", "
\n", " \n", " 808\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-02⋅13:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Sunday\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " LINCOLN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.35779134\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.13937053\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.35779134,⋅-71.13937053)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070943\n", " \n", "
\n", "
\n", "
\n", " \n", " 01402\n", " \n", "
\n", "
\n", "
\n", " \n", " Vandalism\n", " \n", "
\n", "
\n", "
\n", " \n", " VANDALISM\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 347\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-08-21⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " Tuesday\n", " \n", "
\n", "
\n", "
\n", " \n", " 0\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Two\n", " \n", "
\n", "
\n", "
\n", " \n", " HECLA⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30682138\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.06030035\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.30682138,⋅-71.06030035)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070941\n", " \n", "
\n", "
\n", "
\n", " \n", " 03410\n", " \n", "
\n", "
\n", "
\n", " \n", " Towed\n", " \n", "
\n", "
\n", "
\n", " \n", " TOWED⋅MOTOR⋅VEHICLE\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 151\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅19:27:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 19\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " CAZENOVE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.34658879\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07242943\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.34658879,⋅-71.07242943)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070940\n", " \n", "
\n", "
\n", "
\n", " \n", " 03114\n", " \n", "
\n", "
\n", "
\n", " \n", " Investigate⋅Property\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 272\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:16:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " NEWCOMB⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33418175\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07866441\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.33418175,⋅-71.07866441)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070938\n", " \n", "
\n", "
\n", "
\n", " \n", " 03114\n", " \n", "
\n", "
\n", "
\n", " \n", " Investigate⋅Property\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " B3\n", " \n", "
\n", "
\n", "
\n", " \n", " 421\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:05:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " DELHI⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.27536542\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.09036101\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.27536542,⋅-71.09036101)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070936\n", " \n", "
\n", "
\n", "
\n", " \n", " 03820\n", " \n", "
\n", "
\n", "
\n", " \n", " Motor⋅Vehicle⋅Accident⋅Response\n", " \n", "
\n", "
\n", "
\n", " \n", " M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 398\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:09:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " TALBOT⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29019621\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07159012\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.29019621,⋅-71.07159012)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070933\n", " \n", "
\n", "
\n", "
\n", " \n", " 00724\n", " \n", "
\n", "
\n", "
\n", " \n", " Auto⋅Theft\n", " \n", "
\n", "
\n", "
\n", " \n", " AUTO⋅THEFT\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 330\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:25:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " NORMANDY⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30607218\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.08273260\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.30607218,⋅-71.08273260)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070932\n", " \n", "
\n", "
\n", "
\n", " \n", " 03301\n", " \n", "
\n", "
\n", "
\n", " \n", " Verbal⋅Disputes\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 584\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:39:37\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " LAWN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.32701648\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.10555088\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.32701648,⋅-71.10555088)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070931\n", " \n", "
\n", "
\n", "
\n", " \n", " 00301\n", " \n", "
\n", "
\n", "
\n", " \n", " Robbery\n", " \n", "
\n", "
\n", "
\n", " \n", " ROBBERY⋅-⋅STREET\n", " \n", "
\n", "
\n", "
\n", " \n", " C6\n", " \n", "
\n", "
\n", "
\n", " \n", " 177\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:48:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " MASSACHUSETTS⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33152148\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07085307\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.33152148,⋅-71.07085307)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070929\n", " \n", "
\n", "
\n", "
\n", " \n", " 03301\n", " \n", "
\n", "
\n", "
\n", " \n", " Verbal⋅Disputes\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 364\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:38:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " LESLIE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29514664\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.05860832\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.29514664,⋅-71.05860832)\n", " \n", "
\n", "
\n", "\n", "
Viewing 10 of 319073 rows / 17 columns
\n", "
8 partition(s) <class 'dask.dataframe.core.DataFrame'>
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.ext.display()" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "nan" ] }, "execution_count": 140, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "a = np.array([1,2,3])\n", "b = np.array([1,2,\"a\"])\n", "\n", "def func(a, b):\n", " try:\n", " r =a+b\n", " except:\n", " r = np.nan\n", " return r\n", "\n", "func(a,b)" ] }, { "cell_type": "code", "execution_count": 224, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['I182070945', '808'], dtype=object)" ] }, "execution_count": 224, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[[\"INCIDENT_NUMBER\",\"REPORTING_AREA\"]].compute().to_numpy()[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def op(value):\n", " return value\n", "# return value[\"INCIDENT_NUMBER\"]\n", "# print(value)\n", "# try:\n", "# return value[\"INCIDENT_NUMBER\"] + value[\"REPORTING_AREA\"]\n", "# except():\n", "# return np.nan\n", "\n", "def func(pdf): \n", "# print(type(pdf))\n", " return pdf.apply(op, axis=1)\n", " \n", "\n", "df[[\"INCIDENT_NUMBER\",\"REPORTING_AREA\"]].map_partitions(func, meta=object).compute()" ] }, { "cell_type": "code", "execution_count": 198, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Wall time: 5.37 s\n" ] } ], "source": [ "%%time\n", "from numpy.core._exceptions import UFuncTypeError\n", "\n", "def func(a,b):\n", " return a+b\n", "\n", "def myfunc(a, b, expr):\n", " \"Return a-b if a>b, otherwise return a+b\"\n", " \n", " try:\n", " return func(a,b)\n", " except (UFuncTypeError, TypeError):\n", " return np.nan\n", " \n", "vfunc = np.vectorize(myfunc)\n", "print(func)\n", "arr= [1, 2, 3,4,5,6,7,8,9,\"A\"]*1000000\n", "\n", "b = vfunc(arr, 2, func)\n", "# vfunc([1, 2, 3, \"A\"], 2)" 
] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10000000" ] }, "execution_count": 189, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(arr)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "df = df.ext.repartition(8)" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 326 ms\n" ] }, { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", " ... \n", "319068 False\n", "319069 False\n", "319070 False\n", "319071 False\n", "319072 True\n", "Name: INCIDENT_NUMBER, Length: 319073, dtype: bool" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "comp = re.compile(\"^(?=.)([+-]?([0-9]*)(\\.([0-9]+))?)$\") \n", "%time df[\"INCIDENT_NUMBER\"].str.match(comp).compute()" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['INCIDENT_NUMBER',\n", " 'OFFENSE_CODE',\n", " 'OFFENSE_CODE_GROUP',\n", " 'OFFENSE_DESCRIPTION',\n", " 'DISTRICT',\n", " 'REPORTING_AREA',\n", " 'SHOOTING',\n", " 'OCCURRED_ON_DATE',\n", " 'YEAR',\n", " 'MONTH',\n", " 'DAY_OF_WEEK',\n", " 'HOUR',\n", " 'UCR_PART',\n", " 'STREET',\n", " 'Lat',\n", " 'Long',\n", " 'Location']" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.names()" ] }, { "cell_type": "code", "execution_count": 366, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "datetime.datetime(2018, 9, 2, 13, 0)" ] }, "execution_count": 366, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from dateutil.parser import parse as dparse\n", "dparse(\"2018-09-02 13:00:00\")" ] }, { "cell_type": "code", "execution_count": 367, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", 
"\n", "
Viewing 10 of 319073 rows / 17 columns
\n", "
1 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
INCIDENT_NUMBER
\n", "
1 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_CODE
\n", "
2 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_CODE_GROUP
\n", "
3 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_DESCRIPTION
\n", "
4 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
DISTRICT
\n", "
5 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
REPORTING_AREA
\n", "
6 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
SHOOTING
\n", "
7 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OCCURRED_ON_DATE
\n", "
8 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
YEAR
\n", "
9 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
MONTH
\n", "
10 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
DAY_OF_WEEK
\n", "
11 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
HOUR
\n", "
12 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
UCR_PART
\n", "
13 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
STREET
\n", "
14 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Lat
\n", "
15 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Long
\n", "
16 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Location
\n", "
17 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070945\n", " \n", "
\n", "
\n", "
\n", " \n", " 00619\n", " \n", "
\n", "
\n", "
\n", " \n", " Larceny\n", " \n", "
\n", "
\n", "
\n", " \n", " LARCENY⋅ALL⋅OTHERS\n", " \n", "
\n", "
\n", "
\n", " \n", " D14\n", " \n", "
\n", "
\n", "
\n", " \n", " 808\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-02⋅13:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Sunday\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " LINCOLN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.35779134\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.13937053\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.35779134,⋅-71.13937053)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070943\n", " \n", "
\n", "
\n", "
\n", " \n", " 01402\n", " \n", "
\n", "
\n", "
\n", " \n", " Vandalism\n", " \n", "
\n", "
\n", "
\n", " \n", " VANDALISM\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 347\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-08-21⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " Tuesday\n", " \n", "
\n", "
\n", "
\n", " \n", " 0\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Two\n", " \n", "
\n", "
\n", "
\n", " \n", " HECLA⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30682138\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.06030035\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.30682138,⋅-71.06030035)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070941\n", " \n", "
\n", "
\n", "
\n", " \n", " 03410\n", " \n", "
\n", "
\n", "
\n", " \n", " Towed\n", " \n", "
\n", "
\n", "
\n", " \n", " TOWED⋅MOTOR⋅VEHICLE\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 151\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅19:27:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 19\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " CAZENOVE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.34658879\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07242943\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.34658879,⋅-71.07242943)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070940\n", " \n", "
\n", "
\n", "
\n", " \n", " 03114\n", " \n", "
\n", "
\n", "
\n", " \n", " Investigate⋅Property\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 272\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:16:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " NEWCOMB⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33418175\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07866441\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.33418175,⋅-71.07866441)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070938\n", " \n", "
\n", "
\n", "
\n", " \n", " 03114\n", " \n", "
\n", "
\n", "
\n", " \n", " Investigate⋅Property\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " B3\n", " \n", "
\n", "
\n", "
\n", " \n", " 421\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:05:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " DELHI⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.27536542\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.09036101\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.27536542,⋅-71.09036101)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070936\n", " \n", "
\n", "
\n", "
\n", " \n", " 03820\n", " \n", "
\n", "
\n", "
\n", " \n", " Motor⋅Vehicle⋅Accident⋅Response\n", " \n", "
\n", "
\n", "
\n", " \n", " M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 398\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:09:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " TALBOT⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29019621\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07159012\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.29019621,⋅-71.07159012)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070933\n", " \n", "
\n", "
\n", "
\n", " \n", " 00724\n", " \n", "
\n", "
\n", "
\n", " \n", " Auto⋅Theft\n", " \n", "
\n", "
\n", "
\n", " \n", " AUTO⋅THEFT\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 330\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:25:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " NORMANDY⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30607218\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.08273260\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.30607218,⋅-71.08273260)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070932\n", " \n", "
\n", "
\n", "
\n", " \n", " 03301\n", " \n", "
\n", "
\n", "
\n", " \n", " Verbal⋅Disputes\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 584\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:39:37\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " LAWN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.32701648\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.10555088\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.32701648,⋅-71.10555088)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070931\n", " \n", "
\n", "
\n", "
\n", " \n", " 00301\n", " \n", "
\n", "
\n", "
\n", " \n", " Robbery\n", " \n", "
\n", "
\n", "
\n", " \n", " ROBBERY⋅-⋅STREET\n", " \n", "
\n", "
\n", "
\n", " \n", " C6\n", " \n", "
\n", "
\n", "
\n", " \n", " 177\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:48:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " MASSACHUSETTS⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33152148\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07085307\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.33152148,⋅-71.07085307)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070929\n", " \n", "
\n", "
\n", "
\n", " \n", " 03301\n", " \n", "
\n", "
\n", "
\n", " \n", " Verbal⋅Disputes\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 364\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:38:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " LESLIE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29514664\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.05860832\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.29514664,⋅-71.05860832)\n", " \n", "
\n", "
\n", "\n", "
Viewing 10 of 319073 rows / 17 columns
\n", "
1 partition(s) <class 'dask.dataframe.core.DataFrame'>
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.ext.display()" ] }, { "cell_type": "code", "execution_count": 368, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'INCIDENT_NUMBER': 'string',\n", " 'OFFENSE_CODE': 'zip_code',\n", " 'OFFENSE_CODE_GROUP': 'string',\n", " 'OFFENSE_DESCRIPTION': 'string',\n", " 'DISTRICT': 'string',\n", " 'REPORTING_AREA': 'int',\n", " 'SHOOTING': 'object',\n", " 'OCCURRED_ON_DATE': 'string',\n", " 'YEAR': 'int',\n", " 'MONTH': 'int',\n", " 'DAY_OF_WEEK': 'string',\n", " 'HOUR': 'int',\n", " 'UCR_PART': 'string',\n", " 'STREET': 'string',\n", " 'Lat': 'decimal',\n", " 'Long': 'decimal',\n", " 'Location': 'string'}" ] }, "execution_count": 368, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.infer_profiler_dtypes(\"*\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 17 of 17 rows / 6 columns
\n", "
1 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
customer_id
\n", "
1 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
transactoin_date
\n", "
2 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
ticket_price
\n", "
3 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
discount
\n", "
4 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
product
\n", "
5 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
info
\n", "
6 (uint8)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy345\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012/01/05\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 2\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " 2009/08/11\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r2\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 4\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy346\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 29.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " platinum\n", " \n", "
\n", "
\n", "
\n", " \n", " 5\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 6\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r3\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy347\n", " \n", "
\n", "
\n", "
\n", " \n", " 2010/08/19\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011/08/11\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015/08/09\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r4\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.91\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy348\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 9.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " basic\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fju234978rfjkhsdf\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f34ruiy23e78y2r\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0fue298y2r23r23r5\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " 5%\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 0f345kjh345oiuy349\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 14.99\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " deluxe\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "\n", "
Viewing 17 of 17 rows / 6 columns
\n", "
1 partition(s) <class 'dask.dataframe.core.DataFrame'>
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.ext.display(20)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'customer_id': {'mismatch': 0, 'missing': 0, 'match': 17, 'profiler_dtype': 'string'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'int'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'decimal'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'boolean'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'date'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'array'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'object'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'gender'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'ip'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'url'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'email'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'credit_card_number'}}\n", "{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'zip_code'}}\n" ] } ], "source": [ "col_name =\"customer_id\"\n", "\n", "print(df.cols.count_mismatch({col_name:\"int\"}))\n", "print(df.cols.count_mismatch({col_name:\"decimal\"}))\n", "print(df.cols.count_mismatch({col_name:\"boolean\"}))\n", "print(df.cols.count_mismatch({col_name:\"date\"}))\n", "print(df.cols.count_mismatch({col_name:\"array\"}))\n", "print(df.cols.count_mismatch({col_name:\"object\"}))\n", "print(df.cols.count_mismatch({col_name:\"gender\"}))\n", "print(df.cols.count_mismatch({col_name:\"ip\"}))\n", 
"print(df.cols.count_mismatch({col_name:\"url\"}))\n", "print(df.cols.count_mismatch({col_name:\"email\"}))\n", "print(df.cols.count_mismatch({col_name:\"credit_card_number\"}))\n", "print(df.cols.count_mismatch({col_name:\"zip_code\"}))" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cols_and_inferred_dtype {'customer_id': 'string'}\n", "{\"columns\": {\"customer_id\": {\"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 3, \"frequency\": [{\"value\": \"0fju234978rfjkhsdf\", \"count\": 4}, {\"value\": \"0f34ruiy23e78y2r\", \"count\": 4}, {\"value\": \"0fue298y2r23r23r5\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r4\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r3\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r2\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy349\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy348\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy347\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy346\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy345\", \"count\": 1}], \"count_uniques\": 11}, \"dtype\": \"object\", \"profiler_dtype\": \"string\"}}, \"name\": null, \"file_name\": \"tmpthwlzht1.csv\", \"summary\": {\"cols_count\": 6, \"rows_count\": 17, \"dtypes_list\": [\"object\", \"uint8\"], \"total_count_dtypes\": 2, \"missing_count\": 0, \"p_missing\": 0.0}}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. 
Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] } ], "source": [ "\n", "_output = df.ext.profile(columns=col_name, infer=True, output=\"json\", flush=True)\n", "# _output = df.ext.set_buffer(\"*\")\n", "# _output = df.ext.set_buffer(\"*\")\n", "# _output = df.ext.buffer_window(\"*\", 0, 17).ext.to_json(\"*\")\n", "print(_output)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'customer_id': 'string', 'transactoin_date': 'string', 'ticket_price': 'decimal', 'discount': 'object', 'product': 'string', 'info': 'int'}\n", "{'customer_id': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'string'}, 'transactoin_date': {'mismatch': 0, 'missing': 3, 'match': 0, 'profiler_dtype': 'string'}, 'ticket_price': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'decimal'}, 'discount': {'mismatch': 1, 'missing': 2, 'match': 0, 'profiler_dtype': 'object'}, 'product': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'string'}, 'info': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'int'}}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. 
Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] } ], "source": [ "cols_and_inferred_dtype = df.cols.infer_profiler_dtypes(\"*\")\n", "print(cols_and_inferred_dtype)\n", "mismatch = df.cols.count_mismatch(cols_and_inferred_dtype, infer=True, compute=True)\n", "print(mismatch)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "df = op.load.file(\"data/crime.csv\").ext.cache()\n", "df = df.ext.optimize().ext.cache()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'OFFENSE_DESCRIPTION': {'mismatch': 0,\n", " 'missing': 0,\n", " 'match': 319073,\n", " 'profiler_dtype': 'string'}}" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.count_mismatch({\"OFFENSE_DESCRIPTION\":\"string\"})" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 319073 rows / 17 columns
\n", "
1 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
INCIDENT_NUMBER
\n", "
1 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_CODE
\n", "
2 (uint16)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_CODE_GROUP
\n", "
3 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OFFENSE_DESCRIPTION
\n", "
4 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
DISTRICT
\n", "
5 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
REPORTING_AREA
\n", "
6 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
SHOOTING
\n", "
7 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
OCCURRED_ON_DATE
\n", "
8 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
YEAR
\n", "
9 (uint16)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
MONTH
\n", "
10 (uint8)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
DAY_OF_WEEK
\n", "
11 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
HOUR
\n", "
12 (uint8)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
UCR_PART
\n", "
13 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
STREET
\n", "
14 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Lat
\n", "
15 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Long
\n", "
16 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Location
\n", "
17 (object)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070945\n", " \n", "
\n", "
\n", "
\n", " \n", " 619\n", " \n", "
\n", "
\n", "
\n", " \n", " Larceny\n", " \n", "
\n", "
\n", "
\n", " \n", " LARCENY⋅ALL⋅OTHERS\n", " \n", "
\n", "
\n", "
\n", " \n", " D14\n", " \n", "
\n", "
\n", "
\n", " \n", " 808\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-02⋅13:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Sunday\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " LINCOLN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.35779134\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.13937053\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.35779134,⋅-71.13937053)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070943\n", " \n", "
\n", "
\n", "
\n", " \n", " 1402\n", " \n", "
\n", "
\n", "
\n", " \n", " Vandalism\n", " \n", "
\n", "
\n", "
\n", " \n", " VANDALISM\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 347\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-08-21⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " Tuesday\n", " \n", "
\n", "
\n", "
\n", " \n", " 0\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Two\n", " \n", "
\n", "
\n", "
\n", " \n", " HECLA⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30682138\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.06030035\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.30682138,⋅-71.06030035)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070941\n", " \n", "
\n", "
\n", "
\n", " \n", " 3410\n", " \n", "
\n", "
\n", "
\n", " \n", " Towed\n", " \n", "
\n", "
\n", "
\n", " \n", " TOWED⋅MOTOR⋅VEHICLE\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 151\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅19:27:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 19\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " CAZENOVE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.34658879\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07242943\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.34658879,⋅-71.07242943)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070940\n", " \n", "
\n", "
\n", "
\n", " \n", " 3114\n", " \n", "
\n", "
\n", "
\n", " \n", " Investigate⋅Property\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 272\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:16:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " NEWCOMB⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33418175\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07866441\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.33418175,⋅-71.07866441)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070938\n", " \n", "
\n", "
\n", "
\n", " \n", " 3114\n", " \n", "
\n", "
\n", "
\n", " \n", " Investigate⋅Property\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " B3\n", " \n", "
\n", "
\n", "
\n", " \n", " 421\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:05:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " DELHI⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.27536542\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.09036101\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.27536542,⋅-71.09036101)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070936\n", " \n", "
\n", "
\n", "
\n", " \n", " 3820\n", " \n", "
\n", "
\n", "
\n", " \n", " Motor⋅Vehicle⋅Accident⋅Response\n", " \n", "
\n", "
\n", "
\n", " \n", " M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 398\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:09:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " TALBOT⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29019621\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07159012\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.29019621,⋅-71.07159012)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070933\n", " \n", "
\n", "
\n", "
\n", " \n", " 724\n", " \n", "
\n", "
\n", "
\n", " \n", " Auto⋅Theft\n", " \n", "
\n", "
\n", "
\n", " \n", " AUTO⋅THEFT\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 330\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:25:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " NORMANDY⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30607218\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.08273260\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.30607218,⋅-71.08273260)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070932\n", " \n", "
\n", "
\n", "
\n", " \n", " 3301\n", " \n", "
\n", "
\n", "
\n", " \n", " Verbal⋅Disputes\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 584\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:39:37\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " LAWN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.32701648\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.10555088\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.32701648,⋅-71.10555088)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070931\n", " \n", "
\n", "
\n", "
\n", " \n", " 301\n", " \n", "
\n", "
\n", "
\n", " \n", " Robbery\n", " \n", "
\n", "
\n", "
\n", " \n", " ROBBERY⋅-⋅STREET\n", " \n", "
\n", "
\n", "
\n", " \n", " C6\n", " \n", "
\n", "
\n", "
\n", " \n", " 177\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:48:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " MASSACHUSETTS⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33152148\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07085307\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.33152148,⋅-71.07085307)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070929\n", " \n", "
\n", "
\n", "
\n", " \n", " 3301\n", " \n", "
\n", "
\n", "
\n", " \n", " Verbal⋅Disputes\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 364\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:38:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " LESLIE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29514664\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.05860832\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.29514664,⋅-71.05860832)\n", " \n", "
\n", "
\n", "\n", "
Viewing 10 of 319073 rows / 17 columns
\n", "
1 partition(s) <class 'dask.dataframe.core.DataFrame'>
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.ext.display()" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of partitions 8\n", " x y z\n", "13 13 13 13\n", "14 14 14 14\n", "15 15 15 15\n", "16 16 16 16\n", "17 17 17 17\n", "18 18 18 18\n", "19 19 19 19\n", "20 20 20 20\n", "21 21 21 21\n", "22 22 22 22\n", "23 23 23 23\n", "24 24 24 24\n", "25 25 25 25 x y z\n", "26 26 26 26\n", "27 27 27 27\n", "28 28 28 28\n", "29 29 29 29\n", "30 30 30 30\n", "31 31 31 31\n", "32 32 32 32\n", "33 33 33 33\n", "34 34 34 34\n", "35 35 35 35\n", "36 36 36 36\n", "37 37 37 37\n", "38 38 38 38\n", "99999999999999999999999999999999999999999999999\n", " x y z\n", "39 39 39 39\n", "40 40 40 40\n", "41 41 41 41\n", "42 42 42 42\n", "43 43 43 43\n", "44 44 44 44\n", "45 45 45 45\n", "46 46 46 46\n", "47 47 47 47\n", "48 48 48 48\n", "49 49 49 49\n", "50 50 50 50\n", "51 51 51 51\n", "99999999999999999999999999999999999999999999999\n", " x y z\n", "0 0 0 0\n", "1 1 1 1\n", "2 2 2 2\n", "3 3 3 3\n", "4 4 4 4\n", "5 5 5 5\n", "6 6 6 6\n", "7 7 7 7\n", "8 8 8 8\n", "9 9 9 9\n", "10 10 10 10\n", "11 11 11 11\n", "12 12 12 12\n", "99999999999999999999999999999999999999999999999\n", "\n", "99999999999999999999999999999999999999999999999 x y z\n", "52 52 52 52\n", "53 53 53 53\n", "54 54 54 54\n", "55 55 55 55\n", "56 56 56 56\n", "57 57 57 57\n", "58 58 58 58\n", "59 59 59 59\n", "60 60 60 60\n", "61 61 61 61\n", "62 62 62 62\n", "63 63 63 63\n", "64 64 64 64\n", "99999999999999999999999999999999999999999999999\n", " x y z\n", "91 91 91 91\n", "92 92 92 92\n", "93 93 93 93\n", "94 94 94 94\n", "95 95 95 95\n", "96 96 96 96\n", "97 97 97 97\n", "98 98 98 98\n", "99 99 99 99\n", "99999999999999999999999999999999999999999999999\n", "\n", " x y z\n", "65 65 65 65\n", "66 66 66 66\n", "67 67 67 67\n", "68 68 68 68\n", "69 69 69 69\n", "70 70 70 
70\n", "71 71 71 71\n", "72 72 72 72\n", "73 73 73 73\n", "74 74 74 74\n", "75 75 75 75\n", "76 76 76 76\n", "77 77 77 77\n", "99999999999999999999999999999999999999999999999\n", " x y z\n", "78 78 78 78\n", "79 79 79 79\n", "80 80 80 80\n", "81 81 81 81\n", "82 82 82 82\n", "83 83 83 83\n", "84 84 84 84\n", "85 85 85 85\n", "86 86 86 86\n", "87 87 87 87\n", "88 88 88 88\n", "89 89 89 89\n", "90 90 90 90\n", "99999999999999999999999999999999999999999999999\n", " 0 1 \\\n", "0 {'sum': 0, 'min': 0, 'max': 0} {'sum': 3, 'min': 1, 'max': 1} \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "\n", " 2 3 \\\n", "0 {'sum': 6, 'min': 2, 'max': 2} {'sum': 9, 'min': 3, 'max': 3} \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "\n", " 4 5 \\\n", "0 {'sum': 12, 'min': 4, 'max': 4} {'sum': 15, 'min': 5, 'max': 5} \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "\n", " 6 7 \\\n", "0 {'sum': 18, 'min': 6, 'max': 6} {'sum': 21, 'min': 7, 'max': 7} \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "\n", " 8 9 ... \\\n", "0 {'sum': 24, 'min': 8, 'max': 8} {'sum': 27, 'min': 9, 'max': 9} ... \n", "0 NaN NaN ... \n", "0 NaN NaN ... \n", "0 NaN NaN ... \n", "0 NaN NaN ... \n", "0 NaN NaN ... \n", "0 NaN NaN ... \n", "0 NaN NaN ... 
\n", "\n", " 90 91 \\\n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 {'sum': 270, 'min': 90, 'max': 90} NaN \n", "0 NaN {'sum': 273, 'min': 91, 'max': 91} \n", "\n", " 92 93 \\\n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 {'sum': 276, 'min': 92, 'max': 92} {'sum': 279, 'min': 93, 'max': 93} \n", "\n", " 94 95 \\\n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 {'sum': 282, 'min': 94, 'max': 94} {'sum': 285, 'min': 95, 'max': 95} \n", "\n", " 96 97 \\\n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 {'sum': 288, 'min': 96, 'max': 96} {'sum': 291, 'min': 97, 'max': 97} \n", "\n", " 98 99 \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 {'sum': 294, 'min': 98, 'max': 98} {'sum': 297, 'min': 99, 'max': 99} \n", "\n", "[8 rows x 100 columns]\n", "--\n" ] } ], "source": [ "import functools\n", "import dask\n", "import dask.dataframe as dd\n", "import pandas as pd\n", "\n", "pdf = pd.DataFrame({\n", " 'x': range(0, 100),\n", " 'y': range(0, 100),\n", " 'z': range(0, 100)\n", "})\n", "\n", "ddf = dd.from_pandas(pdf, npartitions=8)\n", "\n", "print('Number of partitions', ddf.npartitions)\n", "\n", "\n", "def compute_stats(row):\n", " return {\n", " 'sum': row['x'] + row['y'] + row['z'],\n", " 'min': min(row),\n", " 'max': max(row)\n", " }\n", "\n", "\n", "def accum_stats(stats_accum, stats):\n", " return {\n", " 'sum': stats_accum['sum'] + stats['sum'],\n", " 'min': min(stats_accum['min'], stats['min']),\n", " 'max': max(stats_accum['max'], stats['max'])\n", " }\n", "\n", "\n", "def compute_stats_partition(pdf):\n", " pds = pdf.apply(compute_stats, axis=1)\n", " print(pdf)\n", " 
print(\"99999999999999999999999999999999999999999999999\")\n", " # Chunk step: fold this partition's per-row stats into a single dict.\n", " return functools.reduce(accum_stats, pds)\n", "\n", "\n", "def merge_stats_series(pds):\n", " # Aggregate step of ddf.reduction: fold the collected per-partition\n", " # stats into one dict with accum_stats.\n", " return functools.reduce(accum_stats, pds)\n", "\n", "\n", "res = ddf.reduction(\n", " compute_stats_partition,\n", " merge_stats_series,\n", " meta={\n", " 'sum': 'int64',\n", " 'min': 'int64',\n", " 'max': 'int64'\n", " })\n", "\n", "# singleton dataframe to list of delayed objects\n", "# where each row is a delayed object\n", "# and in this case we just want the first one\n", "delayed_dict = res.to_delayed()[0]\n", "a = dd.compute(delayed_dict)" ] }, { "cell_type": "code", "execution_count": 361, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\AppData\\Roaming\\Python\\Python37\\site-packages\\dask\\dataframe\\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. 
Try passing larger `npartitions` to `head`.\n", " warnings.warn(msg.format(n, len(r)))\n" ] } ], "source": [ "df = op.load.file(\"http://159.65.217.17:5003/uploads/datasetFile-1590447280612.csv\").ext.cache()\n", "df = df.ext.optimize()\n", "df = df.ext.repartition(8).ext.cache()\n", "_output = df.ext.profile(columns=\"*\", infer=True, output=\"json\")" ] }, { "cell_type": "code", "execution_count": 362, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"columns\": {\"customer_id\": {\"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": \"0fju234978rfjkhsdf\", \"count\": 4}, {\"value\": \"0f34ruiy23e78y2r\", \"count\": 4}, {\"value\": \"0fue298y2r23r23r5\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r4\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r3\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r2\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy349\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy348\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy347\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy346\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy345\", \"count\": 1}], \"count_uniques\": 11}, \"dtype\": \"object\", \"profiler_dtype\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": \"0fju234978rfjkhsdf\", \"count\": 4}, {\"value\": \"0f34ruiy23e78y2r\", \"count\": 4}, {\"value\": \"0fue298y2r23r23r5\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r4\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r3\", \"count\": 1}, {\"value\": \"0fue298y2r23r23r2\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy349\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy348\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy347\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy346\", \"count\": 1}, {\"value\": \"0f345kjh345oiuy345\", \"count\": 1}], \"count_uniques\": 11}}, \"transactoin_date\": {\"stats\": {\"mismatch\": 0, \"missing\": 6, \"match\": 11, \"frequency\": [{\"value\": \"2010/08/19\", 
\"count\": 7}, {\"value\": \"2015/08/09\", \"count\": 1}, {\"value\": \"2012/01/05\", \"count\": 1}, {\"value\": \"2011/08/11\", \"count\": 1}, {\"value\": \"2009/08/11\", \"count\": 1}], \"count_uniques\": 5}, \"dtype\": \"object\", \"profiler_dtype\": {\"mismatch\": 0, \"missing\": 6, \"match\": 11, \"frequency\": [{\"value\": \"2010/08/19\", \"count\": 7}, {\"value\": \"2015/08/09\", \"count\": 1}, {\"value\": \"2012/01/05\", \"count\": 1}, {\"value\": \"2011/08/11\", \"count\": 1}, {\"value\": \"2009/08/11\", \"count\": 1}], \"count_uniques\": 5}}, \"ticket_price\": {\"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": \"9.99\", \"count\": 7}, {\"value\": \"29.99\", \"count\": 5}, {\"value\": \"14.99\", \"count\": 4}, {\"value\": \"9.91\", \"count\": 1}], \"count_uniques\": 4}, \"dtype\": \"object\", \"profiler_dtype\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": \"9.99\", \"count\": 7}, {\"value\": \"29.99\", \"count\": 5}, {\"value\": \"14.99\", \"count\": 4}, {\"value\": \"9.91\", \"count\": 1}], \"count_uniques\": 4}}, \"discount\": {\"stats\": {\"mismatch\": 5, \"missing\": 12, \"match\": 0, \"frequency\": [{\"value\": \"5%\", \"count\": 5}], \"count_uniques\": 1}, \"dtype\": \"object\", \"profiler_dtype\": {\"mismatch\": 5, \"missing\": 12, \"match\": 0, \"frequency\": [{\"value\": \"5%\", \"count\": 5}], \"count_uniques\": 1}}, \"product\": {\"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": \"basic\", \"count\": 8}, {\"value\": \"platinum\", \"count\": 5}, {\"value\": \"deluxe\", \"count\": 4}], \"count_uniques\": 3}, \"dtype\": \"object\", \"profiler_dtype\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"frequency\": [{\"value\": \"basic\", \"count\": 8}, {\"value\": \"platinum\", \"count\": 5}, {\"value\": \"deluxe\", \"count\": 4}], \"count_uniques\": 3}}, \"info\": {\"stats\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"hist\": 
[{\"lower\": 1.0, \"upper\": 1.28125, \"count\": 0}, {\"lower\": 1.28125, \"upper\": 1.5625, \"count\": 0}, {\"lower\": 1.5625, \"upper\": 1.84375, \"count\": 0}, {\"lower\": 1.84375, \"upper\": 2.125, \"count\": 0}, {\"lower\": 2.125, \"upper\": 2.40625, \"count\": 0}, {\"lower\": 2.40625, \"upper\": 2.6875, \"count\": 0}, {\"lower\": 2.6875, \"upper\": 2.96875, \"count\": 0}, {\"lower\": 2.96875, \"upper\": 3.25, \"count\": 0}, {\"lower\": 3.25, \"upper\": 3.53125, \"count\": 0}, {\"lower\": 3.53125, \"upper\": 3.8125, \"count\": 0}, {\"lower\": 3.8125, \"upper\": 4.09375, \"count\": 0}, {\"lower\": 4.09375, \"upper\": 4.375, \"count\": 0}, {\"lower\": 4.375, \"upper\": 4.65625, \"count\": 0}, {\"lower\": 4.65625, \"upper\": 4.9375, \"count\": 0}, {\"lower\": 4.9375, \"upper\": 5.21875, \"count\": 0}, {\"lower\": 5.21875, \"upper\": 5.5, \"count\": 0}, {\"lower\": 5.5, \"upper\": 5.78125, \"count\": 0}, {\"lower\": 5.78125, \"upper\": 6.0625, \"count\": 0}, {\"lower\": 6.0625, \"upper\": 6.34375, \"count\": 0}, {\"lower\": 6.34375, \"upper\": 6.625, \"count\": 0}, {\"lower\": 6.625, \"upper\": 6.90625, \"count\": 0}, {\"lower\": 6.90625, \"upper\": 7.1875, \"count\": 0}, {\"lower\": 7.1875, \"upper\": 7.46875, \"count\": 0}, {\"lower\": 7.46875, \"upper\": 7.75, \"count\": 0}, {\"lower\": 7.75, \"upper\": 8.03125, \"count\": 0}, {\"lower\": 8.03125, \"upper\": 8.3125, \"count\": 0}, {\"lower\": 8.3125, \"upper\": 8.59375, \"count\": 0}, {\"lower\": 8.59375, \"upper\": 8.875, \"count\": 0}, {\"lower\": 8.875, \"upper\": 9.15625, \"count\": 0}, {\"lower\": 9.15625, \"upper\": 9.4375, \"count\": 0}, {\"lower\": 9.4375, \"upper\": 9.71875, \"count\": 0}, {\"lower\": 9.71875, \"upper\": 10.0, \"count\": 3}], \"count_uniques\": 10}, \"dtype\": \"uint8\", \"profiler_dtype\": {\"mismatch\": 0, \"missing\": 0, \"match\": 17, \"hist\": [{\"lower\": 1.0, \"upper\": 1.28125, \"count\": 0}, {\"lower\": 1.28125, \"upper\": 1.5625, \"count\": 0}, {\"lower\": 1.5625, \"upper\": 
1.84375, \"count\": 0}, {\"lower\": 1.84375, \"upper\": 2.125, \"count\": 0}, {\"lower\": 2.125, \"upper\": 2.40625, \"count\": 0}, {\"lower\": 2.40625, \"upper\": 2.6875, \"count\": 0}, {\"lower\": 2.6875, \"upper\": 2.96875, \"count\": 0}, {\"lower\": 2.96875, \"upper\": 3.25, \"count\": 0}, {\"lower\": 3.25, \"upper\": 3.53125, \"count\": 0}, {\"lower\": 3.53125, \"upper\": 3.8125, \"count\": 0}, {\"lower\": 3.8125, \"upper\": 4.09375, \"count\": 0}, {\"lower\": 4.09375, \"upper\": 4.375, \"count\": 0}, {\"lower\": 4.375, \"upper\": 4.65625, \"count\": 0}, {\"lower\": 4.65625, \"upper\": 4.9375, \"count\": 0}, {\"lower\": 4.9375, \"upper\": 5.21875, \"count\": 0}, {\"lower\": 5.21875, \"upper\": 5.5, \"count\": 0}, {\"lower\": 5.5, \"upper\": 5.78125, \"count\": 0}, {\"lower\": 5.78125, \"upper\": 6.0625, \"count\": 0}, {\"lower\": 6.0625, \"upper\": 6.34375, \"count\": 0}, {\"lower\": 6.34375, \"upper\": 6.625, \"count\": 0}, {\"lower\": 6.625, \"upper\": 6.90625, \"count\": 0}, {\"lower\": 6.90625, \"upper\": 7.1875, \"count\": 0}, {\"lower\": 7.1875, \"upper\": 7.46875, \"count\": 0}, {\"lower\": 7.46875, \"upper\": 7.75, \"count\": 0}, {\"lower\": 7.75, \"upper\": 8.03125, \"count\": 0}, {\"lower\": 8.03125, \"upper\": 8.3125, \"count\": 0}, {\"lower\": 8.3125, \"upper\": 8.59375, \"count\": 0}, {\"lower\": 8.59375, \"upper\": 8.875, \"count\": 0}, {\"lower\": 8.875, \"upper\": 9.15625, \"count\": 0}, {\"lower\": 9.15625, \"upper\": 9.4375, \"count\": 0}, {\"lower\": 9.4375, \"upper\": 9.71875, \"count\": 0}, {\"lower\": 9.71875, \"upper\": 10.0, \"count\": 3}], \"count_uniques\": 10}}}, \"name\": null, \"file_name\": \"tmp_vk79fcm.csv\", \"summary\": {\"cols_count\": 6, \"rows_count\": 17, \"dtypes_list\": [\"object\", \"uint8\"], \"total_count_dtypes\": 2, \"missing_count\": 0, \"p_missing\": 0.0}}\n" ] } ], "source": [ "print(_output)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1, 2, 3)" ] 
}, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##Bag\n", "# each element is an integer\n", "import dask.bag as db\n", "b = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], npartitions=1)\n", "b.take(3)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 850 ms\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
01
12
23
34
45
56
67
78
89
910
\n", "
" ], "text/plain": [ " 0\n", "0 1\n", "1 2\n", "2 3\n", "3 4\n", "4 5\n", "5 6\n", "6 7\n", "7 8\n", "8 9\n", "9 10" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "b.to_dataframe().compute()" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "from optimus.helpers.functions import match_date" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['dd', ' ', 'MM', ' ', 'yy', ' ', 'h']\n", "False\n" ] } ], "source": [ "import re\n", "pattern = match_date(\"dd MM yy h\")\n", "string = \"11 05 1980 2\"\n", "prog = re.compile(pattern)\n", "result = prog.match(string)\n", "print(bool(result))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }