{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.append(\"..\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from optimus import Optimus" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:root:Found pyspark version \"3.2.0\" installed. The pyspark version 3.2 and above has a built-in \"pandas APIs on Spark\" module ported from Koalas. Try `import pyspark.pandas as ps` instead. \n", "WARNING:root:'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. Koalas will set it for you but it does not work if there is a Spark context already launched.\n", "INFO:optimus:Operative System:Windows\n", "INFO:optimus:Just check that all necessary environments vars are present...\n", "INFO:optimus:-----\n", "INFO:optimus:SPARK_HOME=C:\\opt\\spark\\spark-3.1.1-bin-hadoop3.2\n", "INFO:optimus:HADOOP_HOME=C:\\opt\\hadoop-3.2.2\n", "INFO:optimus:PYSPARK_PYTHON=C:\\Users\\argenisleon\\Anaconda3\\envs\\python38\\python.EXE\n", "INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter\n", "INFO:optimus:PYSPARK_SUBMIT_ARGS=--conf \"spark.sql.catalogImplementation=hive\" pyspark-shell\n", "INFO:optimus:JAVA_HOME=C:\\java\n", "INFO:optimus:Pyarrow Installed\n", "INFO:optimus:-----\n", "INFO:optimus:Starting or getting SparkSession and SparkContext...\n", "INFO:optimus:Spark Version:3.1.1\n", "INFO:optimus:\n", " ____ __ _\n", " / __ \\____ / /_(_)___ ___ __ _______\n", " / / / / __ \\/ __/ / __ `__ \\/ / / / ___/\n", " / /_/ / /_/ / /_/ / / / / / / /_/ (__ )\n", " \\____/ .___/\\__/_/_/ /_/ /_/\\__,_/____/\n", " /_/\n", " \n", "INFO:optimus:Transform and Roll out...\n", "INFO:optimus:Optimus successfully imported. Have fun :).\n" ] } ], "source": [ "op= Optimus(engine=\"spark\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# df = op.load.csv(\"https://raw.githubusercontent.com/hi-primus/optimus/develop-22.1/examples/data/crime.csv\")\n", "df = op.load.csv(\"data/crime.csv\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 319.1 thousand rows / 17 columns
\n", "
8 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
INCIDENT_NUMBER
\n", "
1 (object)
\n", " \n", "
\n", "
OFFENSE_CODE
\n", "
2 (object)
\n", " \n", "
\n", "
OFFENSE_CODE_GROUP
\n", "
3 (object)
\n", " \n", "
\n", "
OFFENSE_DESCRIPTION
\n", "
4 (object)
\n", " \n", "
\n", "
DISTRICT
\n", "
5 (object)
\n", " \n", "
\n", "
REPORTING_AREA
\n", "
6 (object)
\n", " \n", "
\n", "
SHOOTING
\n", "
7 (object)
\n", " \n", "
\n", "
OCCURRED_ON_DATE
\n", "
8 (object)
\n", " \n", "
\n", "
YEAR
\n", "
9 (object)
\n", " \n", "
\n", "
MONTH
\n", "
10 (object)
\n", " \n", "
\n", "
DAY_OF_WEEK
\n", "
11 (object)
\n", " \n", "
\n", "
HOUR
\n", "
12 (object)
\n", " \n", "
\n", "
UCR_PART
\n", "
13 (object)
\n", " \n", "
\n", "
STREET
\n", "
14 (object)
\n", " \n", "
\n", "
Lat
\n", "
15 (object)
\n", " \n", "
\n", "
Long
\n", "
16 (object)
\n", " \n", "
\n", "
Location
\n", "
17 (object)
\n", " \n", "
\n", "
\n", " \n", " I182070945\n", " \n", "
\n", "
\n", "
\n", " \n", " 619\n", " \n", "
\n", "
\n", "
\n", " \n", " LARCENY\n", " \n", "
\n", "
\n", "
\n", " \n", " LARCENY⋅ALL⋅OTHERS\n", " \n", "
\n", "
\n", "
\n", " \n", " D14\n", " \n", "
\n", "
\n", "
\n", " \n", " 808\n", " \n", "
\n", "
\n", "
\n", " \n", " NONE\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-02⋅13:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " SUNDAY\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " PART⋅ONE\n", " \n", "
\n", "
\n", "
\n", " \n", " LINCOLN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.35779134\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.13937053\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.35779134,⋅-71.13937053)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070943\n", " \n", "
\n", "
\n", "
\n", " \n", " 1402\n", " \n", "
\n", "
\n", "
\n", " \n", " VANDALISM\n", " \n", "
\n", "
\n", "
\n", " \n", " VANDALISM\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 347\n", " \n", "
\n", "
\n", "
\n", " \n", " NONE\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-08-21⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " TUESDAY\n", " \n", "
\n", "
\n", "
\n", " \n", " 0\n", " \n", "
\n", "
\n", "
\n", " \n", " PART⋅TWO\n", " \n", "
\n", "
\n", "
\n", " \n", " HECLA⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30682138\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.06030035\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.30682138,⋅-71.06030035)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070941\n", " \n", "
\n", "
\n", "
\n", " \n", " 3410\n", " \n", "
\n", "
\n", "
\n", " \n", " TOWED\n", " \n", "
\n", "
\n", "
\n", " \n", " TOWED⋅MOTOR⋅VEHICLE\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 151\n", " \n", "
\n", "
\n", "
\n", " \n", " NONE\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅19:27:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " MONDAY\n", " \n", "
\n", "
\n", "
\n", " \n", " 19\n", " \n", "
\n", "
\n", "
\n", " \n", " PART⋅THREE\n", " \n", "
\n", "
\n", "
\n", " \n", " CAZENOVE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.34658879\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07242943\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.34658879,⋅-71.07242943)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070940\n", " \n", "
\n", "
\n", "
\n", " \n", " 3114\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 272\n", " \n", "
\n", "
\n", "
\n", " \n", " NONE\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:16:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " MONDAY\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " PART⋅THREE\n", " \n", "
\n", "
\n", "
\n", " \n", " NEWCOMB⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33418175\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07866441\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.33418175,⋅-71.07866441)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070938\n", " \n", "
\n", "
\n", "
\n", " \n", " 3114\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " B3\n", " \n", "
\n", "
\n", "
\n", " \n", " 421\n", " \n", "
\n", "
\n", "
\n", " \n", " NONE\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:05:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " MONDAY\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " PART⋅THREE\n", " \n", "
\n", "
\n", "
\n", " \n", " DELHI⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.27536542\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.09036101\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.27536542,⋅-71.09036101)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070936\n", " \n", "
\n", "
\n", "
\n", " \n", " 3820\n", " \n", "
\n", "
\n", "
\n", " \n", " MOTOR⋅VEHICLE⋅ACCIDENT⋅RESPONSE\n", " \n", "
\n", "
\n", "
\n", " \n", " M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 398\n", " \n", "
\n", "
\n", "
\n", " \n", " NONE\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:09:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " MONDAY\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " PART⋅THREE\n", " \n", "
\n", "
\n", "
\n", " \n", " TALBOT⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29019621\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07159012\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.29019621,⋅-71.07159012)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070933\n", " \n", "
\n", "
\n", "
\n", " \n", " 724\n", " \n", "
\n", "
\n", "
\n", " \n", " AUTO⋅THEFT\n", " \n", "
\n", "
\n", "
\n", " \n", " AUTO⋅THEFT\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 330\n", " \n", "
\n", "
\n", "
\n", " \n", " NONE\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:25:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " MONDAY\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " PART⋅ONE\n", " \n", "
\n", "
\n", "
\n", " \n", " NORMANDY⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30607218\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.0827326\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.30607218,⋅-71.08273260)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070932\n", " \n", "
\n", "
\n", "
\n", " \n", " 3301\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTES\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 584\n", " \n", "
\n", "
\n", "
\n", " \n", " NONE\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:39:37\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " MONDAY\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " PART⋅THREE\n", " \n", "
\n", "
\n", "
\n", " \n", " LAWN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.32701648\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.10555088\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.32701648,⋅-71.10555088)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070931\n", " \n", "
\n", "
\n", "
\n", " \n", " 301\n", " \n", "
\n", "
\n", "
\n", " \n", " ROBBERY\n", " \n", "
\n", "
\n", "
\n", " \n", " ROBBERY⋅-⋅STREET\n", " \n", "
\n", "
\n", "
\n", " \n", " C6\n", " \n", "
\n", "
\n", "
\n", " \n", " 177\n", " \n", "
\n", "
\n", "
\n", " \n", " NONE\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:48:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " MONDAY\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " PART⋅ONE\n", " \n", "
\n", "
\n", "
\n", " \n", " MASSACHUSETTS⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33152148\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07085307\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.33152148,⋅-71.07085307)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070929\n", " \n", "
\n", "
\n", "
\n", " \n", " 3301\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTES\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 364\n", " \n", "
\n", "
\n", "
\n", " \n", " NONE\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:38:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " MONDAY\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " PART⋅THREE\n", " \n", "
\n", "
\n", "
\n", " \n", " LESLIE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29514664\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.05860832\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.29514664,⋅-71.05860832)\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070928\n", " \n", "
\n", "
\n", "
\n", " \n", " 3301\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTES\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " C6\n", " \n", "
\n", "
\n", "
\n", " \n", " 913\n", " \n", "
\n", "
\n", "
\n", " \n", " NONE\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅19:55:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " MONDAY\n", " \n", "
\n", "
\n", "
\n", " \n", " 19\n", " \n", "
\n", "
\n", "
\n", " \n", " PART⋅THREE\n", " \n", "
\n", "
\n", "
\n", " \n", " OCEAN⋅VIEW⋅DR\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.31957856\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.04032766\n", " \n", "
\n", "
\n", "
\n", " \n", " (42.31957856,⋅-71.04032766)\n", " \n", "
\n", "
\n", "
\n", "
\n", "\n", "
Viewing 10 of 319.1 thousand rows / 17 columns
\n", "
8 partition(s) <class 'optimus.engines.spark.dataframe.SparkDataFrame'>
\n", "\n" ], "text/plain": [ "INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART STREET Lat Long Location\n", "(object) (object) (object) (object) (object) (object) (object) (object) (object) (object) (object) (object) (object) (object) (object) (object) (object)\n", "----------------- -------------- ------------------------------- ------------------------------------------ ---------- ---------------- ---------- ------------------- ---------- ---------- ------------- ---------- ---------- ----------------- ---------- ---------- ---------------------------\n", "I182070945 619 LARCENY LARCENY ALL OTHERS D14 808 NONE 2018-09-02 13:00:00 2018 9 SUNDAY 13 PART ONE LINCOLN ST 42.3578 -71.1394 (42.35779134, -71.13937053)\n", "I182070943 1402 VANDALISM VANDALISM C11 347 NONE 2018-08-21 00:00:00 2018 8 TUESDAY 0 PART TWO HECLA ST 42.3068 -71.0603 (42.30682138, -71.06030035)\n", "I182070941 3410 TOWED TOWED MOTOR VEHICLE D4 151 NONE 2018-09-03 19:27:00 2018 9 MONDAY 19 PART THREE CAZENOVE ST 42.3466 -71.0724 (42.34658879, -71.07242943)\n", "I182070940 3114 INVESTIGATE PROPERTY INVESTIGATE PROPERTY D4 272 NONE 2018-09-03 21:16:00 2018 9 MONDAY 21 PART THREE NEWCOMB ST 42.3342 -71.0787 (42.33418175, -71.07866441)\n", "I182070938 3114 INVESTIGATE PROPERTY INVESTIGATE PROPERTY B3 421 NONE 2018-09-03 21:05:00 2018 9 MONDAY 21 PART THREE DELHI ST 42.2754 -71.0904 (42.27536542, -71.09036101)\n", "I182070936 3820 MOTOR VEHICLE ACCIDENT RESPONSE M/V ACCIDENT INVOLVING PEDESTRIAN - INJURY C11 398 NONE 2018-09-03 21:09:00 2018 9 MONDAY 21 PART THREE TALBOT AVE 42.2902 -71.0716 (42.29019621, -71.07159012)\n", "I182070933 724 AUTO THEFT AUTO THEFT B2 330 NONE 2018-09-03 21:25:00 2018 9 MONDAY 21 PART ONE NORMANDY ST 42.3061 -71.0827 (42.30607218, -71.08273260)\n", "I182070932 3301 VERBAL DISPUTES VERBAL DISPUTE B2 584 NONE 2018-09-03 20:39:37 2018 9 MONDAY 20 PART THREE LAWN ST 42.327 -71.1056 (42.32701648, -71.10555088)\n", "I182070931 301 ROBBERY ROBBERY - STREET C6 177 NONE 2018-09-03 20:48:00 2018 9 MONDAY 20 PART ONE MASSACHUSETTS AVE 42.3315 -71.0709 (42.33152148, -71.07085307)\n", "I182070929 3301 VERBAL DISPUTES VERBAL DISPUTE C11 364 NONE 2018-09-03 20:38:00 2018 9 MONDAY 20 PART THREE LESLIE ST 42.2951 -71.0586 (42.29514664, -71.05860832)\n", "I182070928 3301 VERBAL DISPUTES VERBAL DISPUTE C6 913 NONE 2018-09-03 19:55:00 2018 9 MONDAY 19 PART THREE OCEAN VIEW DR 42.3196 -71.0403 (42.31957856, -71.04032766)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.upper()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 619.0\n", "1 1402.0\n", "2 3410.0\n", "3 3114.0\n", "4 3114.0\n", "5 3820.0\n", "6 724.0\n", "7 3301.0\n", "8 301.0\n", "9 3301.0\n", "10 3301.0\n", "11 3114.0\n", "12 3108.0\n", "13 2647.0\n", "14 3201.0\n", "15 3006.0\n", "16 3301.0\n", "17 3305.0\n", "18 2647.0\n", "19 614.0\n", "20 3006.0\n", "21 3801.0\n", "22 3006.0\n", "23 3803.0\n", "24 522.0\n", "25 3831.0\n", "26 3006.0\n", "27 802.0\n", "28 2007.0\n", "29 2900.0\n", "30 2907.0\n", "31 2629.0\n", "32 802.0\n", "33 2662.0\n", "34 3207.0\n", "35 614.0\n", "36 613.0\n", "37 3109.0\n", "38 2612.0\n", "39 1843.0\n", "40 3125.0\n", "41 1841.0\n", "42 301.0\n", "43 1402.0\n", "44 3802.0\n", "45 619.0\n", "46 3801.0\n", "47 1402.0\n", "48 3018.0\n", "49 3301.0\n", "50 3831.0\n", "51 3803.0\n", "52 802.0\n", "53 2405.0\n", "54 617.0\n", "55 3625.0\n", "56 3106.0\n", "57 1402.0\n", "58 3501.0\n", "59 3115.0\n", "60 616.0\n", "61 3807.0\n", "62 301.0\n", "63 1102.0\n", "64 3801.0\n", "65 3006.0\n", "66 3114.0\n", "67 3006.0\n", "68 2629.0\n", "69 3201.0\n", "70 613.0\n", "71 3006.0\n", "72 802.0\n", "73 802.0\n", "74 3115.0\n", "75 3006.0\n", "76 1849.0\n", "77 802.0\n", "78 613.0\n", "79 724.0\n", "80 3831.0\n", "81 3006.0\n", "82 1102.0\n", "83 2647.0\n", "84 3820.0\n", "85 613.0\n", "86 3115.0\n", "87 3301.0\n", "88 3803.0\n", "89 3201.0\n", "90 617.0\n", "91 2905.0\n", "92 3802.0\n", "93 3410.0\n", "94 1102.0\n", "95 2647.0\n", "96 1810.0\n", "97 1842.0\n", "98 613.0\n", "99 3801.0\n", "100 3831.0\n", "101 3006.0\n", "102 3114.0\n", "103 614.0\n", "104 1402.0\n", "105 1806.0\n", "106 1848.0\n", "107 1844.0\n", "108 3831.0\n", "109 3301.0\n", "110 619.0\n", "111 423.0\n", "112 522.0\n", "113 3115.0\n", "114 3006.0\n", "115 3831.0\n", "116 3205.0\n", "117 2647.0\n", "118 413.0\n", "119 1501.0\n", "120 616.0\n", "121 522.0\n", "122 1402.0\n", "123 3001.0\n", "124 3831.0\n", "125 2900.0\n", "126 3410.0\n", "127 2629.0\n", "128 3301.0\n", "129 3115.0\n", "130 301.0\n", "131 3301.0\n", "132 3006.0\n", "133 613.0\n", "134 2900.0\n", "135 3801.0\n", "136 3114.0\n", "137 3410.0\n", "138 3830.0\n", "139 3801.0\n", "140 3301.0\n", "141 3207.0\n", "142 561.0\n", "143 2914.0\n", "144 3006.0\n", "145 423.0\n", "146 802.0\n", "147 2405.0\n", "148 2610.0\n", "149 3006.0\n", "150 619.0\n", "151 3410.0\n", "152 3115.0\n", "153 802.0\n", "154 3301.0\n", "155 1109.0\n", "156 3201.0\n", "157 3115.0\n", "158 3201.0\n", "159 706.0\n", "160 1402.0\n", "161 3115.0\n", "162 1402.0\n", "163 3006.0\n", "164 3301.0\n", "165 614.0\n", "166 3115.0\n", "167 413.0\n", "168 2403.0\n", "169 3801.0\n", "170 3301.0\n", "171 3114.0\n", "172 3115.0\n", "173 3006.0\n", "174 3115.0\n", "175 1402.0\n", "176 301.0\n", "177 423.0\n", "178 3160.0\n", "179 900.0\n", "180 2610.0\n", "181 3301.0\n", "182 3301.0\n", "183 413.0\n", "184 3115.0\n", "185 3125.0\n", "186 3831.0\n", "187 3115.0\n", "188 3114.0\n", "189 3301.0\n", "190 801.0\n", "191 3801.0\n", "192 3301.0\n", "193 3002.0\n", "194 802.0\n", "195 413.0\n", "196 2900.0\n", "197 3410.0\n", "198 802.0\n", "199 3820.0\n", "200 3114.0\n", "201 3301.0\n", "202 802.0\n", "203 802.0\n", "204 802.0\n", "205 619.0\n", "206 520.0\n", "207 2007.0\n", "208 3801.0\n", "209 802.0\n", "210 1402.0\n", "211 613.0\n", "212 802.0\n", "213 619.0\n", "214 3006.0\n", "215 3115.0\n", "216 3114.0\n", "217 3006.0\n", "218 3114.0\n", "219 413.0\n", "220 2405.0\n", "221 2610.0\n", "222 3301.0\n", "223 413.0\n", "224 3006.0\n", "225 3801.0\n", "226 3115.0\n", "227 1402.0\n", "228 2648.0\n", "229 3108.0\n", "230 2905.0\n", "231 3802.0\n", "232 3115.0\n", "233 3803.0\n", "234 3501.0\n", "235 3006.0\n", "236 423.0\n", "237 802.0\n", "238 802.0\n", "239 2647.0\n", "240 3207.0\n", "241 613.0\n", "242 802.0\n", "243 3802.0\n", "244 3803.0\n", "245 3301.0\n", "246 3802.0\n", "247 3301.0\n", "248 1402.0\n", "249 613.0\n", "250 670.0\n", "251 615.0\n", "252 3116.0\n", "253 802.0\n", "254 3802.0\n", "255 3006.0\n", "256 3301.0\n", "257 3201.0\n", "258 1504.0\n", "259 423.0\n", "260 2405.0\n", "261 3006.0\n", "262 2629.0\n", "263 3831.0\n", "264 1830.0\n", "265 3831.0\n", "266 3006.0\n", "267 614.0\n", "268 614.0\n", "269 1402.0\n", "270 2405.0\n", "271 3108.0\n", "272 3820.0\n", "273 361.0\n", "274 3006.0\n", "275 802.0\n", "276 2006.0\n", "277 801.0\n", "278 3831.0\n", "279 3114.0\n", "280 1843.0\n", "281 1849.0\n", "282 1810.0\n", "283 3006.0\n", "284 3831.0\n", "285 3115.0\n", "286 3006.0\n", "287 724.0\n", "288 3831.0\n", "289 616.0\n", "290 1402.0\n", "291 619.0\n", "292 1402.0\n", "293 3410.0\n", "294 802.0\n", "295 616.0\n", "296 3114.0\n", "297 3114.0\n", "298 3114.0\n", "299 311.0\n", "300 3006.0\n", "301 619.0\n", "302 2647.0\n", "303 3301.0\n", "304 3006.0\n", "305 1402.0\n", "306 1402.0\n", "307 301.0\n", "308 3115.0\n", "309 3114.0\n", "310 3831.0\n", "311 3006.0\n", "312 3831.0\n", "313 3201.0\n", "314 613.0\n", "315 3115.0\n", "316 617.0\n", "317 802.0\n", "318 413.0\n", "319 3301.0\n", "320 3410.0\n", "321 3006.0\n", "322 520.0\n", "323 3301.0\n", "324 1001.0\n", "325 2647.0\n", "326 2907.0\n", "327 613.0\n", "328 1830.0\n", "329 3803.0\n", "330 706.0\n", "331 3410.0\n", "332 3006.0\n", "333 3201.0\n", "334 3810.0\n", "335 802.0\n", "336 2647.0\n", "337 2401.0\n", "338 2405.0\n", "339 3501.0\n", "340 2629.0\n", "341 613.0\n", "342 1402.0\n", "343 3501.0\n", "344 3301.0\n", "345 301.0\n", "346 413.0\n", "347 2403.0\n", "348 3301.0\n", "349 413.0\n", "350 3201.0\n", "351 616.0\n", "352 3802.0\n", "353 3831.0\n", "354 2646.0\n", "355 3201.0\n", "356 3115.0\n", "357 3201.0\n", "358 1402.0\n", "359 3001.0\n", "360 613.0\n", "361 3831.0\n", "362 3115.0\n", "363 3802.0\n", "364 3802.0\n", "365 3301.0\n", "366 3821.0\n", "367 619.0\n", "368 3831.0\n", "369 3831.0\n", "370 613.0\n", "371 613.0\n", "372 613.0\n", "373 3811.0\n", "374 2647.0\n", "375 3114.0\n", "376 3201.0\n", "377 3831.0\n", "378 3201.0\n", "379 3006.0\n", "380 3410.0\n", "381 1402.0\n", "382 3115.0\n", "383 3831.0\n", "384 3115.0\n", "385 3410.0\n", "386 361.0\n", "387 3831.0\n", "388 619.0\n", "389 3410.0\n", "390 1402.0\n", "391 619.0\n", "392 1402.0\n", "393 3301.0\n", "394 3006.0\n", "395 3410.0\n", "396 619.0\n", "397 616.0\n", "398 3006.0\n", "399 3001.0\n", "400 3007.0\n", "401 1402.0\n", "402 1402.0\n", "403 3802.0\n", "404 2660.0\n", "405 413.0\n", "406 3115.0\n", "407 619.0\n", "408 802.0\n", "409 3830.0\n", "410 3006.0\n", "411 3114.0\n", "412 301.0\n", "413 3802.0\n", "414 3115.0\n", "415 2647.0\n", "416 3301.0\n", "417 3115.0\n", "418 3114.0\n", "419 3803.0\n", "420 3207.0\n", "421 801.0\n", "422 802.0\n", "423 3114.0\n", "424 3006.0\n", "425 3002.0\n", "426 801.0\n", "427 1402.0\n", "428 3803.0\n", "429 3201.0\n", "430 3006.0\n", "431 3831.0\n", "432 614.0\n", "433 3802.0\n", "434 3115.0\n", "435 1402.0\n", "436 3006.0\n", "437 1402.0\n", "438 423.0\n", "439 3125.0\n", "440 3115.0\n", "441 1831.0\n", "442 3114.0\n", "443 3108.0\n", "444 1402.0\n", "445 706.0\n", "446 3803.0\n", "447 3119.0\n", "448 3820.0\n", "449 3115.0\n", "450 3301.0\n", "451 3108.0\n", "452 3802.0\n", "453 3115.0\n", "454 1402.0\n", "455 3115.0\n", "456 802.0\n", "457 613.0\n", "458 1402.0\n", "459 614.0\n", "460 613.0\n", "461 3006.0\n", "462 619.0\n", "463 724.0\n", "464 3006.0\n", "465 614.0\n", "466 3115.0\n", "467 3831.0\n", "468 802.0\n", "469 2660.0\n", "470 413.0\n", "471 802.0\n", "472 1402.0\n", "473 3170.0\n", "474 3301.0\n", "475 3207.0\n", "476 3006.0\n", "477 2629.0\n", "478 3006.0\n", "479 3301.0\n", "480 802.0\n", "481 615.0\n", "482 1849.0\n", "483 1874.0\n", "484 3006.0\n", "485 2405.0\n", "486 3831.0\n", "487 3112.0\n", "488 3201.0\n", "489 1842.0\n", "490 2610.0\n", "491 802.0\n", "492 2610.0\n", "493 3301.0\n", "494 3115.0\n", "495 3501.0\n", "496 3502.0\n", "497 3831.0\n", "498 3112.0\n", "499 2647.0\n", "500 1102.0\n", "501 617.0\n", "502 3114.0\n", "503 3007.0\n", "504 613.0\n", "505 3115.0\n", "506 2647.0\n", "507 3301.0\n", "508 3301.0\n", "509 3114.0\n", "510 619.0\n", "511 3301.0\n", "512 735.0\n", "513 2905.0\n", "514 2900.0\n", "515 3115.0\n", "516 3301.0\n", "517 1402.0\n", "518 612.0\n", "519 3410.0\n", "520 3115.0\n", "521 560.0\n", "522 3115.0\n", "523 3831.0\n", "524 619.0\n", "525 3006.0\n", "526 2647.0\n", "527 381.0\n", "528 2647.0\n", "529 1849.0\n", "530 613.0\n", "531 3402.0\n", "532 735.0\n", "533 3115.0\n", "534 1402.0\n", "535 724.0\n", "536 3831.0\n", "537 802.0\n", "538 724.0\n", "539 3119.0\n", "540 3115.0\n", "541 3115.0\n", "542 1810.0\n", "543 1843.0\n", "544 1815.0\n", "545 1402.0\n", "546 802.0\n", "547 2647.0\n", "548 614.0\n", "549 613.0\n", "550 3125.0\n", "551 3831.0\n", "552 3125.0\n", "553 3410.0\n", "554 802.0\n", "555 3410.0\n", "556 3802.0\n", "557 3802.0\n", "558 3410.0\n", "559 3115.0\n", "560 3006.0\n", "561 3301.0\n", "562 361.0\n", "563 802.0\n", "564 3810.0\n", "565 2900.0\n", "566 3301.0\n", "567 3802.0\n", "568 2905.0\n", "569 1107.0\n", "570 423.0\n", "571 1402.0\n", "572 3831.0\n", "573 3301.0\n", "574 3207.0\n", "575 3207.0\n", "576 3201.0\n", "577 3114.0\n", "578 3801.0\n", "579 1402.0\n", "580 522.0\n", "581 3410.0\n", "582 2101.0\n", "583 2900.0\n", "584 3410.0\n", "585 3802.0\n", "586 3115.0\n", "587 413.0\n", "588 3831.0\n", "589 2610.0\n", "590 612.0\n", "591 3115.0\n", "592 3301.0\n", "593 3831.0\n", "594 3410.0\n", "595 3831.0\n", "596 3410.0\n", "597 3006.0\n", "598 3115.0\n", "599 802.0\n", "600 3301.0\n", "601 3301.0\n", "602 3114.0\n", "603 802.0\n", "604 3802.0\n", "605 3201.0\n", "606 1106.0\n", "607 3410.0\n", "608 1849.0\n", "609 3831.0\n", "610 3831.0\n", "611 413.0\n", "612 1402.0\n", "613 3831.0\n", "614 3410.0\n", "615 3301.0\n", "616 3006.0\n", "617 617.0\n", "618 3201.0\n", "619 3115.0\n", "620 3201.0\n", "621 3112.0\n", "622 3301.0\n", "623 3301.0\n", "624 3410.0\n", "625 3802.0\n", "626 724.0\n", "627 1402.0\n", "628 614.0\n", "629 2914.0\n", "630 3831.0\n", "631 3410.0\n", "632 1402.0\n", "633 3802.0\n", "634 3831.0\n", "635 613.0\n", "636 619.0\n", "637 615.0\n", "638 3001.0\n", "639 3410.0\n", "640 3006.0\n", "641 3410.0\n", "642 3202.0\n", "643 3006.0\n", "644 3410.0\n", "645 1402.0\n", "646 3831.0\n", "647 1402.0\n", "648 3115.0\n", "649 3410.0\n", "650 3410.0\n", "651 3410.0\n", "652 724.0\n", "653 3301.0\n", "654 2007.0\n", "655 3115.0\n", "656 614.0\n", "657 3410.0\n", "658 3301.0\n", "659 619.0\n", "660 3410.0\n", "661 724.0\n", "662 3410.0\n", "663 3114.0\n", "664 3114.0\n", "665 3410.0\n", "666 3001.0\n", "667 3006.0\n", "668 3115.0\n", "669 2610.0\n", "670 724.0\n", "671 3410.0\n", "672 522.0\n", "673 3006.0\n", "674 3006.0\n", "675 413.0\n", "676 3114.0\n", "677 3006.0\n", "678 3006.0\n", "679 1402.0\n", "680 2629.0\n", "681 2900.0\n", "682 413.0\n", "683 1503.0\n", "684 3006.0\n", "685 619.0\n", "686 3114.0\n", "687 3301.0\n", "688 3115.0\n", "689 301.0\n", "690 413.0\n", "691 3115.0\n", "692 802.0\n", "693 3802.0\n", "694 361.0\n", "695 802.0\n", "696 3114.0\n", "697 2403.0\n", "698 619.0\n", "699 3831.0\n", "700 3114.0\n", "701 3006.0\n", "702 3301.0\n", "703 2900.0\n", "704 1402.0\n", "705 3802.0\n", "706 3831.0\n", "707 2662.0\n", "708 3006.0\n", "709 3115.0\n", "710 1846.0\n", "711 1849.0\n", "712 1841.0\n", "713 3410.0\n", "714 3001.0\n", "715 3006.0\n", "716 802.0\n", "717 2629.0\n", "718 619.0\n", "719 522.0\n", "720 520.0\n", "721 3301.0\n", "722 1402.0\n", "723 1402.0\n", "724 3502.0\n", "725 802.0\n", "726 802.0\n", "727 2647.0\n", "728 3119.0\n", "729 3301.0\n", "730 3202.0\n", "731 2629.0\n", "732 2905.0\n", "733 3002.0\n", "734 3301.0\n", "735 613.0\n", "736 619.0\n", "737 3016.0\n", "738 3006.0\n", "739 619.0\n", "740 3831.0\n", "741 3115.0\n", "742 3115.0\n", "743 3114.0\n", "744 3114.0\n", "745 3201.0\n", "746 3304.0\n", "747 611.0\n", "748 3803.0\n", "749 2629.0\n", "750 413.0\n", "751 3006.0\n", "752 2900.0\n", "753 802.0\n", "754 2914.0\n", "755 3115.0\n", "756 3831.0\n", "757 560.0\n", "758 3501.0\n", "759 802.0\n", "760 619.0\n", "761 3410.0\n", "762 1849.0\n", "763 2647.0\n", "764 3115.0\n", "765 3831.0\n", "766 423.0\n", "767 1402.0\n", "768 2647.0\n", "769 3802.0\n", "770 361.0\n", "771 2622.0\n", "772 423.0\n", "773 3006.0\n", "774 3114.0\n", "775 3201.0\n", "776 3802.0\n", "777 2662.0\n", "778 3831.0\n", "779 2610.0\n", "780 1102.0\n", "781 2646.0\n", "782 1849.0\n", "783 3130.0\n", "784 3006.0\n", "785 1402.0\n", "786 3301.0\n", "787 801.0\n", "788 1503.0\n", "789 2407.0\n", "790 2657.0\n", "791 3801.0\n", "792 3831.0\n", "793 3801.0\n", "794 3831.0\n", "795 1849.0\n", "796 3112.0\n", "797 2610.0\n", "798 3115.0\n", "799 3114.0\n", "800 3115.0\n", "801 3115.0\n", "802 1402.0\n", "803 617.0\n", "804 413.0\n", "805 1849.0\n", "806 3115.0\n", "807 3831.0\n", "808 3831.0\n", "809 3810.0\n", "810 1402.0\n", "811 1831.0\n", "812 3006.0\n", "813 619.0\n", "814 3006.0\n", "815 3111.0\n", "816 3115.0\n", "817 3201.0\n", "818 3831.0\n", "819 3801.0\n", "820 1810.0\n", "821 1849.0\n", "822 619.0\n", "823 3115.0\n", "824 617.0\n", "825 3006.0\n", "826 2647.0\n", "827 2900.0\n", "828 3301.0\n", "829 2647.0\n", "830 613.0\n", "831 3410.0\n", "832 706.0\n", "833 802.0\n", "834 3006.0\n", "835 3006.0\n", "836 614.0\n", "837 3802.0\n", "838 3006.0\n", "839 619.0\n", "840 3503.0\n", "841 2900.0\n", "842 3201.0\n", "843 616.0\n", "844 3115.0\n", "845 1106.0\n", "846 614.0\n", "847 3410.0\n", "848 3831.0\n", "849 3201.0\n", "850 1102.0\n", "851 3201.0\n", "852 617.0\n", "853 3006.0\n", "854 3115.0\n", "855 1106.0\n", "856 802.0\n", "857 3119.0\n", "858 3207.0\n", "859 3115.0\n", "860 706.0\n", "861 3301.0\n", "862 3831.0\n", "863 311.0\n", "864 1106.0\n", "865 3410.0\n", "866 2906.0\n", "867 2629.0\n", "868 413.0\n", "869 802.0\n", "870 3114.0\n", "871 2905.0\n", "872 2647.0\n", "873 1109.0\n", "874 802.0\n", "875 2647.0\n", "876 3301.0\n", "877 724.0\n", "878 2647.0\n", "879 3002.0\n", "880 3112.0\n", "881 619.0\n", "882 614.0\n", "883 3115.0\n", "884 614.0\n", "885 3006.0\n", "886 3006.0\n", "887 3410.0\n", "888 802.0\n", "889 614.0\n", "890 1402.0\n", "891 3112.0\n", "892 423.0\n", "893 3006.0\n", "894 3820.0\n", "895 3301.0\n", "896 3114.0\n", "897 616.0\n", "898 619.0\n", "899 3201.0\n", "900 613.0\n", "901 3831.0\n", "902 2647.0\n", "903 3207.0\n", "904 3801.0\n", "905 3410.0\n", "906 3301.0\n", "907 3802.0\n", "908 3803.0\n", "909 3006.0\n", "910 3115.0\n", "911 619.0\n", "912 3301.0\n", "913 3831.0\n", "914 1106.0\n", "915 3114.0\n", "916 3114.0\n", "917 1402.0\n", "918 724.0\n", "919 3201.0\n", "920 1402.0\n", "921 706.0\n", "922 616.0\n", "923 3201.0\n", "924 3301.0\n", "925 3410.0\n", "926 1402.0\n", "927 3115.0\n", "928 3410.0\n", "929 3125.0\n", "930 802.0\n", "931 1402.0\n", "932 3301.0\n", "933 3125.0\n", "934 2647.0\n", "935 619.0\n", "936 2900.0\n", "937 3802.0\n", "938 724.0\n", "939 3802.0\n", "940 3115.0\n", "941 2646.0\n", "942 2646.0\n", "943 3125.0\n", "944 3201.0\n", "945 3006.0\n", "946 3125.0\n", "947 3114.0\n", "948 2405.0\n", "949 2647.0\n", "950 2647.0\n", "951 1402.0\n", "952 3201.0\n", "953 3831.0\n", "954 706.0\n", "955 3831.0\n", "956 3410.0\n", "957 1402.0\n", "958 2646.0\n", "959 3410.0\n", "960 619.0\n", "961 3115.0\n", "962 3115.0\n", "963 3410.0\n", "964 3115.0\n", "965 3301.0\n", "966 3820.0\n", "967 3831.0\n", "968 802.0\n", "969 1402.0\n", "970 3410.0\n", "971 1402.0\n", "972 3001.0\n", "973 3410.0\n", "974 613.0\n", "975 3803.0\n", "976 3802.0\n", "977 3109.0\n", "978 614.0\n", "979 3006.0\n", "980 540.0\n", "981 1402.0\n", "982 3006.0\n", "983 413.0\n", "984 1402.0\n", "985 3006.0\n", "986 3006.0\n", "987 3501.0\n", "988 613.0\n", "989 1402.0\n", "990 3410.0\n", "991 3006.0\n", "992 802.0\n", "993 1402.0\n", "994 301.0\n", "995 413.0\n", "996 3115.0\n", "997 3410.0\n", "998 2906.0\n", "999 3831.0\n", "Name: OFFENSE_CODE, dtype: float64\n", "Showing only the first 1000" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.data[\"OFFENSE_CODE\"].astype(\"float\")" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'DataFrame' object has no attribute 'cols'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m~\\Anaconda3\\envs\\python38\\lib\\site-packages\\databricks\\koalas\\frame.py\u001b[0m in \u001b[0;36m__getattr__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 11826\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m> 11827\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11828\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\envs\\python38\\lib\\site-packages\\databricks\\koalas\\indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 456\u001b[0m \u001b[0mseries_name\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 457\u001b[1;33m ) = self._select_cols(cols_sel)\n\u001b[0m\u001b[0;32m 458\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\envs\\python38\\lib\\site-packages\\databricks\\koalas\\indexing.py\u001b[0m in \u001b[0;36m_select_cols\u001b[1;34m(self, cols_sel, missing_keys)\u001b[0m\n\u001b[0;32m 324\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 325\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_select_cols_else\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcols_sel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmissing_keys\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 326\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\envs\\python38\\lib\\site-packages\\databricks\\koalas\\indexing.py\u001b[0m in \u001b[0;36m_select_cols_else\u001b[1;34m(self, cols_sel, missing_keys)\u001b[0m\n\u001b[0;32m 1305\u001b[0m \u001b[0mcols_sel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mcols_sel\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1306\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_from_multiindex_column\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcols_sel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmissing_keys\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1307\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\envs\\python38\\lib\\site-packages\\databricks\\koalas\\indexing.py\u001b[0m in \u001b[0;36m_get_from_multiindex_column\u001b[1;34m(self, key, missing_keys, labels, recursed)\u001b[0m\n\u001b[0;32m 1145\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mmissing_keys\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1146\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mk\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1147\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mKeyError\u001b[0m: 'cols'", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mC:\\Users\\ARGENI~1\\AppData\\Local\\Temp/ipykernel_2660/4205330054.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mremove_special_chars\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"OFFENSE_CODE\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32m~\\Documents\\Optimus\\examples\\..\\optimus\\engines\\base\\columns.py\u001b[0m in \u001b[0;36mremove_special_chars\u001b[1;34m(self, cols, output_cols)\u001b[0m\n\u001b[0;32m 2418\u001b[0m \"\"\"\n\u001b[0;32m 2419\u001b[0m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mroot\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2420\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0ms\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0ms\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mstring\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpunctuation\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"chars\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_cols\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0moutput_cols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2421\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2422\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mto_datetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"*\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_cols\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtransform_format\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m->\u001b[0m \u001b[1;34m'DataFrameType'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Documents\\Optimus\\examples\\..\\optimus\\engines\\spark\\columns.py\u001b[0m in \u001b[0;36mreplace\u001b[1;34m(self, input_cols, search, replace_by, search_by, output_cols)\u001b[0m\n\u001b[0;32m 658\u001b[0m \u001b[0mfilter_dtype\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconstants\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mNUMERIC_TYPES\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 659\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 660\u001b[1;33m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mprepare_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput_cols\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_cols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 661\u001b[0m \u001b[1;31m# columns = prepare_columns(df.data, input_cols, output_cols, filter_by_column_types=filter_dtype)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 662\u001b[0m \u001b[0mdfd\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Documents\\Optimus\\examples\\..\\optimus\\helpers\\columns.py\u001b[0m in \u001b[0;36mprepare_columns\u001b[1;34m(df, cols, output_cols, is_regex, filter_by_column_types, accepts_missing_cols, invert, default, cols_dict, auto_increment, args)\u001b[0m\n\u001b[0;32m 260\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mzip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mcols_dict\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 261\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 262\u001b[1;33m cols = parse_columns(df, cols, is_regex, filter_by_column_types,\n\u001b[0m\u001b[0;32m 263\u001b[0m accepts_missing_cols, invert)\n\u001b[0;32m 264\u001b[0m \u001b[0mmerge\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Documents\\Optimus\\examples\\..\\optimus\\helpers\\columns.py\u001b[0m in \u001b[0;36mparse_columns\u001b[1;34m(df, cols_args, is_regex, filter_by_column_types, accepts_missing_cols, invert)\u001b[0m\n\u001b[0;32m 149\u001b[0m \u001b[1;31m# if columns value is * get all dataframes columns\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 150\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 151\u001b[1;33m \u001b[0mdf_columns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_names\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 152\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 153\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mis_regex\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\envs\\python38\\lib\\site-packages\\databricks\\koalas\\frame.py\u001b[0m in \u001b[0;36m__getattr__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 11827\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11828\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m> 11829\u001b[1;33m raise AttributeError(\n\u001b[0m\u001b[0;32m 11830\u001b[0m \u001b[1;34m\"'%s' object has no attribute '%s'\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11831\u001b[0m )\n", "\u001b[1;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'cols'" ] } ], "source": [ "df.cols.remove_special_chars(\"OFFENSE_CODE\")" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 73 ms\n" ] }, { "data": { "text/plain": [ "SICK/INJURED/MEDICAL - PERSON 18783\n", "INVESTIGATE PERSON 18754\n", "M/V - LEAVING SCENE - PROPERTY DAMAGE 16323\n", "VANDALISM 15154\n", "ASSAULT SIMPLE - BATTERY 14791\n", "VERBAL DISPUTE 13099\n", "TOWED MOTOR VEHICLE 11287\n", "INVESTIGATE PROPERTY 11124\n", "LARCENY THEFT FROM BUILDING 9069\n", "THREATS TO DO BODILY HARM 9042\n", "LARCENY THEFT FROM MV - NON-ACCESSORY 8893\n", "PROPERTY - LOST 8893\n", "WARRANT ARREST 8407\n", "LARCENY SHOPLIFTING 7949\n", "M/V ACCIDENT - PROPERTY DAMAGE 6557\n", "LARCENY ALL OTHERS 5963\n", "M/V ACCIDENT - PERSONAL INJURY 5131\n", "ASSAULT - AGGRAVATED - BATTERY 4886\n", "FRAUD - FALSE PRETENSE / SCHEME 4413\n", "MISSING PERSON - LOCATED 4365\n", "HARASSMENT 4007\n", "MISSING PERSON 3766\n", "PROPERTY - FOUND 3698\n", "AUTO THEFT 3630\n", "TRESPASSING 3254\n", "FRAUD - CREDIT CARD / ATM FRAUD 3147\n", "ROBBERY - STREET 3056\n", "M/V ACCIDENT - OTHER 2925\n", "ASSAULT - AGGRAVATED 2910\n", "VAL - VIOLATION OF AUTO LAW - OTHER 2894\n", "VAL - OPERATING AFTER REV/SUSP. 2618\n", "DRUGS - POSS CLASS B - COCAINE, ETC. 2591\n", "BURGLARY - RESIDENTIAL - FORCE 2585\n", "SERVICE TO OTHER PD INSIDE OF MA. 2559\n", "BURGLARY - RESIDENTIAL - NO FORCE 2358\n", "LARCENY THEFT OF BICYCLE 2265\n", "DRUGS - POSS CLASS B - INTENT TO MFR DIST DISP 2117\n", "DRUGS - SALE / MANUFACTURING 1991\n", "VAL - OPERATING WITHOUT LICENSE 1963\n", "LARCENY THEFT OF MV PARTS & ACCESSORIES 1951\n", "LICENSE PREMISE VIOLATION 1701\n", "M/V ACCIDENT INVOLVING PEDESTRIAN - INJURY 1656\n", "DRUGS - POSS CLASS A - HEROIN, ETC. 1514\n", "FORGERY / COUNTERFEITING 1451\n", "SICK/INJURED/MEDICAL - POLICE 1405\n", "DISORDERLY CONDUCT 1397\n", "SUDDEN DEATH 1347\n", "VIOL. OF RESTRAINING ORDER W NO ARREST 1338\n", "DRUGS - POSS CLASS A - INTENT TO MFR DIST DISP 1329\n", "M/V - LEAVING SCENE - PERSONAL INJURY 1286\n", "DRUGS - OTHER 1274\n", "FIRE REPORT - HOUSE, BUILDING, ETC. 1269\n", "DEATH INVESTIGATION 1215\n", "DRUGS - SICK ASSIST - HEROIN 1205\n", "FRAUD - IMPERSONATION 1065\n", "VAL - OPERATING UNREG/UNINS �CAR 1051\n", "RECOVERED - MV RECOVERED IN BOSTON (STOLEN OUTSIDE BOSTON) 1050\n", "STOLEN PROPERTY - BUYING / RECEIVING / POSSESSING 1001\n", "BALLISTICS EVIDENCE/FOUND 981\n", "LANDLORD - TENANT SERVICE 968\n", "SEARCH WARRANT 966\n", "ASSAULT - SIMPLE 962\n", "M/V ACCIDENT - INVOLVING �BICYCLE - INJURY 916\n", "BURGLARY - COMMERICAL - FORCE 912\n", "PROPERTY - ACCIDENTAL DAMAGE 912\n", "WEAPON - FIREARM - CARRYING / POSSESSING, ETC 899\n", "DRUGS - POSS CLASS D 897\n", "DRUGS - POSS CLASS D - INTENT TO MFR DIST DISP 879\n", "PROPERTY - MISSING 858\n", "DISTURBING THE PEACE 837\n", "AUTO THEFT - MOTORCYCLE / SCOOTER 815\n", "LIQUOR - DRINKING IN PUBLIC 796\n", "M/V ACCIDENT - POLICE VEHICLE 766\n", "M/V ACCIDENT - OTHER CITY VEHICLE 732\n", "FIREARM/WEAPON - FOUND OR CONFISCATED 688\n", "BURGLARY - RESIDENTIAL - ATTEMPT 661\n", "OTHER OFFENSE 644\n", "ROBBERY - OTHER 635\n", "WEAPON - OTHER - CARRYING / POSSESSING, ETC 623\n", "NOISY PARTY/RADIO-NO ARREST 620\n", "MISSING PERSON - NOT REPORTED - LOCATED 593\n", "DRUGS - POSS CLASS E 586\n", "ROBBERY - COMMERCIAL 554\n", "FIRE REPORT - CAR, BRUSH, ETC. 525\n", "M/V PLATES - LOST 493\n", "OPERATING UNDER THE INFLUENCE ALCOHOL 484\n", "VAL - OPERATING W/O AUTHORIZATION LAWFUL 463\n", "DRUGS - POSS CLASS C 458\n", "LARCENY PICK-POCKET 455\n", "PROPERTY - STOLEN THEN RECOVERED 452\n", "M/V ACCIDENT - INVOLVING BICYCLE - NO INJURY 426\n", "M/V ACCIDENT - INVOLVING PEDESTRIAN - NO INJURY 414\n", "EVADING FARE 407\n", "AUTO THEFT - LEASED/RENTED VEHICLE 402\n", "VIOLATION - CITY ORDINANCE 370\n", "ANIMAL CONTROL - DOG BITES - ETC. 367\n", "SUICIDE / SUICIDE ATTEMPT 356\n", "DRUGS - SICK ASSIST - OTHER HARMFUL DRUG 337\n", "FRAUD - WIRE 334\n", "DRUGS - SICK ASSIST - OTHER NARCOTIC 311\n", "INTIMIDATING WITNESS 309\n", "BURGLARY - COMMERICAL - NO FORCE 308\n", "DANGEROUS OR HAZARDOUS CONDITION 307\n", "EMBEZZLEMENT 296\n", "ANIMAL INCIDENTS 285\n", "DEMONSTRATIONS/RIOT 279\n", "VIOL. OF RESTRAINING ORDER W ARREST 268\n", "GRAFFITI 261\n", "CHILD ENDANGERMENT 256\n", "LIQUOR LAW VIOLATION 256\n", "BURGLARY - OTHER - FORCE 249\n", "AFFRAY 249\n", "WEAPON - FIREARM - OTHER VIOLATION 238\n", "PROPERTY - LOST THEN LOCATED 227\n", "POSSESSION OF BURGLARIOUS TOOLS 227\n", "FUGITIVE FROM JUSTICE 220\n", "HARBOR INCIDENT / VIOLATION 212\n", "DRUGS - POSS CLASS C - INTENT TO MFR DIST DISP 205\n", "DRUGS - POSSESSION 195\n", "SERVICE TO OTHER PD OUTSIDE OF MA. 195\n", "INVESTIGATION FOR ANOTHER AGENCY 194\n", "EXTORTION OR BLACKMAIL 188\n", "LARCENY PURSE SNATCH - NO FORCE 188\n", "CHILD ENDANGERMENT (NO ASSAULT) 185\n", "ROBBERY - BANK 181\n", "DRUGS - CLASS B TRAFFICKING OVER 18 GRAMS 177\n", "BURGLARY - OTHER - NO FORCE 171\n", "REPORT AFFECTING OTHER DEPTS. 166\n", "PROSTITUTION - SOLICITING 161\n", "MURDER, NON-NEGLIGIENT MANSLAUGHTER 161\n", "DRUGS - POSS CLASS E - INTENT TO MFR DIST DISP 154\n", "WEAPON - OTHER - OTHER VIOLATION 147\n", "DRUGS - CLASS A TRAFFICKING OVER 18 GRAMS 132\n", "CRIMINAL HARASSMENT 131\n", "FIRE REPORT/ALARM - FALSE 126\n", "ANNOYING AND ACCOSTING 126\n", "BURGLARY - COMMERICAL - ATTEMPT 112\n", "ROBBERY - HOME INVASION 104\n", "ARSON 94\n", "OPERATING UNDER THE INFLUENCE DRUGS 90\n", "ROBBERY - CAR JACKING 86\n", "DRUGS - CONSP TO VIOL CONTROLLED SUBSTANCE 86\n", "OBSCENE MATERIALS - PORNOGRAPHY 80\n", "HOME INVASION 77\n", "DRUGS - POSSESSION OF DRUG PARAPHANALIA 75\n", "BOMB THREAT 75\n", "RECOVERED STOLEN PLATE 67\n", "ANIMAL ABUSE 64\n", "STALKING 62\n", "CHILD ABANDONMENT (NO ASSAULT) 59\n", "INJURY BICYCLE NO M/V INVOLVED 57\n", "VIOLATION - CITY ORDINANCE CONSTRUCTION PERMIT 51\n", "KIDNAPPING/CUSTODIAL KIDNAPPING 48\n", "KIDNAPPING - ENTICING OR ATTEMPTED 44\n", "BURGLARY - OTHER - ATTEMPT 41\n", "CHINS 36\n", "AIRCRAFT INCIDENTS 36\n", "OBSCENE PHONE CALLS 31\n", "TRUANCY / RUNAWAY 31\n", "NOISY PARTY/RADIO-ARREST 29\n", "PROSTITUTION 29\n", "PROTECTIVE CUSTODY / SAFEKEEPING 27\n", "GATHERING CAUSING ANNOYANCE 27\n", "PRISONER - SUICIDE / SUICIDE ATTEMPT 23\n", "CONSPIRACY EXCEPT DRUG LAW 19\n", "LARCENY THEFT FROM COIN-OP MACHINE 16\n", "FRAUD - WELFARE 15\n", "VIOLATION - HAWKER AND PEDDLER 15\n", "EXPLOSIVES - TURNED IN OR FOUND 14\n", "EXPLOSIVES - POSSESSION OR USE 13\n", "WEAPON - FIREARM - SALE / TRAFFICKING 13\n", "ABDUCTION - INTICING 12\n", "PROSTITUTION - COMMON NIGHTWALKER 12\n", "PROPERTY - CONCEALING LEASED 10\n", "FIREARM/WEAPON - LOST 10\n", "FIREARM/WEAPON - ACCIDENTAL INJURY / DEATH 10\n", "PRISONER ESCAPE / ESCAPE & RECAPTURE 9\n", "CUSTODIAL KIDNAPPING 9\n", "CONTRIBUTING TO DELINQUENCY OF MINOR 8\n", "GAMBLING - BETTING / WAGERING 8\n", "ASSAULT & BATTERY 8\n", "HUMAN TRAFFICKING - COMMERCIAL SEX ACTS 7\n", "LARCENY OTHER $200 & OVER 7\n", "DRUGS - CLASS D TRAFFICKING OVER 50 GRAMS 6\n", "ASSAULT & BATTERY D/W - OTHER 6\n", "LARCENY IN A BUILDING $200 & OVER 5\n", "MANSLAUGHTER - VEHICLE - NEGLIGENCE 5\n", "DRUGS - POSS CLASS A - HEROIN, ETC. 5\n", "PROSTITUTION - ASSISTING OR PROMOTING 4\n", "LARCENY SHOPLIFTING UNDER $50 4\n", "LARCENY SHOPLIFTING $200 & OVER 4\n", "LARCENY BICYCLE $200 & OVER 3\n", "ROBBERY - UNARMED - STREET 3\n", "A&B ON POLICE OFFICER 3\n", "FIREARM/WEAPON - POSSESSION OF DANGEROUS 3\n", "B&E NON-RESIDENCE DAY - NO FORCE 2\n", "B&E NON-RESIDENCE DAY - FORCIBLE 2\n", "PROPERTY - RECEIVING STOLEN 2\n", "LARCENY SHOPLIFTING $50 TO $199 2\n", "FORGERY OR UTTERING 2\n", "DRUGS - POSS CLASS E INTENT TO MF DIST DISP 2\n", "MANSLAUGHTER - NON-VEHICLE - NEGLIGENCE 2\n", "LARCENY OTHER $50 TO $199 2\n", "AUTO THEFT - RECOVERED IN BY POLICE 2\n", "BIOLOGICAL THREATS 2\n", "HUMAN TRAFFICKING - INVOLUNTARY SERVITUDE 2\n", "DRUGS - POSS CLASS D - MARIJUANA, ETC. 1\n", "ROBBERY - KNIFE - STREET 1\n", "B&E NON-RESIDENCE DAY - NO PROP TAKEN 1\n", "COUNTERFEITING 1\n", "LARCENY OTHER UNDER $50 1\n", "AUTO THEFT LEASE/RENT VEHICLE 1\n", "LARCENY IN A BUILDING UNDER $50 1\n", "A&B HANDS, FEET, ETC. - MED. ATTENTION REQ. 1\n", "ROBBERY ATTEMPT - KNIFE - BANK 1\n", "ROBBERY - UNARMED - CHAIN STORE 1\n", "AUTO THEFT - OUTSIDE - RECOVERED IN BOSTON 1\n", "KILLING OF FELON BY POLICE 1\n", "ROBBERY - UNARMED - BUSINESS 1\n", "PROSTITUTE - COMMON NIGHTWALKER 1\n", "VIOLATION - RESTRAINING ORDER 1\n", "ASSAULT D/W - KNIFE ON POLICE OFFICER 1\n", "B&E NON-RESIDENCE NIGHT - FORCE 1\n", "AUTO THEFT OTHER 1\n", "B&E RESIDENCE NIGHT - ATTEMPT FORCE 1\n", "DISORDERLY PERSON 1\n", "DRUGS - GLUE INHALATION 1\n", "FRAUD - FALSE PRETENSE 1\n", "CHILD ABUSE 1\n", "LARCENY NON-ACCESSORY FROM VEH. $200 & OVER 1\n", "LARCENY IN A BUILDING $50 TO $199 1\n", "B&E RESIDENCE DAY - NO PROP TAKEN 1\n", "LARCENY NON-ACCESSORY FROM VEH. $50 TO $199 1\n", "ASSAULT & BATTERY D/W - OTHER ON POLICE OFFICER 1\n", "DRUGS - POSS CLASS D - INTENT MFR DIST DISP 1\n", "ROBBERY - UNARMED - RESIDENCE 1\n", "ASSAULT & BATTERY D/W - KNIFE 1\n", "B&E RESIDENCE DAY - NO FORCE 1\n", "FIREARM/WEAPON - CARRY - SELL - RENT 1\n", "FRAUDS - ALL OTHER 1\n", "ASSAULT D/W - OTHER 1\n", "LARCENY NON-ACCESSORY FROM VEH. UNDER $50 1\n", "ANNOYING AND ACCOSTIN 1\n", "PRISONER ATTEMPT TO RESCUE 1\n", "Name: OFFENSE_DESCRIPTION, dtype: int64" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "df.data[\"OFFENSE_DESCRIPTION\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 13.1 s\n" ] }, { "data": { "text/plain": [ "{'frequency': {'OFFENSE_DESCRIPTION': {'values': [{'value': 'SICK/INJURED/MEDICAL - PERSON',\n", " 'count': 18783},\n", " {'value': 'INVESTIGATE PERSON', 'count': 18754},\n", " {'value': 'M/V - LEAVING SCENE - PROPERTY DAMAGE', 'count': 16323},\n", " {'value': 'VANDALISM', 'count': 15154},\n", " {'value': 'ASSAULT SIMPLE - BATTERY', 'count': 14791},\n", " {'value': 'VERBAL DISPUTE', 'count': 13099},\n", " {'value': 'TOWED MOTOR VEHICLE', 'count': 11287},\n", " {'value': 'INVESTIGATE PROPERTY', 'count': 11124},\n", " {'value': 'LARCENY THEFT FROM BUILDING', 'count': 9069},\n", " {'value': 'THREATS TO DO BODILY HARM', 'count': 9042},\n", " {'value': 'LARCENY THEFT FROM MV - NON-ACCESSORY', 'count': 8893},\n", " {'value': 'PROPERTY - LOST', 'count': 8893},\n", " {'value': 'WARRANT ARREST', 'count': 8407},\n", " {'value': 'LARCENY SHOPLIFTING', 'count': 7949},\n", " {'value': 'M/V ACCIDENT - PROPERTY DAMAGE', 'count': 6557},\n", " {'value': 'LARCENY ALL OTHERS', 'count': 5963},\n", " {'value': 'M/V ACCIDENT - PERSONAL INJURY', 'count': 5131},\n", " {'value': 'ASSAULT - AGGRAVATED - BATTERY', 'count': 4886},\n", " {'value': 'FRAUD - FALSE PRETENSE / SCHEME', 'count': 4413},\n", " {'value': 'MISSING PERSON - LOCATED', 'count': 4365},\n", " {'value': 'HARASSMENT', 'count': 4007},\n", " {'value': 'MISSING PERSON', 'count': 3766},\n", " {'value': 'PROPERTY - FOUND', 'count': 3698},\n", " {'value': 'AUTO THEFT', 'count': 3630},\n", " {'value': 'TRESPASSING', 'count': 3254},\n", " {'value': 'FRAUD - CREDIT CARD / ATM FRAUD', 'count': 3147},\n", " {'value': 'ROBBERY - STREET', 'count': 3056},\n", " {'value': 'M/V ACCIDENT - OTHER', 'count': 2925},\n", " {'value': 'ASSAULT - AGGRAVATED', 'count': 2910},\n", " {'value': 'VAL - VIOLATION OF AUTO LAW - OTHER', 'count': 2894},\n", " {'value': 'VAL - OPERATING AFTER REV/SUSP.', 'count': 2618},\n", " {'value': 'DRUGS - POSS CLASS B - COCAINE, ETC.', 'count': 2591}]}}}" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "df.cols.frequency(\"OFFENSE_DESCRIPTION\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.cols." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from pyspark.sql.types import *\n", "from datetime import date, datetime\n", "\n", "cols = [\n", " (\"names\", \"str\"),\n", " (\"height(ft)\", ShortType()),\n", " (\"function\", \"str\"),\n", " (\"rank\", ByteType()),\n", " (\"age\", \"int\"),\n", " (\"weight(t)\", \"float\"),\n", " \"japanese name\",\n", " \"last position seen\",\n", " \"date arrival\",\n", " \"last date seen\",\n", " (\"attributes\", ArrayType(FloatType())),\n", " (\"DateType\", DateType()),\n", " (\"timestamp\", TimestampType()),\n", " (\"Cybertronian\", BooleanType()),\n", " (\"function(binary)\", BinaryType()),\n", " (\"NullType\", NullType())\n", "\n", " ]\n", "\n", "rows = [\n", " (\"argenisleon@gmail.com\", 28, \"Leader\", 10, 5000000, 4.30, [\"Inochi\", \"Convoy\"], \"19.442735,-99.201111\", \"1980/04/10\",\n", " \"2016/09/10\", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray(\"Leader\", \"utf-8\"),\n", " None),\n", " (\"bumbl#ebéé \", 17, \"Espionage\", 7, 5000000, 2.0, [\"Bumble\", \"Goldback\"], \"10.642707,-71.612534\", \"1980/04/10\",\n", " \"2015/08/10\", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray(\"Espionage\", \"utf-8\"),\n", " None),\n", " (\"ironhide&\", 26, \"Security\", 7, 5000000, 4.0, [\"Roadbuster\"], \"37.789563,-122.400356\", \"1980/04/10\",\n", " \"2014/07/10\", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray(\"Security\", \"utf-8\"),\n", " None),\n", " (\"1 Megatron\", 13, \"First Lieutenant\", 8, 5000000, 1.80, [\"Meister\"], \"33.670666,-117.841553\", \"1980/04/10\",\n", " \"2013/06/10\", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,\n", " bytearray(\"First Lieutenant\", \"utf-8\"), None),\n", " (\"1 Megatron\", None, \"None\", 10, 5000000, 5.70, [\"Megatron\"], None, \"1980/04/10\", \"2012/05/10\", [None, 5700.0],\n", " date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray(\"None\", \"utf-8\"), None),\n", " (None, 300, \"Battle Station\", 8, 5000000, None, [\"Metroflex\"], None, \"1980/04/10\", \"2011/04/10\",\n", " [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray(\"Battle Station\", \"utf-8\"), None),\n", "\n", " ]\n", "df = op.create.df(cols ,rows, False).cache().repartition(1)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 6 of 6 rows / 16 columns
\n", "
1 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
names
\n", "
1 (string)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
height(ft)
\n", "
2 (smallint)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
function
\n", "
3 (string)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
rank
\n", "
4 (tinyint)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
age
\n", "
5 (int)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
weight(t)
\n", "
6 (float)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
japanese name
\n", "
7 (string)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
last position seen
\n", "
8 (string)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
date arrival
\n", "
9 (string)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
last date seen
\n", "
10 (string)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
attributes
\n", "
11 (array<float>)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
DateType
\n", "
12 (date)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
timestamp
\n", "
13 (timestamp)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
Cybertronian
\n", "
14 (boolean)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
function(binary)
\n", "
15 (binary)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
NullType
\n", "
16 (null)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " argenisleon@gmail.com\n", " \n", "
\n", "
\n", "
\n", " \n", " 28.0\n", " \n", "
\n", "
\n", "
\n", " \n", " Leader\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.300000190734863\n", " \n", "
\n", "
\n", "
\n", " \n", " [Inochi,⋅Convoy]\n", " \n", "
\n", "
\n", "
\n", " \n", " 19.442735,-99.201111\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016/09/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [8.53439998626709,⋅4300.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016-09-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Leader')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " bumbl#ebéé⋅⋅\n", " \n", "
\n", "
\n", "
\n", " \n", " 17.0\n", " \n", "
\n", "
\n", "
\n", " \n", " Espionage\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 2.0\n", " \n", "
\n", "
\n", "
\n", " \n", " [Bumble,⋅Goldback]\n", " \n", "
\n", "
\n", "
\n", " \n", " 10.642707,-71.612534\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015/08/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [5.334000110626221,⋅2000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015-08-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Espionage')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ironhide&\n", " \n", "
\n", "
\n", "
\n", " \n", " 26.0\n", " \n", "
\n", "
\n", "
\n", " \n", " Security\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.0\n", " \n", "
\n", "
\n", "
\n", " \n", " [Roadbuster]\n", " \n", "
\n", "
\n", "
\n", " \n", " 37.789563,-122.400356\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014/07/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [7.924799919128418,⋅4000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Security')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1⋅Megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " 13.0\n", " \n", "
\n", "
\n", "
\n", " \n", " First⋅Lieutenant\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 1.7999999523162842\n", " \n", "
\n", "
\n", "
\n", " \n", " [Meister]\n", " \n", "
\n", "
\n", "
\n", " \n", " 33.670666,-117.841553\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013/06/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [3.962399959564209,⋅1800.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'First⋅Lieutenant')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1⋅Megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 5.699999809265137\n", " \n", "
\n", "
\n", "
\n", " \n", " [Megatron]\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012/05/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [None,⋅5700.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012-05-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'None')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 300.0\n", " \n", "
\n", "
\n", "
\n", " \n", " Battle⋅Station\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " nan\n", " \n", "
\n", "
\n", "
\n", " \n", " [Metroflex]\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [91.44000244140625,⋅None]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011-04-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Battle⋅Station')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "\n", "
Viewing 6 of 6 rows / 16 columns
\n", "
1 partition(s) <class 'pyspark.sql.dataframe.DataFrame'>
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.ext.display(20)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 311.5 thousand rows / 17 columns
\n", "
1 partition(s)
\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
INCIDENT_NUMBER
\n", "
1 (object)
\n", " \n", "
\n", "
OFFENSE_CODE
\n", "
2 (int64)
\n", " \n", "
\n", "
OFFENSE_CODE_GROUP
\n", "
3 (object)
\n", " \n", "
\n", "
OFFENSE_DESCRIPTION
\n", "
4 (object)
\n", " \n", "
\n", "
DISTRICT
\n", "
5 (object)
\n", " \n", "
\n", "
REPORTING_AREA
\n", "
6 (object)
\n", " \n", "
\n", "
SHOOTING
\n", "
7 (object)
\n", " \n", "
\n", "
OCCURRED_ON_DATE
\n", "
8 (object)
\n", " \n", "
\n", "
YEAR
\n", "
9 (int64)
\n", " \n", "
\n", "
MONTH
\n", "
10 (int64)
\n", " \n", "
\n", "
DAY_OF_WEEK
\n", "
11 (object)
\n", " \n", "
\n", "
HOUR
\n", "
12 (int64)
\n", " \n", "
\n", "
UCR_PART
\n", "
13 (object)
\n", " \n", "
\n", "
STREET
\n", "
14 (object)
\n", " \n", "
\n", "
Lat
\n", "
15 (object)
\n", " \n", "
\n", "
Long
\n", "
16 (object)
\n", " \n", "
\n", "
Location
\n", "
17 (object)
\n", " \n", "
\n", "
\n", " \n", " I182070945\n", " \n", "
\n", "
\n", "
\n", " \n", " 619\n", " \n", "
\n", "
\n", "
\n", " \n", " Larceny\n", " \n", "
\n", "
\n", "
\n", " \n", " LARCENY⋅ALL⋅OTHERS\n", " \n", "
\n", "
\n", "
\n", " \n", " D14\n", " \n", "
\n", "
\n", "
\n", " \n", " 808\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-02⋅13:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Sunday\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " LINCOLN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.35779134\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.13937053\n", " \n", "
\n", "
\n", "
\n", " \n", " \"(42.35779134\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070943\n", " \n", "
\n", "
\n", "
\n", " \n", " 1402\n", " \n", "
\n", "
\n", "
\n", " \n", " Vandalism\n", " \n", "
\n", "
\n", "
\n", " \n", " VANDALISM\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 347\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-08-21⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " Tuesday\n", " \n", "
\n", "
\n", "
\n", " \n", " 0\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Two\n", " \n", "
\n", "
\n", "
\n", " \n", " HECLA⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30682138\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.06030035\n", " \n", "
\n", "
\n", "
\n", " \n", " \"(42.30682138\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070941\n", " \n", "
\n", "
\n", "
\n", " \n", " 3410\n", " \n", "
\n", "
\n", "
\n", " \n", " Towed\n", " \n", "
\n", "
\n", "
\n", " \n", " TOWED⋅MOTOR⋅VEHICLE\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 151\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅19:27:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 19\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " CAZENOVE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.34658879\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07242943\n", " \n", "
\n", "
\n", "
\n", " \n", " \"(42.34658879\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070940\n", " \n", "
\n", "
\n", "
\n", " \n", " 3114\n", " \n", "
\n", "
\n", "
\n", " \n", " Investigate⋅Property\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " D4\n", " \n", "
\n", "
\n", "
\n", " \n", " 272\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:16:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " NEWCOMB⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33418175\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07866441\n", " \n", "
\n", "
\n", "
\n", " \n", " \"(42.33418175\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070938\n", " \n", "
\n", "
\n", "
\n", " \n", " 3114\n", " \n", "
\n", "
\n", "
\n", " \n", " Investigate⋅Property\n", " \n", "
\n", "
\n", "
\n", " \n", " INVESTIGATE⋅PROPERTY\n", " \n", "
\n", "
\n", "
\n", " \n", " B3\n", " \n", "
\n", "
\n", "
\n", " \n", " 421\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:05:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " DELHI⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.27536542\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.09036101\n", " \n", "
\n", "
\n", "
\n", " \n", " \"(42.27536542\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070936\n", " \n", "
\n", "
\n", "
\n", " \n", " 3820\n", " \n", "
\n", "
\n", "
\n", " \n", " Motor⋅Vehicle⋅Accident⋅Response\n", " \n", "
\n", "
\n", "
\n", " \n", " M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 398\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:09:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " TALBOT⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29019621\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07159012\n", " \n", "
\n", "
\n", "
\n", " \n", " \"(42.29019621\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070933\n", " \n", "
\n", "
\n", "
\n", " \n", " 724\n", " \n", "
\n", "
\n", "
\n", " \n", " Auto⋅Theft\n", " \n", "
\n", "
\n", "
\n", " \n", " AUTO⋅THEFT\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 330\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅21:25:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 21\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " NORMANDY⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.30607218\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.08273260\n", " \n", "
\n", "
\n", "
\n", " \n", " \"(42.30607218\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070932\n", " \n", "
\n", "
\n", "
\n", " \n", " 3301\n", " \n", "
\n", "
\n", "
\n", " \n", " Verbal⋅Disputes\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " B2\n", " \n", "
\n", "
\n", "
\n", " \n", " 584\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:39:37\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " LAWN⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.32701648\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.10555088\n", " \n", "
\n", "
\n", "
\n", " \n", " \"(42.32701648\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070931\n", " \n", "
\n", "
\n", "
\n", " \n", " 301\n", " \n", "
\n", "
\n", "
\n", " \n", " Robbery\n", " \n", "
\n", "
\n", "
\n", " \n", " ROBBERY⋅-⋅STREET\n", " \n", "
\n", "
\n", "
\n", " \n", " C6\n", " \n", "
\n", "
\n", "
\n", " \n", " 177\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:48:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅One\n", " \n", "
\n", "
\n", "
\n", " \n", " MASSACHUSETTS⋅AVE\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.33152148\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.07085307\n", " \n", "
\n", "
\n", "
\n", " \n", " \"(42.33152148\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070929\n", " \n", "
\n", "
\n", "
\n", " \n", " 3301\n", " \n", "
\n", "
\n", "
\n", " \n", " Verbal⋅Disputes\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " C11\n", " \n", "
\n", "
\n", "
\n", " \n", " 364\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅20:38:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 20\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " LESLIE⋅ST\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.29514664\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.05860832\n", " \n", "
\n", "
\n", "
\n", " \n", " \"(42.29514664\n", " \n", "
\n", "
\n", "
\n", " \n", " I182070928\n", " \n", "
\n", "
\n", "
\n", " \n", " 3301\n", " \n", "
\n", "
\n", "
\n", " \n", " Verbal⋅Disputes\n", " \n", "
\n", "
\n", "
\n", " \n", " VERBAL⋅DISPUTE\n", " \n", "
\n", "
\n", "
\n", " \n", " C6\n", " \n", "
\n", "
\n", "
\n", " \n", " 913\n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " 2018-09-03⋅19:55:00\n", " \n", "
\n", "
\n", "
\n", " \n", " 2018\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Monday\n", " \n", "
\n", "
\n", "
\n", " \n", " 19\n", " \n", "
\n", "
\n", "
\n", " \n", " Part⋅Three\n", " \n", "
\n", "
\n", "
\n", " \n", " OCEAN⋅VIEW⋅DR\n", " \n", "
\n", "
\n", "
\n", " \n", " 42.31957856\n", " \n", "
\n", "
\n", "
\n", " \n", " -71.04032766\n", " \n", "
\n", "
\n", "
\n", " \n", " \"(42.31957856\n", " \n", "
\n", "
\n", "
\n", "
\n", "\n", "
Viewing 10 of 311.5 thousand rows / 17 columns
\n", "
1 partition(s) <class 'optimus.engines.pandas.dataframe.PandasDataFrame'>
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.display()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'SHOOTING': {'null': 318054, 'missing': 0, 'string': 1019},\n", " 'MONTH': {'null': 0, 'missing': 0, 'int': 319073},\n", " 'HOUR': {'null': 0, 'missing': 0, 'int': 319073},\n", " 'Lat': {'null': 19999, 'missing': 0, 'decimal': 299074},\n", " 'STREET': {'null': 10871, 'missing': 0, 'string': 308202},\n", " 'DISTRICT': {'null': 1765, 'missing': 0, 'string': 317308},\n", " 'OFFENSE_CODE_GROUP': {'null': 0, 'missing': 0, 'string': 319073},\n", " 'REPORTING_AREA': {'null': 0, 'missing': 0, 'string': 319073},\n", " 'OCCURRED_ON_DATE': {'null': 0, 'missing': 0, 'date': 319073},\n", " 'UCR_PART': {'null': 90, 'missing': 0, 'string': 318983},\n", " 'INCIDENT_NUMBER': {'null': 0, 'missing': 0, 'string': 319073},\n", " 'DAY_OF_WEEK': {'null': 0, 'missing': 0, 'string': 319073},\n", " 'OFFENSE_DESCRIPTION': {'null': 0, 'missing': 0, 'string': 319073},\n", " 'YEAR': {'null': 0, 'missing': 0, 'int': 319073},\n", " 'Long': {'null': 19999, 'missing': 0, 'decimal': 299074},\n", " 'OFFENSE_CODE': {'null': 0, 'missing': 0, 'int': 319073},\n", " 'Location': {'null': 0, 'missing': 0, 'string': 319073}}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.count_by_dtypes(\"*\", infer=False)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TimestampType\n" ] } ], "source": [ "from optimus.helpers.check import is_column_a\n", "is_column_a(df,\"OCCURRED_ON_DATE\",\"timestamp\")\n", "print(df.cols.schema_dtype(\"OCCURRED_ON_DATE\"))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('INCIDENT_NUMBER', 'string'),\n", " ('OFFENSE_CODE', 'int'),\n", " ('OFFENSE_CODE_GROUP', 'string'),\n", " ('OFFENSE_DESCRIPTION', 'string'),\n", " ('DISTRICT', 'string'),\n", " ('REPORTING_AREA', 'string'),\n", " ('SHOOTING', 'string'),\n", " ('OCCURRED_ON_DATE', 'timestamp'),\n", " ('YEAR', 'int'),\n", " ('MONTH', 'int'),\n", " ('DAY_OF_WEEK', 'string'),\n", " ('HOUR', 'int'),\n", " ('UCR_PART', 'string'),\n", " ('STREET', 'string'),\n", " ('Lat', 'double'),\n", " ('Long', 'double'),\n", " ('Location', 'string')]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "'columns' must be 'str' or 'list', received 'None'. Maybe the columns selected do not match a specified datatype filter.", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"OCCURRED_ON_DATE\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32m~\\Documents\\Optimus\\optimus\\engines\\spark\\columns.py\u001b[0m in \u001b[0;36mstd\u001b[1;34m(columns)\u001b[0m\n\u001b[0;32m 745\u001b[0m \"\"\"\n\u001b[0;32m 746\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfilter_by_column_dtypes\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconstants\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mNUMERIC_TYPES\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 747\u001b[1;33m \u001b[0mcheck_column_numbers\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"*\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 748\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 749\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mformat_dict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mCols\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0magg_exprs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mF\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstddev\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\columns.py\u001b[0m in \u001b[0;36mcheck_column_numbers\u001b[1;34m(columns, number)\u001b[0m\n\u001b[0;32m 216\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 217\u001b[0m RaiseIt.value_error(columns, [\"str\", \"list\"],\n\u001b[1;32m--> 218\u001b[1;33m extra_text=\"Maybe the columns selected do not match a specified datatype filter.\")\n\u001b[0m\u001b[0;32m 219\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 220\u001b[0m \u001b[0mcount\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\raiseit.py\u001b[0m in \u001b[0;36mvalue_error\u001b[1;34m(var, data_values, extra_text)\u001b[0m\n\u001b[0;32m 74\u001b[0m type=divisor.join(map(\n\u001b[0;32m 75\u001b[0m \u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;34m\"'\"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\"'\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 76\u001b[1;33m data_values)), var_type=one_list_to_val(var), extra_text=extra_text))\n\u001b[0m\u001b[0;32m 77\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 78\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mValueError\u001b[0m: 'columns' must be 'str' or 'list', received 'None'. Maybe the columns selected do not match a specified datatype filter." ] } ], "source": [ "df.cols.std(\"OCCURRED_ON_DATE\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.ext.send(\"OCCURRED_ON_DATE\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.cols.hist(\"*\")" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "VVV StringType\n", "DATA (, , , , , )\n", "VVV StringType\n", "DATA (,)\n", "EXEC AGG 1\n" ] }, { "data": { "text/plain": [ "{'INCIDENT_NUMBER': {'hist': [{'count': 0.0, 'lower': 0.0, 'upper': 2.5},\n", " {'count': 0.0, 'lower': 2.5, 'upper': 5.0},\n", " {'count': 0.0, 'lower': 5.0, 'upper': 7.5},\n", " {'count': 1.0, 'lower': 7.5, 'upper': 10.0},\n", " {'count': 318719.0, 'lower': 10.0, 'upper': 12.5},\n", " {'count': 353.0, 'lower': 12.5, 'upper': 15.0},\n", " {'count': 0.0, 'lower': 15.0, 'upper': 17.5},\n", " {'count': 0.0, 'lower': 17.5, 'upper': 20.0},\n", " {'count': 0.0, 'lower': 20.0, 'upper': 22.5},\n", " {'count': 0.0, 'lower': 22.5, 'upper': 25.0},\n", " {'count': 0.0, 'lower': 25.0, 'upper': 27.5},\n", " {'count': 0.0, 'lower': 27.5, 'upper': 30.0},\n", " {'count': 0.0, 'lower': 30.0, 'upper': 32.5},\n", " {'count': 0.0, 'lower': 32.5, 'upper': 35.0},\n", " {'count': 0.0, 'lower': 35.0, 'upper': 37.5},\n", " {'count': 0.0, 'lower': 37.5, 'upper': 40.0},\n", " {'count': 0.0, 'lower': 40.0, 'upper': 42.5},\n", " {'count': 0.0, 'lower': 42.5, 'upper': 45.0},\n", " {'count': 0.0, 'lower': 45.0, 'upper': 47.5},\n", " {'count': 0.0, 'lower': 47.5, 'upper': 50.0}]}}" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.hist(\"INCIDENT_NUMBER\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ShortType (, , , , , )\n", "0.5 44.5\n", "ShortType (, , , , , )\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 1 of 1 rows / 16 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
names
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
height(ft)
\n", "
2 (smallint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
rank
\n", "
4 (tinyint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
age
\n", "
5 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
weight(t)
\n", "
6 (float)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
japanese name
\n", "
7 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last position seen
\n", "
8 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
date arrival
\n", "
9 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last date seen
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
attributes
\n", "
11 (array<float>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
DateType
\n", "
12 (date)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
timestamp
\n", "
13 (timestamp)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Cybertronian
\n", "
14 (boolean)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function(binary)
\n", "
15 (binary)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
NullType
\n", "
16 (null)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 300\n", " \n", "
\n", "
\n", "
\n", " \n", " Battle⋅Station\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " [Metroflex]\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [91.44000244140625,⋅None]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011-04-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Battle⋅Station')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 1 of 1 rows / 16 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.outliers.tukey(\"height(ft)\").select().ext.display()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'{\"price\": {\"hist\": [{\"count\": 6.0, \"lower\": 8.0, \"upper\": 8.1}, {\"count\": 0.0, \"lower\": 8.1, \"upper\": 8.2}, {\"count\": 0.0, \"lower\": 8.2, \"upper\": 8.3}, {\"count\": 0.0, \"lower\": 8.3, \"upper\": 8.4}, {\"count\": 0.0, \"lower\": 8.4, \"upper\": 8.5}, {\"count\": 0.0, \"lower\": 8.5, \"upper\": 8.6}, {\"count\": 0.0, \"lower\": 8.6, \"upper\": 8.7}, {\"count\": 0.0, \"lower\": 8.7, \"upper\": 8.8}, {\"count\": 0.0, \"lower\": 8.8, \"upper\": 8.9}, {\"count\": 0.0, \"lower\": 8.9, \"upper\": 9.0}, {\"count\": 2.0, \"lower\": 9.0, \"upper\": 9.1}, {\"count\": 0.0, \"lower\": 9.1, \"upper\": 9.2}, {\"count\": 0.0, \"lower\": 9.2, \"upper\": 9.3}, {\"count\": 0.0, \"lower\": 9.3, \"upper\": 9.4}, {\"count\": 0.0, \"lower\": 9.4, \"upper\": 9.5}, {\"count\": 0.0, \"lower\": 9.5, \"upper\": 9.6}, {\"count\": 0.0, \"lower\": 9.6, \"upper\": 9.7}, {\"count\": 0.0, \"lower\": 9.7, \"upper\": 9.8}, {\"count\": 0.0, \"lower\": 9.8, \"upper\": 9.9}, {\"count\": 0.0, \"lower\": 9.9, \"upper\": 10.0}]}}'" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outlier.hist(\"price\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': {'null': 0, 'missing': 0, 'int': 19}}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.count_by_dtypes(\"id\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "19" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.count()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "6\n" ] }, { "data": { "text/plain": [ "{'count_outliers': 9,\n", " 'count_non_outliers': 10,\n", " 'lower_bound': 6,\n", " 'lower_bound_count': 9,\n", " 'upper_bound': 10,\n", " 'upper_bound_count': 0}" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outlier.info()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# df.table()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'names': {'email': 1, 'mismatch': 4, 'null': 1, 'missing': 0}}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.count_mismatch({\"names\":\"argenisleon@gmail.com\",\"names\":\"email\"})" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "a = {'names': {'email': 1, 'mismatch': 4, 'null': 1}}" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('string', 'array')" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tuple({\"firstName\":\"string\",\"lastName\":\"array\"}.values())" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "from infer import Infer" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(('names', 'null'), 1)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from infer import Infer\n", "Infer.mismatch((\"names\",None),{\"names\":\"email\"})" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "Infer.value(12, \"string\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['firstName', 'lastName']" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list({\"firstName\":\"string\",\"lastName\":\"string\"}.keys())" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'df' is not defined", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrows\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mselect_by_dtypes\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"names\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m\"str\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mNameError\u001b[0m: name 'df' is not defined" ] } ], "source": [ "df.rows.select_by_dtypes(\"names\",\"str\")" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 3 of 3 rows / 16 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
names
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
height(ft)
\n", "
2 (smallint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
rank
\n", "
4 (tinyint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
age
\n", "
5 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
weight(t)
\n", "
6 (float)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
japanese name
\n", "
7 (array<string>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last position seen
\n", "
8 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
date arrival
\n", "
9 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last date seen
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
attributes
\n", "
11 (array<float>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
DateType
\n", "
12 (date)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
timestamp
\n", "
13 (timestamp)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Cybertronian
\n", "
14 (boolean)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function(binary)
\n", "
15 (binary)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
NullType
\n", "
16 (null)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " bumbl#ebéé⋅⋅\n", " \n", "
\n", "
\n", "
\n", " \n", " 17\n", " \n", "
\n", "
\n", "
\n", " \n", " Espionage\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 2.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Bumble',⋅'Goldback']\n", " \n", "
\n", "
\n", "
\n", " \n", " 10.642707,-71.612534\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015/08/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [5.334000110626221,⋅2000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015-08-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Espionage')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ironhide&\n", " \n", "
\n", "
\n", "
\n", " \n", " 26\n", " \n", "
\n", "
\n", "
\n", " \n", " Security\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Roadbuster']\n", " \n", "
\n", "
\n", "
\n", " \n", " 37.789563,-122.400356\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014/07/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [7.924799919128418,⋅4000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Security')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1⋅Megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " First⋅Lieutenant\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 1.7999999523162842\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Meister']\n", " \n", "
\n", "
\n", "
\n", " \n", " 33.670666,-117.841553\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013/06/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [3.962399959564209,⋅1800.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'First⋅Lieutenant')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 3 of 3 rows / 16 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Histograma\n", "df.rows.between(\"height(ft)\",17,26, invert = False , equal =True, ).table()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 6 of 6 rows / 16 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
names
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
height(ft)
\n", "
2 (smallint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
rank
\n", "
4 (tinyint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
age
\n", "
5 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
weight(t)
\n", "
6 (float)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
japanese name
\n", "
7 (array<string>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last position seen
\n", "
8 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
date arrival
\n", "
9 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last date seen
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
attributes
\n", "
11 (array<float>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
DateType
\n", "
12 (date)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
timestamp
\n", "
13 (timestamp)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Cybertronian
\n", "
14 (boolean)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function(binary)
\n", "
15 (binary)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
NullType
\n", "
16 (null)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " Optimus⋅OptimusPrime\n", " \n", "
\n", "
\n", "
\n", " \n", " 28\n", " \n", "
\n", "
\n", "
\n", " \n", " redaeL\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.300000190734863\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Inochi',⋅'Convoy']\n", " \n", "
\n", "
\n", "
\n", " \n", " 19.442735,-99.201111\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016/09/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [8.53439998626709,⋅4300.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016-09-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Leader')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " bumbl#ebéé⋅⋅\n", " \n", "
\n", "
\n", "
\n", " \n", " 17\n", " \n", "
\n", "
\n", "
\n", " \n", " eganoipsE\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 2.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Bumble',⋅'Goldback']\n", " \n", "
\n", "
\n", "
\n", " \n", " 10.642707,-71.612534\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015/08/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [5.334000110626221,⋅2000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015-08-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Espionage')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ironhide&\n", " \n", "
\n", "
\n", "
\n", " \n", " 26\n", " \n", "
\n", "
\n", "
\n", " \n", " ytiruceS\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Roadbuster']\n", " \n", "
\n", "
\n", "
\n", " \n", " 37.789563,-122.400356\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014/07/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [7.924799919128418,⋅4000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Security')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1⋅Megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " tnanetueiL⋅tsriF\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 1.7999999523162842\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Meister']\n", " \n", "
\n", "
\n", "
\n", " \n", " 33.670666,-117.841553\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013/06/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [3.962399959564209,⋅1800.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'First⋅Lieutenant')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1⋅Megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " enoN\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 5.699999809265137\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Megatron']\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012/05/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [None,⋅5700.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012-05-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'None')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " megatron⋅1\n", " \n", "
\n", "
\n", "
\n", " \n", " 300\n", " \n", "
\n", "
\n", "
\n", " \n", " noitatS⋅elttaB\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Metroflex']\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [91.44000244140625,⋅None]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011-04-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Battle⋅Station')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 6 of 6 rows / 16 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.cols.reverse(\"function\").table()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "outlier = df.outliers.tukey(\"mass (g)\")" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'{\"columns\": [{\"title\": \"mass (g)\"}], \"value\": [[21.0], [160.0], [252.0], [256.8], [320.0], [41.0], [94.2], [265.0], [146.0], [134.0], [345.0], [14.0], [23.2], [17.0], [375.0], [270.0], [13.9], [18.0], [100.0], [488.1], [470.0], [67.8], [56.0], [190.0], [219.0], [324.0], [357.0], [212.0], [478.0], [342.0], [8.0], [94.0], [45.6], [0.5], [72.0], [367.0], [303.0], [48.6], [469.0], [78.4], [167.0], [100.0], [340.0], [28.0], [0.8], [230.0], [400.0], [438.0], [230.0], [30.0], [300.0], [188.0], [127.0], [277.0], [113.0], [107.2], [380.0], [82.0], [220.0], [240.0], [132.7], [36.1], [28.0], [380.0], [102.0], [480.0], [45.5], [215.0], [288.0], [28.0], [0.2], [315.0], [414.0], [167.7], [305.5], [180.0], [266.1], [112.0], [22.0], [450.0], [222.0], [100.0], [30.0], [483.0], [89.0], [230.0], [350.0], [448.0], [299.0], [400.0], [180.0], [450.0], [100.0], [331.0], [195.0], [140.0], [67.4], [97.7], [202.6], [136.0]]}'" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# print(outlier.info())\n", "outlier.select_lower_bound()" ] }, { "cell_type": "code", "execution_count": 256, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 19 rows / 9 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
id
\n", "
1 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
firstName
\n", "
2 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
lastName
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
billingId
\n", "
4 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
product
\n", "
5 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
price
\n", "
6 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
birth
\n", "
7 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
dummyCol
\n", "
8 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
product***FINGERPRINT
\n", "
9 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " Luis\n", " \n", "
\n", "
\n", "
\n", " \n", " Alvarez$$%!\n", " \n", "
\n", "
\n", "
\n", " \n", " 123\n", " \n", "
\n", "
\n", "
\n", " \n", " Cake\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/07/07\n", " \n", "
\n", "
\n", "
\n", " \n", " never\n", " \n", "
\n", "
\n", "
\n", " \n", " cake\n", " \n", "
\n", "
\n", "
\n", " \n", " 2\n", " \n", "
\n", "
\n", "
\n", " \n", " André\n", " \n", "
\n", "
\n", "
\n", " \n", " Ampère\n", " \n", "
\n", "
\n", "
\n", " \n", " 423\n", " \n", "
\n", "
\n", "
\n", " \n", " piza\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 1950/07/08\n", " \n", "
\n", "
\n", "
\n", " \n", " gonna\n", " \n", "
\n", "
\n", "
\n", " \n", " piza\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " NiELS\n", " \n", "
\n", "
\n", "
\n", " \n", " Böhr//((%%\n", " \n", "
\n", "
\n", "
\n", " \n", " 551\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 1990/07/09\n", " \n", "
\n", "
\n", "
\n", " \n", " give\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " 4\n", " \n", "
\n", "
\n", "
\n", " \n", " PAUL\n", " \n", "
\n", "
\n", "
\n", " \n", " dirac$\n", " \n", "
\n", "
\n", "
\n", " \n", " 521\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 1954/07/10\n", " \n", "
\n", "
\n", "
\n", " \n", " you\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " 5\n", " \n", "
\n", "
\n", "
\n", " \n", " Albert\n", " \n", "
\n", "
\n", "
\n", " \n", " Einstein\n", " \n", "
\n", "
\n", "
\n", " \n", " 634\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 1990/07/11\n", " \n", "
\n", "
\n", "
\n", " \n", " up\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " 6\n", " \n", "
\n", "
\n", "
\n", " \n", " Galileo\n", " \n", "
\n", "
\n", "
\n", " \n", " ⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI\n", " \n", "
\n", "
\n", "
\n", " \n", " 672\n", " \n", "
\n", "
\n", "
\n", " \n", " arepa\n", " \n", "
\n", "
\n", "
\n", " \n", " 5\n", " \n", "
\n", "
\n", "
\n", " \n", " 1930/08/12\n", " \n", "
\n", "
\n", "
\n", " \n", " never\n", " \n", "
\n", "
\n", "
\n", " \n", " arepa\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " CaRL\n", " \n", "
\n", "
\n", "
\n", " \n", " Ga%%%uss\n", " \n", "
\n", "
\n", "
\n", " \n", " 323\n", " \n", "
\n", "
\n", "
\n", " \n", " taco\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 1970/07/13\n", " \n", "
\n", "
\n", "
\n", " \n", " gonna\n", " \n", "
\n", "
\n", "
\n", " \n", " taco\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " David\n", " \n", "
\n", "
\n", "
\n", " \n", " H$$$ilbert\n", " \n", "
\n", "
\n", "
\n", " \n", " 624\n", " \n", "
\n", "
\n", "
\n", " \n", " taaaccoo\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 1950/07/14\n", " \n", "
\n", "
\n", "
\n", " \n", " let\n", " \n", "
\n", "
\n", "
\n", " \n", " taaaccoo\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Johannes\n", " \n", "
\n", "
\n", "
\n", " \n", " KEPLER\n", " \n", "
\n", "
\n", "
\n", " \n", " 735\n", " \n", "
\n", "
\n", "
\n", " \n", " taco\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 1920/04/22\n", " \n", "
\n", "
\n", "
\n", " \n", " you\n", " \n", "
\n", "
\n", "
\n", " \n", " taco\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " JaMES\n", " \n", "
\n", "
\n", "
\n", " \n", " M$$ax%%well\n", " \n", "
\n", "
\n", "
\n", " \n", " 875\n", " \n", "
\n", "
\n", "
\n", " \n", " taco\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 1923/03/12\n", " \n", "
\n", "
\n", "
\n", " \n", " down\n", " \n", "
\n", "
\n", "
\n", " \n", " taco\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 10 of 19 rows / 9 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "keyCol.fingerprint(df,\"product\").table()" ] }, { "cell_type": "code", "execution_count": 245, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 6 of 6 rows / 17 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
names
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
height(ft)
\n", "
2 (smallint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
rank
\n", "
4 (tinyint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
age
\n", "
5 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
weight(t)
\n", "
6 (float)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
japanese name
\n", "
7 (array<string>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last position seen
\n", "
8 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
date arrival
\n", "
9 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last date seen
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
attributes
\n", "
11 (array<float>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
DateType
\n", "
12 (date)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
timestamp
\n", "
13 (timestamp)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Cybertronian
\n", "
14 (boolean)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function(binary)
\n", "
15 (binary)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
NullType
\n", "
16 (null)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
names***FINGERPRINT
\n", "
17 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " Optimus⋅OptimusPrime\n", " \n", "
\n", "
\n", "
\n", " \n", " 28\n", " \n", "
\n", "
\n", "
\n", " \n", " Leader\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.300000190734863\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Inochi',⋅'Convoy']\n", " \n", "
\n", "
\n", "
\n", " \n", " 19.442735,-99.201111\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016/09/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [8.53439998626709,⋅4300.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016-09-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Leader')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " optimusoptimusprime\n", " \n", "
\n", "
\n", "
\n", " \n", " bumbl#ebéé⋅⋅\n", " \n", "
\n", "
\n", "
\n", " \n", " 17\n", " \n", "
\n", "
\n", "
\n", " \n", " Espionage\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 2.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Bumble',⋅'Goldback']\n", " \n", "
\n", "
\n", "
\n", " \n", " 10.642707,-71.612534\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015/08/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [5.334000110626221,⋅2000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015-08-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Espionage')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " bumblebee\n", " \n", "
\n", "
\n", "
\n", " \n", " ironhide&\n", " \n", "
\n", "
\n", "
\n", " \n", " 26\n", " \n", "
\n", "
\n", "
\n", " \n", " Security\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Roadbuster']\n", " \n", "
\n", "
\n", "
\n", " \n", " 37.789563,-122.400356\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014/07/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [7.924799919128418,⋅4000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Security')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ironhide\n", " \n", "
\n", "
\n", "
\n", " \n", " 1⋅Megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " First⋅Lieutenant\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 1.7999999523162842\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Meister']\n", " \n", "
\n", "
\n", "
\n", " \n", " 33.670666,-117.841553\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013/06/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [3.962399959564209,⋅1800.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'First⋅Lieutenant')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " 1⋅Megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 5.699999809265137\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Megatron']\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012/05/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [None,⋅5700.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012-05-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'None')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " megatron⋅1\n", " \n", "
\n", "
\n", "
\n", " \n", " 300\n", " \n", "
\n", "
\n", "
\n", " \n", " Battle⋅Station\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Metroflex']\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [91.44000244140625,⋅None]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011-04-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Battle⋅Station')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1megatron\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 6 of 6 rows / 17 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "keyCol.fingerprint(df,\"names\").table()" ] }, { "cell_type": "code", "execution_count": 259, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "'{\"taaaccoo\": {\"similar\": {\"taaaccoo\": 1}, \"count\": 1, \"sum\": 1}, \"piza\": {\"similar\": {\"piza\": 1}, \"count\": 1, \"sum\": 1}, \"hamburguer\": {\"similar\": {\"hamburguer\": 1}, \"count\": 1, \"sum\": 1}, \"taco\": {\"similar\": {\"taco\": 3}, \"count\": 1, \"sum\": 3}, \"pizzza\": {\"similar\": {\"pizzza\": 1}, \"count\": 1, \"sum\": 1}, \"arepa\": {\"similar\": {\"arepa\": 1}, \"count\": 1, \"sum\": 1}, \"pizza\": {\"similar\": {\"pizza\": 4}, \"count\": 1, \"sum\": 4}, \"Rice\": {\"similar\": {\"Rice\": 1}, \"count\": 1, \"sum\": 1}, \"110790\": {\"similar\": {\"110790\": 1}, \"count\": 1, \"sum\": 1}, \"BEER\": {\"similar\": {\"BEER\": 1}, \"count\": 1, \"sum\": 1}, \"Cake\": {\"similar\": {\"Cake\": 1}, \"count\": 1, \"sum\": 1}, \"null\": {\"similar\": {\"null\": 1}, \"count\": 1, \"sum\": 1}, \"pasta\": {\"similar\": {\"pasta\": 2}, \"count\": 1, \"sum\": 2}}'" ] }, "execution_count": 259, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keyCol.fingerprint_cluster(df,\"product\", output=\"json\")" ] }, { "cell_type": "code", "execution_count": 261, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'{\"arepa\": {\"similar\": {\"arepa\": 1}, \"count\": 1, \"sum\": 1}, \"taaaccoo\": {\"similar\": {\"taaaccoo\": 1}, \"count\": 1, \"sum\": 1}, \"pasta\": {\"similar\": {\"pasta\": 2}, \"count\": 1, \"sum\": 2}, \"pizza\": {\"similar\": {\"pizzza\": 1, \"pizza\": 4}, \"count\": 2, \"sum\": 5}, \"110790\": {\"similar\": {\"110790\": 1}, \"count\": 1, \"sum\": 1}, \"hamburguer\": {\"similar\": {\"hamburguer\": 1}, \"count\": 1, \"sum\": 1}, \"taco\": {\"similar\": {\"taco\": 3}, \"count\": 1, \"sum\": 3}, \"Cake\": {\"similar\": {\"Cake\": 1}, \"count\": 1, \"sum\": 1}, \"Rice\": {\"similar\": {\"Rice\": 1}, \"count\": 1, \"sum\": 1}, \"piza\": {\"similar\": {\"piza\": 1}, \"count\": 1, \"sum\": 1}, \"null\": {\"similar\": {\"null\": 1}, \"count\": 1, \"sum\": 1}, \"BEER\": {\"similar\": {\"BEER\": 1}, \"count\": 1, \"sum\": 1}}'" ] }, "execution_count": 261, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keyCol.n_gram_fingerprint_cluster(df,\"product\", output=\"json\",n_size=2)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from optimus.ml import keycollision as keyCol\n", "from optimus.ml import distancecluster as dc" ] }, { "cell_type": "code", "execution_count": 258, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'{\"taaaccoo\": {\"similar\": {\"taco\": 3, \"taaaccoo\": 1}, \"count\": 2, \"sum\": 4}, \"piza\": {\"similar\": {\"pizza\": 4, \"piza\": 1}, \"count\": 2, \"sum\": 5}, \"hamburguer\": {\"similar\": {\"BEER\": 1, \"hamburguer\": 1}, \"count\": 2, \"sum\": 2}, \"taco\": {\"similar\": {\"Cake\": 1, \"Rice\": 1, \"taco\": 3}, \"count\": 3, \"sum\": 5}, \"pizzza\": {\"similar\": {\"pizza\": 4, \"pizzza\": 1}, \"count\": 2, \"sum\": 5}, \"arepa\": {\"similar\": {\"BEER\": 1, \"piza\": 1, \"pasta\": 2, \"Cake\": 1, \"Rice\": 1, \"pizza\": 4, \"arepa\": 1}, \"count\": 7, \"sum\": 11}, \"pizza\": {\"similar\": {\"piza\": 1, \"pizzza\": 1, \"pizza\": 4}, \"count\": 3, \"sum\": 6}, \"Rice\": {\"similar\": {\"piza\": 1, \"Cake\": 1, \"taco\": 3, \"Rice\": 1}, \"count\": 4, \"sum\": 6}, \"110790\": {\"similar\": {\"arepa\": 1, \"BEER\": 1, \"piza\": 1, \"pizzza\": 1, \"pasta\": 2, \"Cake\": 1, \"null\": 1, \"Rice\": 1, \"pizza\": 4, \"taco\": 3, \"110790\": 1}, \"count\": 11, \"sum\": 17}, \"BEER\": {\"similar\": {\"arepa\": 1, \"piza\": 1, \"Cake\": 1, \"null\": 1, \"Rice\": 1, \"taco\": 3, \"BEER\": 1}, \"count\": 7, \"sum\": 9}, \"Cake\": {\"similar\": {\"Rice\": 1, \"taco\": 3, \"Cake\": 1}, \"count\": 3, \"sum\": 5}, \"null\": {\"similar\": {\"BEER\": 1, \"piza\": 1, \"Cake\": 1, \"Rice\": 1, \"taco\": 3, \"null\": 1}, \"count\": 6, \"sum\": 8}, \"pasta\": {\"similar\": {\"piza\": 1, \"pizza\": 4, \"pasta\": 2}, \"count\": 3, \"sum\": 7}}'" ] }, "execution_count": 258, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dc.levenshtein_cluster(df,\"product\", output=\"json\")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 6 of 6 rows / 4 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
count
\n", "
1 (string)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
names
\n", "
2 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
names***NGRAM
\n", "
3 (array<string>)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
names***NGRAM_FINGERPRINT
\n", "
4 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " bumbl#ebéé⋅⋅\n", " \n", "
\n", "
\n", "
\n", " \n", " ['bumblebee']\n", " \n", "
\n", "
\n", "
\n", " \n", " bumblebee\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " ironhide&\n", " \n", "
\n", "
\n", "
\n", " \n", " ['ironhide']\n", " \n", "
\n", "
\n", "
\n", " \n", " ironhide\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " Megatron2\n", " \n", "
\n", "
\n", "
\n", " \n", " ['megatron2']\n", " \n", "
\n", "
\n", "
\n", " \n", " megatron2\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " Optimus⋅OptimusPrime\n", " \n", "
\n", "
\n", "
\n", " \n", " ['optimusoptimusprime']\n", " \n", "
\n", "
\n", "
\n", " \n", " optimusoptimusprime\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " Megatron1\n", " \n", "
\n", "
\n", "
\n", " \n", " ['megatron1']\n", " \n", "
\n", "
\n", "
\n", " \n", " megatron1\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " Megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " ['megatron']\n", " \n", "
\n", "
\n", "
\n", " \n", " megatron\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 6 of 6 rows / 4 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'{\"ironhide&\": {\"similar\": {\"ironhide&\": 1}, \"count\": 1, \"sum\": 1.0}, \"Megatron1\": {\"similar\": {\"Megatron1\": 1}, \"count\": 1, \"sum\": 1.0}, \"Optimus OptimusPrime\": {\"similar\": {\"Optimus OptimusPrime\": 1}, \"count\": 1, \"sum\": 1.0}, \"Megatron\": {\"similar\": {\"Megatron\": 1}, \"count\": 1, \"sum\": 1.0}, \"bumbl#eb\\\\u00e9\\\\u00e9 \": {\"similar\": {\"bumbl#eb\\\\u00e9\\\\u00e9 \": 1}, \"count\": 1, \"sum\": 1.0}, \"Megatron2\": {\"similar\": {\"Megatron2\": 1}, \"count\": 1, \"sum\": 1.0}}'" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keyCol.n_gram_fingerprint_cluster(df,\"names\", n_size=1,output=\"json\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 6 of 6 rows / 16 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
names
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
height(ft)
\n", "
2 (smallint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
rank
\n", "
4 (tinyint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
age
\n", "
5 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
weight(t)
\n", "
6 (float)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
japanese name
\n", "
7 (array<string>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last position seen
\n", "
8 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
date arrival
\n", "
9 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last date seen
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
attributes
\n", "
11 (array<float>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
DateType
\n", "
12 (date)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
timestamp
\n", "
13 (timestamp)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Cybertronian
\n", "
14 (boolean)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function(binary)
\n", "
15 (binary)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
NullType
\n", "
16 (null)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " Optimus⋅OptimusPrime\n", " \n", "
\n", "
\n", "
\n", " \n", " 28\n", " \n", "
\n", "
\n", "
\n", " \n", " Leader\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.300000190734863\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Inochi',⋅'Convoy']\n", " \n", "
\n", "
\n", "
\n", " \n", " 19.442735,-99.201111\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016/09/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [8.53439998626709,⋅4300.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016-09-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Leader')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " bumbl#ebéé⋅⋅\n", " \n", "
\n", "
\n", "
\n", " \n", " 17\n", " \n", "
\n", "
\n", "
\n", " \n", " Espionage\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 2.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Bumble',⋅'Goldback']\n", " \n", "
\n", "
\n", "
\n", " \n", " 10.642707,-71.612534\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015/08/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [5.334000110626221,⋅2000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015-08-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Espionage')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ironhide&\n", " \n", "
\n", "
\n", "
\n", " \n", " 26\n", " \n", "
\n", "
\n", "
\n", " \n", " Security\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Roadbuster']\n", " \n", "
\n", "
\n", "
\n", " \n", " 37.789563,-122.400356\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014/07/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [7.924799919128418,⋅4000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Security')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " Megatron1\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " First⋅Lieutenant\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 1.7999999523162842\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Meister']\n", " \n", "
\n", "
\n", "
\n", " \n", " 33.670666,-117.841553\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013/06/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [3.962399959564209,⋅1800.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'First⋅Lieutenant')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " Megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 5.699999809265137\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Megatron']\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012/05/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [None,⋅5700.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012-05-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'None')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " 300\n", " \n", "
\n", "
\n", "
\n", " \n", " Battle⋅Station\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Metroflex']\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [91.44000244140625,⋅None]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011-04-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Battle⋅Station')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 6 of 6 rows / 16 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.table()" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "# df = op.load.csv(\"data/foo.csv\", sep=\",\", header='true', infer_schema='true', charset=\"UTF-8\", null_value=\"None\")" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 6 of 6 rows / 16 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
names
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
height(ft)
\n", "
2 (smallint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
rank
\n", "
4 (tinyint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
age
\n", "
5 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
weight(t)
\n", "
6 (float)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
japanese name
\n", "
7 (array<string>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last position seen
\n", "
8 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
date arrival
\n", "
9 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last date seen
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
attributes
\n", "
11 (array<float>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
DateType
\n", "
12 (date)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
timestamp
\n", "
13 (timestamp)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Cybertronian
\n", "
14 (boolean)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function(binary)
\n", "
15 (binary)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
NullType
\n", "
16 (null)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " Optimus⋅OptimusPrime\n", " \n", "
\n", "
\n", "
\n", " \n", " 28\n", " \n", "
\n", "
\n", "
\n", " \n", " Leader\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.300000190734863\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Inochi',⋅'Convoy']\n", " \n", "
\n", "
\n", "
\n", " \n", " 19.442735,-99.201111\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016/09/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [8.53439998626709,⋅4300.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016-09-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Leader')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " bumbl#ebéé⋅⋅\n", " \n", "
\n", "
\n", "
\n", " \n", " 17\n", " \n", "
\n", "
\n", "
\n", " \n", " Espionage\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 2.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Bumble',⋅'Goldback']\n", " \n", "
\n", "
\n", "
\n", " \n", " 10.642707,-71.612534\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015/08/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [5.334000110626221,⋅2000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015-08-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Espionage')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ironhide&\n", " \n", "
\n", "
\n", "
\n", " \n", " 26\n", " \n", "
\n", "
\n", "
\n", " \n", " Security\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Roadbuster']\n", " \n", "
\n", "
\n", "
\n", " \n", " 37.789563,-122.400356\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014/07/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [7.924799919128418,⋅4000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Security')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " JaJa⋅JaJaJ\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " First⋅Lieutenant\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 1.7999999523162842\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Meister']\n", " \n", "
\n", "
\n", "
\n", " \n", " 33.670666,-117.841553\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013/06/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [3.962399959564209,⋅1800.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'First⋅Lieutenant')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " Megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 5.699999809265137\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Megatron']\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012/05/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [None,⋅5700.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012-05-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'None')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " Metroplex_)^$\n", " \n", "
\n", "
\n", "
\n", " \n", " 300\n", " \n", "
\n", "
\n", "
\n", " \n", " Battle⋅Station\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Metroflex']\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [91.44000244140625,⋅None]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011-04-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Battle⋅Station')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 6 of 6 rows / 16 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.table()" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 6 of 6 rows / 16 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
names
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
height(ft)
\n", "
2 (smallint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
rank
\n", "
4 (tinyint)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
age
\n", "
5 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
weight(t)
\n", "
6 (float)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
japanese name
\n", "
7 (array<string>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last position seen
\n", "
8 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
date arrival
\n", "
9 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
last date seen
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
attributes
\n", "
11 (array<float>)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
DateType
\n", "
12 (date)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
timestamp
\n", "
13 (timestamp)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Cybertronian
\n", "
14 (boolean)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
function(binary)
\n", "
15 (binary)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
NullType
\n", "
16 (null)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " Optimus⋅OptimusPrime\n", " \n", "
\n", "
\n", "
\n", " \n", " 28\n", " \n", "
\n", "
\n", "
\n", " \n", " Leader\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.300000190734863\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Inochi',⋅'Convoy']\n", " \n", "
\n", "
\n", "
\n", " \n", " 19.442735,-99.201111\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016/09/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [8.53439998626709,⋅4300.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2016-09-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Leader')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " bumbl#ebéé⋅⋅\n", " \n", "
\n", "
\n", "
\n", " \n", " 17\n", " \n", "
\n", "
\n", "
\n", " \n", " Espionage\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 2.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Bumble',⋅'Goldback']\n", " \n", "
\n", "
\n", "
\n", " \n", " 10.642707,-71.612534\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015/08/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [5.334000110626221,⋅2000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2015-08-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Espionage')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ironhide&\n", " \n", "
\n", "
\n", "
\n", " \n", " 26\n", " \n", "
\n", "
\n", "
\n", " \n", " Security\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 4.0\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Roadbuster']\n", " \n", "
\n", "
\n", "
\n", " \n", " 37.789563,-122.400356\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014/07/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [7.924799919128418,⋅4000.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Security')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " aaa⋅JaJaJ\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " First⋅Lieutenant\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 1.7999999523162842\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Meister']\n", " \n", "
\n", "
\n", "
\n", " \n", " 33.670666,-117.841553\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013/06/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [3.962399959564209,⋅1800.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2013-06-24\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'First⋅Lieutenant')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " Megatron\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " 5.699999809265137\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Megatron']\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012/05/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [None,⋅5700.0]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2012-05-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'None')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " Metroplex_)^$\n", " \n", "
\n", "
\n", "
\n", " \n", " 300\n", " \n", "
\n", "
\n", "
\n", " \n", " Battle⋅Station\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 5000000\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " ['Metroflex']\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011/04/10\n", " \n", "
\n", "
\n", "
\n", " \n", " [91.44000244140625,⋅None]\n", " \n", "
\n", "
\n", "
\n", " \n", " 2011-04-10\n", " \n", "
\n", "
\n", "
\n", " \n", " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", "
\n", " \n", " True\n", " \n", "
\n", "
\n", "
\n", " \n", " bytearray(b'Battle⋅Station')\n", " \n", "
\n", "
\n", "
\n", " \n", " None\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 6 of 6 rows / 16 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.cols.replace(\"names\",[\"JaJa\",\"bbb\"],\"aaa\",search_by=\"words\").table()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Send!\n" ] } ], "source": [ "df.send()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 19 of 19 rows / 8 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
id
\n", "
1 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
firstName
\n", "
2 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
lastName
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
billingId
\n", "
4 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
product
\n", "
5 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
price
\n", "
6 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
birth
\n", "
7 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
dummyCol
\n", "
8 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " Luis\n", " \n", "
\n", "
\n", "
\n", " \n", " Alvarez$$%!\n", " \n", "
\n", "
\n", "
\n", " \n", " 123\n", " \n", "
\n", "
\n", "
\n", " \n", " Cake\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 1980/07/07\n", " \n", "
\n", "
\n", "
\n", " \n", " never\n", " \n", "
\n", "
\n", "
\n", " \n", " 2\n", " \n", "
\n", "
\n", "
\n", " \n", " André\n", " \n", "
\n", "
\n", "
\n", " \n", " Ampère\n", " \n", "
\n", "
\n", "
\n", " \n", " 423\n", " \n", "
\n", "
\n", "
\n", " \n", " piza\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 1950/07/08\n", " \n", "
\n", "
\n", "
\n", " \n", " gonna\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " NiELS\n", " \n", "
\n", "
\n", "
\n", " \n", " Böhr//((%%\n", " \n", "
\n", "
\n", "
\n", " \n", " 551\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 1990/07/09\n", " \n", "
\n", "
\n", "
\n", " \n", " give\n", " \n", "
\n", "
\n", "
\n", " \n", " 4\n", " \n", "
\n", "
\n", "
\n", " \n", " PAUL\n", " \n", "
\n", "
\n", "
\n", " \n", " dirac$\n", " \n", "
\n", "
\n", "
\n", " \n", " 521\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 1954/07/10\n", " \n", "
\n", "
\n", "
\n", " \n", " you\n", " \n", "
\n", "
\n", "
\n", " \n", " 5\n", " \n", "
\n", "
\n", "
\n", " \n", " Albert\n", " \n", "
\n", "
\n", "
\n", " \n", " Einstein\n", " \n", "
\n", "
\n", "
\n", " \n", " 634\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 1990/07/11\n", " \n", "
\n", "
\n", "
\n", " \n", " up\n", " \n", "
\n", "
\n", "
\n", " \n", " 6\n", " \n", "
\n", "
\n", "
\n", " \n", " Galileo\n", " \n", "
\n", "
\n", "
\n", " \n", " ⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI\n", " \n", "
\n", "
\n", "
\n", " \n", " 672\n", " \n", "
\n", "
\n", "
\n", " \n", " arepa\n", " \n", "
\n", "
\n", "
\n", " \n", " 5\n", " \n", "
\n", "
\n", "
\n", " \n", " 1930/08/12\n", " \n", "
\n", "
\n", "
\n", " \n", " never\n", " \n", "
\n", "
\n", "
\n", " \n", " 7\n", " \n", "
\n", "
\n", "
\n", " \n", " CaRL\n", " \n", "
\n", "
\n", "
\n", " \n", " Ga%%%uss\n", " \n", "
\n", "
\n", "
\n", " \n", " 323\n", " \n", "
\n", "
\n", "
\n", " \n", " taco\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 1970/07/13\n", " \n", "
\n", "
\n", "
\n", " \n", " gonna\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " David\n", " \n", "
\n", "
\n", "
\n", " \n", " H$$$ilbert\n", " \n", "
\n", "
\n", "
\n", " \n", " 624\n", " \n", "
\n", "
\n", "
\n", " \n", " taaaccoo\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 1950/07/14\n", " \n", "
\n", "
\n", "
\n", " \n", " let\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " Johannes\n", " \n", "
\n", "
\n", "
\n", " \n", " KEPLER\n", " \n", "
\n", "
\n", "
\n", " \n", " 735\n", " \n", "
\n", "
\n", "
\n", " \n", " taco\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 1920/04/22\n", " \n", "
\n", "
\n", "
\n", " \n", " you\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " JaMES\n", " \n", "
\n", "
\n", "
\n", " \n", " M$$ax%%well\n", " \n", "
\n", "
\n", "
\n", " \n", " 875\n", " \n", "
\n", "
\n", "
\n", " \n", " taco\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " 1923/03/12\n", " \n", "
\n", "
\n", "
\n", " \n", " down\n", " \n", "
\n", "
\n", "
\n", " \n", " 11\n", " \n", "
\n", "
\n", "
\n", " \n", " Isaac\n", " \n", "
\n", "
\n", "
\n", " \n", " Newton\n", " \n", "
\n", "
\n", "
\n", " \n", " 992\n", " \n", "
\n", "
\n", "
\n", " \n", " pasta\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " 1999/02/15\n", " \n", "
\n", "
\n", "
\n", " \n", " never⋅\n", " \n", "
\n", "
\n", "
\n", " \n", " 12\n", " \n", "
\n", "
\n", "
\n", " \n", " Emmy%%\n", " \n", "
\n", "
\n", "
\n", " \n", " Nöether$\n", " \n", "
\n", "
\n", "
\n", " \n", " 234\n", " \n", "
\n", "
\n", "
\n", " \n", " pasta\n", " \n", "
\n", "
\n", "
\n", " \n", " 9\n", " \n", "
\n", "
\n", "
\n", " \n", " 1993/12/08\n", " \n", "
\n", "
\n", "
\n", " \n", " gonna\n", " \n", "
\n", "
\n", "
\n", " \n", " 13\n", " \n", "
\n", "
\n", "
\n", " \n", " Max!!!\n", " \n", "
\n", "
\n", "
\n", " \n", " Planck!!!\n", " \n", "
\n", "
\n", "
\n", " \n", " 111\n", " \n", "
\n", "
\n", "
\n", " \n", " hamburguer\n", " \n", "
\n", "
\n", "
\n", " \n", " 4\n", " \n", "
\n", "
\n", "
\n", " \n", " 1994/01/04\n", " \n", "
\n", "
\n", "
\n", " \n", " run⋅\n", " \n", "
\n", "
\n", "
\n", " \n", " 14\n", " \n", "
\n", "
\n", "
\n", " \n", " Fred\n", " \n", "
\n", "
\n", "
\n", " \n", " Hoy&&&le\n", " \n", "
\n", "
\n", "
\n", " \n", " 553\n", " \n", "
\n", "
\n", "
\n", " \n", " pizzza\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 1997/06/27\n", " \n", "
\n", "
\n", "
\n", " \n", " around\n", " \n", "
\n", "
\n", "
\n", " \n", " 15\n", " \n", "
\n", "
\n", "
\n", " \n", " (((⋅⋅⋅Heinrich⋅)))))\n", " \n", "
\n", "
\n", "
\n", " \n", " Hertz\n", " \n", "
\n", "
\n", "
\n", " \n", " 116\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " 8\n", " \n", "
\n", "
\n", "
\n", " \n", " 1956/11/30\n", " \n", "
\n", "
\n", "
\n", " \n", " and\n", " \n", "
\n", "
\n", "
\n", " \n", " 16\n", " \n", "
\n", "
\n", "
\n", " \n", " William\n", " \n", "
\n", "
\n", "
\n", " \n", " Gilbert###\n", " \n", "
\n", "
\n", "
\n", " \n", " 886\n", " \n", "
\n", "
\n", "
\n", " \n", " BEER\n", " \n", "
\n", "
\n", "
\n", " \n", " 2\n", " \n", "
\n", "
\n", "
\n", " \n", " 1958/03/26\n", " \n", "
\n", "
\n", "
\n", " \n", " desert\n", " \n", "
\n", "
\n", "
\n", " \n", " 17\n", " \n", "
\n", "
\n", "
\n", " \n", " Marie\n", " \n", "
\n", "
\n", "
\n", " \n", " CURIE\n", " \n", "
\n", "
\n", "
\n", " \n", " 912\n", " \n", "
\n", "
\n", "
\n", " \n", " Rice\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " 2000/03/22\n", " \n", "
\n", "
\n", "
\n", " \n", " you\n", " \n", "
\n", "
\n", "
\n", " \n", " 18\n", " \n", "
\n", "
\n", "
\n", " \n", " Arthur\n", " \n", "
\n", "
\n", "
\n", " \n", " COM%%%pton\n", " \n", "
\n", "
\n", "
\n", " \n", " 812\n", " \n", "
\n", "
\n", "
\n", " \n", " 110790\n", " \n", "
\n", "
\n", "
\n", " \n", " 5\n", " \n", "
\n", "
\n", "
\n", " \n", " 1899/01/01\n", " \n", "
\n", "
\n", "
\n", " \n", " #\n", " \n", "
\n", "
\n", "
\n", " \n", " 19\n", " \n", "
\n", "
\n", "
\n", " \n", " JAMES\n", " \n", "
\n", "
\n", "
\n", " \n", " Chadwick\n", " \n", "
\n", "
\n", "
\n", " \n", " 467\n", " \n", "
\n", "
\n", "
\n", " \n", " null\n", " \n", "
\n", "
\n", "
\n", " \n", " 10\n", " \n", "
\n", "
\n", "
\n", " \n", " 1921/05/03\n", " \n", "
\n", "
\n", "
\n", " \n", " #\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 19 of 19 rows / 8 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "{'firstName': {'null': 0, 'missing': 0, 'string': 19}}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.table(20)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'count_outliers': 8, 'count_non_outliers': 11, 'max_z_score': 1.7111}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.outliers.z_score(\"price\",threshold =1).info()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'count_outliers': 0,\n", " 'count_non_outliers': 19,\n", " 'lower_bound': -4.5,\n", " 'lower_bound_count': 0,\n", " 'upper_bound': 15.5,\n", " 'upper_bound_count': 0,\n", " 'iqr1': 3,\n", " 'iqr3': 8}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.outliers.tukey(\"price\").info()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'count_outliers': 9,\n", " 'count_non_outliers': 19,\n", " 'lower_bound': 6,\n", " 'lower_bound_count': 9,\n", " 'upper_bound': 10,\n", " 'upper_bound_count': 0}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.outliers.mad(\"price\", threshold =1).info()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'count_outliers': 19, 'count_non_outliers': 19, 'max_m_z_score': 2.36075}" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.outliers.modified_z_score(\"price\",threshold =1).info()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"taaaccoo\": {\"similar\": {\"taco\": 3, \"taaaccoo\": 1}, \"count\": 2, \"sum\": 4}, \"piza\": {\"similar\": {\"pizza\": 4, \"piza\": 1}, \"count\": 2, \"sum\": 5}, \"hamburguer\": {\"similar\": {\"BEER\": 1, \"hamburguer\": 1}, \"count\": 2, \"sum\": 2}, \"taco\": {\"similar\": {\"Cake\": 1, \"Rice\": 1, \"taco\": 3}, \"count\": 3, \"sum\": 5}, \"pizzza\": {\"similar\": {\"pizza\": 4, \"pizzza\": 1}, \"count\": 2, \"sum\": 5}, \"arepa\": {\"similar\": {\"BEER\": 1, \"piza\": 1, \"pasta\": 2, \"Cake\": 1, \"Rice\": 1, \"pizza\": 4, \"arepa\": 1}, \"count\": 7, \"sum\": 11}, \"pizza\": {\"similar\": {\"piza\": 1, \"pizzza\": 1, \"pizza\": 4}, \"count\": 3, \"sum\": 6}, \"Rice\": {\"similar\": {\"piza\": 1, \"Cake\": 1, \"taco\": 3, \"Rice\": 1}, \"count\": 4, \"sum\": 6}, \"110790\": {\"similar\": {\"arepa\": 1, \"BEER\": 1, \"piza\": 1, \"pizzza\": 1, \"pasta\": 2, \"Cake\": 1, \"null\": 1, \"Rice\": 1, \"pizza\": 4, \"taco\": 3, \"110790\": 1}, \"count\": 11, \"sum\": 17}, \"BEER\": {\"similar\": {\"arepa\": 1, \"piza\": 1, \"Cake\": 1, \"null\": 1, \"Rice\": 1, \"taco\": 3, \"BEER\": 1}, \"count\": 7, \"sum\": 9}, \"Cake\": {\"similar\": {\"Rice\": 1, \"taco\": 3, \"Cake\": 1}, \"count\": 3, \"sum\": 5}, \"null\": {\"similar\": {\"BEER\": 1, \"piza\": 1, \"Cake\": 1, \"Rice\": 1, \"taco\": 3, \"null\": 1}, \"count\": 6, \"sum\": 8}, \"pasta\": {\"similar\": {\"piza\": 1, \"pizza\": 4, \"pasta\": 2}, \"count\": 3, \"sum\": 7}}\n", "Wall time: 9.6 s\n" ] } ], "source": [ "%%time\n", "from optimus.ml import distancecluster as dc\n", "print(dc.levenshtein_cluster(df,'product',output=\"json\"))" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "from optimus.ml import distancecluster as dc\n", "from optimus.ml import keycollision as kc\n", "\n", "# result = dc.levenshtein_json(df,'product')\n", "result = kc.fingerprint_cluster(df, \"product\",3)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 13 rows / 4 columns
\n", "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
count
\n", "
1 (string)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
product
\n", "
2 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
product***NGRAM
\n", "
3 (array<string>)
\n", "
\n", " \n", " not nullable\n", " \n", "
\n", "
\n", "
product***NGRAM_FINGERPRINT
\n", "
4 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " taaaccoo\n", " \n", "
\n", "
\n", "
\n", " \n", " ['taaaccoo']\n", " \n", "
\n", "
\n", "
\n", " \n", " taaaccoo\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " piza\n", " \n", "
\n", "
\n", "
\n", " \n", " ['piza']\n", " \n", "
\n", "
\n", "
\n", " \n", " piza\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " hamburguer\n", " \n", "
\n", "
\n", "
\n", " \n", " ['hamburguer']\n", " \n", "
\n", "
\n", "
\n", " \n", " hamburguer\n", " \n", "
\n", "
\n", "
\n", " \n", " 3\n", " \n", "
\n", "
\n", "
\n", " \n", " taco\n", " \n", "
\n", "
\n", "
\n", " \n", " ['taco']\n", " \n", "
\n", "
\n", "
\n", " \n", " taco\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " BEER\n", " \n", "
\n", "
\n", "
\n", " \n", " ['beer']\n", " \n", "
\n", "
\n", "
\n", " \n", " beer\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " pizzza\n", " \n", "
\n", "
\n", "
\n", " \n", " ['pizzza']\n", " \n", "
\n", "
\n", "
\n", " \n", " pizzza\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " arepa\n", " \n", "
\n", "
\n", "
\n", " \n", " ['arepa']\n", " \n", "
\n", "
\n", "
\n", " \n", " arepa\n", " \n", "
\n", "
\n", "
\n", " \n", " 4\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " ['pizza']\n", " \n", "
\n", "
\n", "
\n", " \n", " pizza\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " Rice\n", " \n", "
\n", "
\n", "
\n", " \n", " ['rice']\n", " \n", "
\n", "
\n", "
\n", " \n", " rice\n", " \n", "
\n", "
\n", "
\n", " \n", " 1\n", " \n", "
\n", "
\n", "
\n", " \n", " 110790\n", " \n", "
\n", "
\n", "
\n", " \n", " ['110790']\n", " \n", "
\n", "
\n", "
\n", " \n", " 110790\n", " \n", "
\n", "
\n", "\n", "\n", "
Viewing 10 of 13 rows / 4 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "result = kc.n_gram_fingerprint_cluster(df, \"product\",3)\n" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'taaaccoo': {'similar': ['taaaccoo'], 'count': 1, 'sum': 1.0}, 'piza': {'similar': ['piza'], 'count': 1, 'sum': 1.0}, 'hamburguer': {'similar': ['hamburguer'], 'count': 1, 'sum': 1.0}, 'taco': {'similar': ['taco'], 'count': 1, 'sum': 3.0}, 'pizzza': {'similar': ['pizzza'], 'count': 1, 'sum': 1.0}, 'arepa': {'similar': ['arepa'], 'count': 1, 'sum': 1.0}, 'pizza': {'similar': ['pizza'], 'count': 1, 'sum': 4.0}, 'Rice': {'similar': ['Rice'], 'count': 1, 'sum': 1.0}, '110790': {'similar': ['110790'], 'count': 1, 'sum': 1.0}, 'BEER': {'similar': ['BEER'], 'count': 1, 'sum': 1.0}, 'Cake': {'similar': ['Cake'], 'count': 1, 'sum': 1.0}, 'null': {'similar': ['null'], 'count': 1, 'sum': 1.0}, 'pasta': {'similar': ['pasta'], 'count': 1, 'sum': 2.0}}\n" ] } ], "source": [ "print(result)" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "str" ] }, "execution_count": 159, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(result)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['taaaccoo', 1]\n", "['piza', 1]\n", "['hamburguer', 1]\n", "['taco', 3]\n", "['BEER', 1]\n", "['pizzza', 1]\n", "['arepa', 1]\n", "['pizza', 4]\n", "['Rice', 1]\n", "['110790', 1]\n", "['Cake', 1]\n", "['null', 1]\n", "['pasta', 2]\n" ] } ], "source": [ "kv_dict ={}\n", "for row in result.collect():\n", " _row = list(row.asDict().values())\n", " print(_row)\n", " kv_dict[_row[0]] = _row[1]" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'taaaccoo': 1, 'piza': 1, 'hamburguer': 1, 'taco': 3, 'BEER': 1, 'pizzza': 1, 'arepa': 1, 'pizza': 4, 'Rice': 1, '110790': 1, 'Cake': 1, 'null': 1, 'pasta': 2}\n" ] } ], "source": [ "print(kv_dict)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'str' object has no attribute 'cols'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0ma\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"product***LEVENSHTEIN_DISTANCE\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m: 'str' object has no attribute 'cols'" ] } ], "source": [ "a.cols.replace(\"product***LEVENSHTEIN_DISTANCE\", 0, None).table()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'str' object has no attribute 'rows'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0ma\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrows\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mwhere\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"product_LEVENSHTEIN_1\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m!=\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"product_LEVENSHTEIN_2\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m&\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"product***LEVENSHTEIN_DISTANCE\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m==\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m: 'str' object has no attribute 'rows'" ] } ], "source": [ "a.rows.drop(where=((a[\"product_LEVENSHTEIN_1\"]!=a[\"product_LEVENSHTEIN_2\"])& (a[\"product***LEVENSHTEIN_DISTANCE\"]==0))).table()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 4 }