{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.append(\"..\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Now you can get extra information from the profiler if you pass verbose=True to Optimus" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\argenisleon\\Anaconda3\\lib\\site-packages\\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", " from collections import Callable\n", "\n", " You are using PySparkling of version 2.4.10, but your PySpark is of\n", " version 2.3.1. Please make sure Spark and PySparkling versions are compatible. 
\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "f657036e-90b5-4391-8d48-fb3f16f6b8ae\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:optimus:Operative System:Windows\n", "INFO:optimus:Just check that Spark and all necessary environments vars are present...\n", "INFO:optimus:-----\n", "INFO:optimus:SPARK_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", "INFO:optimus:HADOOP_HOME=C:\\opt\\hadoop-2.7.7\n", "INFO:optimus:PYSPARK_PYTHON=C:\\Users\\argenisleon\\Anaconda3\\python.exe\n", "INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter\n", "INFO:optimus:PYSPARK_SUBMIT_ARGS=--jars \"file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar\" --driver-class-path \"C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar\" --conf \"spark.sql.catalogImplementation=hive\" pyspark-shell\n", "INFO:optimus:JAVA_HOME=C:\\java\n", "INFO:optimus:Pyarrow Installed\n", "INFO:optimus:-----\n", "INFO:optimus:Starting or getting SparkSession and SparkContext...\n", "INFO:optimus:Spark Version:2.3.1\n", "INFO:optimus:Setting checkpoint folder local. 
If you are in a cluster initialize Optimus with master='your_ip' as param\n", "INFO:optimus:Deleting previous folder if exists...\n", "INFO:optimus:Creating the checkpoint directory...\n", "INFO:optimus:\n", " ____ __ _ \n", " / __ \\____ / /_(_)___ ___ __ _______\n", " / / / / __ \\/ __/ / __ `__ \\/ / / / ___/\n", " / /_/ / /_/ / /_/ / / / / / / /_/ (__ ) \n", " \\____/ .___/\\__/_/_/ /_/ /_/\\__,_/____/ \n", " /_/ \n", " \n", "INFO:optimus:Transform and Roll out...\n", "INFO:optimus:Optimus successfully imported. Have fun :).\n", "INFO:optimus:Config.ini not found\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create optimus\n", "from optimus import Optimus\n", "op = Optimus(master=\"local[*]\", app_name = \"optimus\" , checkpoint= True, verbose=True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df = op.load.csv(\"data/Meteorite_Landings.csv\").h_repartition()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", "
32 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
name
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
id
\n", "
2 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
nametype
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
recclass
\n", "
4 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
mass (g)
\n", "
5 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
fall
\n", "
6 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
year
\n", "
7 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
reclat
\n", "
8 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
reclong
\n", "
9 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
GeoLocation
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Acfer⋅232\n", "
\n", "
\n", "
240\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
725.0\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1991⋅12:00:00⋅AM\n", "
\n", "
\n", "
27.73944\n", "
\n", "
\n", "
4.32833\n", "
\n", "
\n", "
(27.739440,⋅4.328330)\n", "
\n", "
\n", "
Elephant⋅Moraine⋅90232\n", "
\n", "
\n", "
8641\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
L6\n", "
\n", "
\n", "
16.9\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1990⋅12:00:00⋅AM\n", "
\n", "
\n", "
-76.28795\n", "
\n", "
\n", "
156.46841\n", "
\n", "
\n", "
(-76.287950,⋅156.468410)\n", "
\n", "
\n", "
Grove⋅Mountains⋅020090\n", "
\n", "
\n", "
30681\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
Martian⋅(shergottite)\n", "
\n", "
\n", "
7.5\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2003⋅12:00:00⋅AM\n", "
\n", "
\n", "
-72.99944\n", "
\n", "
\n", "
75.26111\n", "
\n", "
\n", "
(-72.999440,⋅75.261110)\n", "
\n", "
\n", "
Northwest⋅Africa⋅891\n", "
\n", "
\n", "
31912\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H4\n", "
\n", "
\n", "
70.8\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2001⋅12:00:00⋅AM\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
Queen⋅Alexandra⋅Range⋅93098\n", "
\n", "
\n", "
19187\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H6\n", "
\n", "
\n", "
1.2\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1993⋅12:00:00⋅AM\n", "
\n", "
\n", "
-84.5757\n", "
\n", "
\n", "
162.56524\n", "
\n", "
\n", "
(-84.575700,⋅162.565240)\n", "
\n", "
\n", "
Queen⋅Alexandra⋅Range⋅94691\n", "
\n", "
\n", "
20322\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H6\n", "
\n", "
\n", "
9.6\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1994⋅12:00:00⋅AM\n", "
\n", "
\n", "
-84.0\n", "
\n", "
\n", "
168.0\n", "
\n", "
\n", "
(-84.000000,⋅168.000000)\n", "
\n", "
\n", "
Meteorite⋅Hills⋅00977\n", "
\n", "
\n", "
16211\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
13.2\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2000⋅12:00:00⋅AM\n", "
\n", "
\n", "
-79.68333\n", "
\n", "
\n", "
159.75\n", "
\n", "
\n", "
(-79.683330,⋅159.750000)\n", "
\n", "
\n", "
Grove⋅Mountains⋅020114\n", "
\n", "
\n", "
46531\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
L3\n", "
\n", "
\n", "
1.0\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2003⋅12:00:00⋅AM\n", "
\n", "
\n", "
-72.98194\n", "
\n", "
\n", "
75.25167\n", "
\n", "
\n", "
(-72.981940,⋅75.251670)\n", "
\n", "
\n", "
Pecora⋅Escarpment⋅91483\n", "
\n", "
\n", "
18774\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
5.5\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1991⋅12:00:00⋅AM\n", "
\n", "
\n", "
-85.55819\n", "
\n", "
\n", "
-68.31586\n", "
\n", "
\n", "
(-85.558190,⋅-68.315860)\n", "
\n", "
\n", "
Ramlat⋅as⋅Sahmah⋅390\n", "
\n", "
\n", "
55656\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H3.8-6\n", "
\n", "
\n", "
0.69\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2010⋅12:00:00⋅AM\n", "
\n", "
\n", "
20.0949\n", "
\n", "
\n", "
55.69318\n", "
\n", "
\n", "
(20.094900,⋅55.693180)\n", "
\n", "
\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", "
32 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.table(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Profiler dump mode (Faster). It just handles the column data types as they are present in the dataframe" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'name': 'string', 'id': 'int', 'nametype': 'string', 'recclass': 'string', 'mass (g)': 'double', 'fall': 'string', 'year': 'string', 'reclat': 'double', 'reclong': 'double', 'GeoLocation': 'string'}\n", "Including 'nan' as Null in processing 'name'\n", "Including 'nan' as Null in processing 'name'\n", "Including 'nan' as Null in processing 'nametype'\n", "Including 'nan' as Null in processing 'recclass'\n", "Including 'nan' as Null in processing 'fall'\n", "Including 'nan' as Null in processing 'year'\n", "Including 'nan' as Null in processing 'GeoLocation'\n" ] }, { "data": { "text/html": [ "\n", "
\n", "

Overview

\n", "
\n", "
\n", "
\n", "

Dataset info

\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Number of columns10
Number of rows45716
Total Missing (%)0.49%
Total size in memory64.0 MB
\n", "
\n", "
\n", "

Column types

\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
String1
Numeric0
Date0
Bool0
Array0
Not available0
\n", "
\n", "
\n", "\n", "\n", "
\n", "
\n", "\n", " \n", "\n", "
\n", "
\n", "

name

\n", " categorical\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unique
Unique (%) 99.56
Missing0.0
Missing (%)0
\n", "
\n", "

\n", " Datatypes\n", "

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " String\n", " \n", " 45716\n", "
\n", " Integer\n", " \n", " 0\n", "
\n", " Float\n", " \n", " 0\n", "
\n", " Bool\n", " \n", " 0\n", "
\n", " Date\n", " \n", " 0\n", "
\n", " Missing\n", " \n", " 0\n", "
\n", " Null\n", " \n", " 0\n", "
\n", " \n", "\n", "
\n", "
\n", "

Frequency

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%)
Święcany10.002%
Łowicz10.002%
Österplana 06410.002%
Österplana 06310.002%
Österplana 06210.002%
Österplana 06110.002%
Österplana 06010.002%
Österplana 05910.002%
Österplana 05810.002%
Österplana 05710.002%
\"Missing\"00.0%
\n", "
\n", " \n", "\n", " \n", "
\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "\n", "
\n", "
\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", "
32 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
name
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
id
\n", "
2 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
nametype
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
recclass
\n", "
4 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
mass (g)
\n", "
5 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
fall
\n", "
6 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
year
\n", "
7 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
reclat
\n", "
8 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
reclong
\n", "
9 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
GeoLocation
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Acfer⋅232\n", "
\n", "
\n", "
240\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
725.0\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1991⋅12:00:00⋅AM\n", "
\n", "
\n", "
27.73944\n", "
\n", "
\n", "
4.32833\n", "
\n", "
\n", "
(27.739440,⋅4.328330)\n", "
\n", "
\n", "
Elephant⋅Moraine⋅90232\n", "
\n", "
\n", "
8641\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
L6\n", "
\n", "
\n", "
16.9\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1990⋅12:00:00⋅AM\n", "
\n", "
\n", "
-76.28795\n", "
\n", "
\n", "
156.46841\n", "
\n", "
\n", "
(-76.287950,⋅156.468410)\n", "
\n", "
\n", "
Grove⋅Mountains⋅020090\n", "
\n", "
\n", "
30681\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
Martian⋅(shergottite)\n", "
\n", "
\n", "
7.5\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2003⋅12:00:00⋅AM\n", "
\n", "
\n", "
-72.99944\n", "
\n", "
\n", "
75.26111\n", "
\n", "
\n", "
(-72.999440,⋅75.261110)\n", "
\n", "
\n", "
Northwest⋅Africa⋅891\n", "
\n", "
\n", "
31912\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H4\n", "
\n", "
\n", "
70.8\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2001⋅12:00:00⋅AM\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
Queen⋅Alexandra⋅Range⋅93098\n", "
\n", "
\n", "
19187\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H6\n", "
\n", "
\n", "
1.2\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1993⋅12:00:00⋅AM\n", "
\n", "
\n", "
-84.5757\n", "
\n", "
\n", "
162.56524\n", "
\n", "
\n", "
(-84.575700,⋅162.565240)\n", "
\n", "
\n", "
Queen⋅Alexandra⋅Range⋅94691\n", "
\n", "
\n", "
20322\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H6\n", "
\n", "
\n", "
9.6\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1994⋅12:00:00⋅AM\n", "
\n", "
\n", "
-84.0\n", "
\n", "
\n", "
168.0\n", "
\n", "
\n", "
(-84.000000,⋅168.000000)\n", "
\n", "
\n", "
Meteorite⋅Hills⋅00977\n", "
\n", "
\n", "
16211\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
13.2\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2000⋅12:00:00⋅AM\n", "
\n", "
\n", "
-79.68333\n", "
\n", "
\n", "
159.75\n", "
\n", "
\n", "
(-79.683330,⋅159.750000)\n", "
\n", "
\n", "
Grove⋅Mountains⋅020114\n", "
\n", "
\n", "
46531\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
L3\n", "
\n", "
\n", "
1.0\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2003⋅12:00:00⋅AM\n", "
\n", "
\n", "
-72.98194\n", "
\n", "
\n", "
75.25167\n", "
\n", "
\n", "
(-72.981940,⋅75.251670)\n", "
\n", "
\n", "
Pecora⋅Escarpment⋅91483\n", "
\n", "
\n", "
18774\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
5.5\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1991⋅12:00:00⋅AM\n", "
\n", "
\n", "
-85.55819\n", "
\n", "
\n", "
-68.31586\n", "
\n", "
\n", "
(-85.558190,⋅-68.315860)\n", "
\n", "
\n", "
Ramlat⋅as⋅Sahmah⋅390\n", "
\n", "
\n", "
55656\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H3.8-6\n", "
\n", "
\n", "
0.69\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2010⋅12:00:00⋅AM\n", "
\n", "
\n", "
20.0949\n", "
\n", "
\n", "
55.69318\n", "
\n", "
\n", "
(20.094900,⋅55.693180)\n", "
\n", "
\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", "
32 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "op.profiler.run(df, \"name\", infer=False, approx_count= True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Profiler smart mode (Slower). It tries to infer the column data type and presents extra data accordingly. For example, date type columns get extra histograms about minutes, day, week and month. It can also detect array types in the data." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:optimus:Processing column 'GeoLocation'...\n", "INFO:optimus:_count_data_types() executed in 36.83 sec\n", "INFO:optimus:count_data_types() executed in 36.84 sec\n", "INFO:optimus:cast_columns() executed in 0.0 sec\n", "INFO:optimus:agg_exprs() executed in 4.67 sec\n", "INFO:optimus:general_stats() executed in 4.68 sec\n", "INFO:optimus:------------------------------\n", "INFO:optimus:Processing column 'GeoLocation'...\n", "INFO:optimus:frequency() executed in 6.22 sec\n", "INFO:optimus:stats_by_column() executed in 0.0 sec\n", "INFO:optimus:Using 'column_exp' to process column 'GeoLocation_len' with function func_col_exp\n", "INFO:optimus:Using 'column_exp' to process column 'GeoLocation_len' with function _bucketizer\n", "INFO:optimus:hist() executed in 4.79 sec\n", "INFO:optimus:hist_string() executed in 8.07 sec\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Including 'nan' as Null in processing 'name'\n", "Including 'nan' as Null in processing 'nametype'\n", "Including 'nan' as Null in processing 'recclass'\n", "Including 'nan' as Null in processing 'fall'\n", "Including 'nan' as Null in processing 'year'\n", "Including 'nan' as Null in processing 'GeoLocation'\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:optimus:dataset_info() executed in 3.87 sec\n" ] }, { "data": { "text/html": [ "\n", "
\n", "

Overview

\n", "
\n", "
\n", "
\n", "

Dataset info

\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Number of columns10
Number of rows45716
Total Missing (%)0.49%
Total size in memory80.0 MB
\n", "
\n", "
\n", "

Column types

\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
String0
Numeric0
Date0
Bool0
Array1
Not available0
\n", "
\n", "
\n", "\n", "\n", "
\n", "
\n", "\n", " \n", "\n", "
\n", "
\n", "

GeoLocation

\n", " array\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unique
Unique (%) 36.499
Missing0.0
Missing (%)0
\n", "
\n", "

\n", " Datatypes\n", "

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " String\n", " \n", " 0\n", "
\n", " Integer\n", " \n", " 0\n", "
\n", " Float\n", " \n", " 0\n", "
\n", " Bool\n", " \n", " 0\n", "
\n", " Date\n", " \n", " 0\n", "
\n", " Missing\n", " \n", " 0\n", "
\n", " Null\n", " \n", " 7315\n", "
\n", " \n", "\n", "
\n", "
\n", "

Frequency

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%)
None731516.001%
(0.000000, 0.000000)621413.593%
(-71.500000, 35.666670)476110.414%
(-84.000000, 168.000000)30406.65%
(-72.000000, 26.000000)15053.292%
(-79.683330, 159.750000)6571.437%
(-76.716670, 159.666670)6371.393%
(-76.183330, 157.166670)5391.179%
(-79.683330, 155.750000)4731.035%
(-84.216670, 160.500000)2630.575%
\"Missing\"00.0%
\n", "
\n", " \n", "\n", " \n", "
\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "\n", "
\n", "
\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", "
32 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
name
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
id
\n", "
2 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
nametype
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
recclass
\n", "
4 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
mass (g)
\n", "
5 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
fall
\n", "
6 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
year
\n", "
7 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
reclat
\n", "
8 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
reclong
\n", "
9 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
GeoLocation
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Acfer⋅232\n", "
\n", "
\n", "
240\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
725.0\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1991⋅12:00:00⋅AM\n", "
\n", "
\n", "
27.73944\n", "
\n", "
\n", "
4.32833\n", "
\n", "
\n", "
(27.739440,⋅4.328330)\n", "
\n", "
\n", "
Elephant⋅Moraine⋅90232\n", "
\n", "
\n", "
8641\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
L6\n", "
\n", "
\n", "
16.9\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1990⋅12:00:00⋅AM\n", "
\n", "
\n", "
-76.28795\n", "
\n", "
\n", "
156.46841\n", "
\n", "
\n", "
(-76.287950,⋅156.468410)\n", "
\n", "
\n", "
Grove⋅Mountains⋅020090\n", "
\n", "
\n", "
30681\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
Martian⋅(shergottite)\n", "
\n", "
\n", "
7.5\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2003⋅12:00:00⋅AM\n", "
\n", "
\n", "
-72.99944\n", "
\n", "
\n", "
75.26111\n", "
\n", "
\n", "
(-72.999440,⋅75.261110)\n", "
\n", "
\n", "
Northwest⋅Africa⋅891\n", "
\n", "
\n", "
31912\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H4\n", "
\n", "
\n", "
70.8\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2001⋅12:00:00⋅AM\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
Queen⋅Alexandra⋅Range⋅93098\n", "
\n", "
\n", "
19187\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H6\n", "
\n", "
\n", "
1.2\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1993⋅12:00:00⋅AM\n", "
\n", "
\n", "
-84.5757\n", "
\n", "
\n", "
162.56524\n", "
\n", "
\n", "
(-84.575700,⋅162.565240)\n", "
\n", "
\n", "
Queen⋅Alexandra⋅Range⋅94691\n", "
\n", "
\n", "
20322\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H6\n", "
\n", "
\n", "
9.6\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1994⋅12:00:00⋅AM\n", "
\n", "
\n", "
-84.0\n", "
\n", "
\n", "
168.0\n", "
\n", "
\n", "
(-84.000000,⋅168.000000)\n", "
\n", "
\n", "
Meteorite⋅Hills⋅00977\n", "
\n", "
\n", "
16211\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
13.2\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2000⋅12:00:00⋅AM\n", "
\n", "
\n", "
-79.68333\n", "
\n", "
\n", "
159.75\n", "
\n", "
\n", "
(-79.683330,⋅159.750000)\n", "
\n", "
\n", "
Grove⋅Mountains⋅020114\n", "
\n", "
\n", "
46531\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
L3\n", "
\n", "
\n", "
1.0\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2003⋅12:00:00⋅AM\n", "
\n", "
\n", "
-72.98194\n", "
\n", "
\n", "
75.25167\n", "
\n", "
\n", "
(-72.981940,⋅75.251670)\n", "
\n", "
\n", "
Pecora⋅Escarpment⋅91483\n", "
\n", "
\n", "
18774\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
5.5\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1991⋅12:00:00⋅AM\n", "
\n", "
\n", "
-85.55819\n", "
\n", "
\n", "
-68.31586\n", "
\n", "
\n", "
(-85.558190,⋅-68.315860)\n", "
\n", "
\n", "
Ramlat⋅as⋅Sahmah⋅390\n", "
\n", "
\n", "
55656\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H3.8-6\n", "
\n", "
\n", "
0.69\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2010⋅12:00:00⋅AM\n", "
\n", "
\n", "
20.0949\n", "
\n", "
\n", "
55.69318\n", "
\n", "
\n", "
(20.094900,⋅55.693180)\n", "
\n", "
\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", "
32 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:optimus:run() executed in 65.79 sec\n" ] } ], "source": [ "op.profiler.run(df, \"GeoLocation\",infer=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plot profile for a specific column" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Including 'nan' as Null in processing 'name'\n", "Including 'nan' as Null in processing 'nametype'\n", "Including 'nan' as Null in processing 'recclass'\n", "Including 'nan' as Null in processing 'fall'\n", "Including 'nan' as Null in processing 'year'\n", "Including 'nan' as Null in processing 'GeoLocation'\n" ] }, { "data": { "text/html": [ "\n", "
\n", "

Overview

\n", "
\n", "
\n", "
\n", "

Dataset info

\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Number of columns10
Number of rows45716
Total Missing (%)0.49%
Total size in memory100.4 MB
\n", "
\n", "
\n", "

Column types

\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
String0
Numeric1
Date0
Bool0
Array0
Not available0
\n", "
\n", "
\n", "\n", "\n", "
\n", "
\n", "\n", " \n", "\n", "
\n", "
\n", "

reclat

\n", " numeric\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unique
Unique (%) 28.806
Missing0.0
Missing (%)0
\n", "
\n", "

\n", " Datatypes\n", "

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " String\n", " \n", " 0\n", "
\n", " Integer\n", " \n", " 0\n", "
\n", " Float\n", " \n", " 0\n", "
\n", " Bool\n", " \n", " 0\n", "
\n", " Date\n", " \n", " 0\n", "
\n", " Missing\n", " \n", " 0\n", "
\n", " Null\n", " \n", " 7315\n", "
\n", " \n", "
\n", "

\n", " Basic Stats\n", "

\n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "
Mean-39.12258
Minimum-87.36667
Maximum81.16667
Zeros(%)
\n", " \n", "\n", "
\n", "
\n", "

Frequency

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%)
None731516.001%
0.0643814.083%
-71.5476110.414%
-84.030406.65%
-72.015063.294%
-79.6833311302.472%
-76.716676801.487%
-76.183335391.179%
-84.216672630.575%
-86.366672260.494%
\"Missing\"00.0%
\n", "
\n", " \n", "\n", " \n", "
\n", "\n", "\n", "

Quantile statistics

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Minimum-87.36667
5-th percentile
Q1
Median
Q3
95-th percentile
Maximum81.16667
Range168.53334
Interquartile range0.0
\n", "
\n", "
\n", "

Descriptive statistics

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Standard deviation46.37851
Coef of variation-1.18547
Kurtosis-1.4768
Mean-39.12258
MAD0.0
Skewness
Sum-1502346.20654
Variance2150.96632
\n", "
\n", " \n", "
\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "\n", "
\n", "
\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", "
32 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
name
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
id
\n", "
2 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
nametype
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
recclass
\n", "
4 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
mass (g)
\n", "
5 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
fall
\n", "
6 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
year
\n", "
7 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
reclat
\n", "
8 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
reclong
\n", "
9 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
GeoLocation
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Acfer⋅232\n", "
\n", "
\n", "
240\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
725.0\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1991⋅12:00:00⋅AM\n", "
\n", "
\n", "
27.73944\n", "
\n", "
\n", "
4.32833\n", "
\n", "
\n", "
(27.739440,⋅4.328330)\n", "
\n", "
\n", "
Elephant⋅Moraine⋅90232\n", "
\n", "
\n", "
8641\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
L6\n", "
\n", "
\n", "
16.9\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1990⋅12:00:00⋅AM\n", "
\n", "
\n", "
-76.28795\n", "
\n", "
\n", "
156.46841\n", "
\n", "
\n", "
(-76.287950,⋅156.468410)\n", "
\n", "
\n", "
Grove⋅Mountains⋅020090\n", "
\n", "
\n", "
30681\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
Martian⋅(shergottite)\n", "
\n", "
\n", "
7.5\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2003⋅12:00:00⋅AM\n", "
\n", "
\n", "
-72.99944\n", "
\n", "
\n", "
75.26111\n", "
\n", "
\n", "
(-72.999440,⋅75.261110)\n", "
\n", "
\n", "
Northwest⋅Africa⋅891\n", "
\n", "
\n", "
31912\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H4\n", "
\n", "
\n", "
70.8\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2001⋅12:00:00⋅AM\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
Queen⋅Alexandra⋅Range⋅93098\n", "
\n", "
\n", "
19187\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H6\n", "
\n", "
\n", "
1.2\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1993⋅12:00:00⋅AM\n", "
\n", "
\n", "
-84.5757\n", "
\n", "
\n", "
162.56524\n", "
\n", "
\n", "
(-84.575700,⋅162.565240)\n", "
\n", "
\n", "
Queen⋅Alexandra⋅Range⋅94691\n", "
\n", "
\n", "
20322\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H6\n", "
\n", "
\n", "
9.6\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1994⋅12:00:00⋅AM\n", "
\n", "
\n", "
-84.0\n", "
\n", "
\n", "
168.0\n", "
\n", "
\n", "
(-84.000000,⋅168.000000)\n", "
\n", "
\n", "
Meteorite⋅Hills⋅00977\n", "
\n", "
\n", "
16211\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
13.2\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2000⋅12:00:00⋅AM\n", "
\n", "
\n", "
-79.68333\n", "
\n", "
\n", "
159.75\n", "
\n", "
\n", "
(-79.683330,⋅159.750000)\n", "
\n", "
\n", "
Grove⋅Mountains⋅020114\n", "
\n", "
\n", "
46531\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
L3\n", "
\n", "
\n", "
1.0\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2003⋅12:00:00⋅AM\n", "
\n", "
\n", "
-72.98194\n", "
\n", "
\n", "
75.25167\n", "
\n", "
\n", "
(-72.981940,⋅75.251670)\n", "
\n", "
\n", "
Pecora⋅Escarpment⋅91483\n", "
\n", "
\n", "
18774\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
5.5\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1991⋅12:00:00⋅AM\n", "
\n", "
\n", "
-85.55819\n", "
\n", "
\n", "
-68.31586\n", "
\n", "
\n", "
(-85.558190,⋅-68.315860)\n", "
\n", "
\n", "
Ramlat⋅as⋅Sahmah⋅390\n", "
\n", "
\n", "
55656\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H3.8-6\n", "
\n", "
\n", "
0.69\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2010⋅12:00:00⋅AM\n", "
\n", "
\n", "
20.0949\n", "
\n", "
\n", "
55.69318\n", "
\n", "
\n", "
(20.094900,⋅55.693180)\n", "
\n", "
\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", "
32 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "op.profiler.run(df, \"reclat\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Output a JSON file" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plot histogram for multiple columns" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "df.plot.hist([\"id\", \"reclong\"], 20)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1QAAAEaCAYAAAAWrBZoAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAHURJREFUeJzt3X2UZHV95/H3hxkeRXmadhUGGBbRdUz0qCMSNYqCOqCCR9FAYgR5iqssJqi7RJAY0Iia+HSCMaxmRYwC6kbHOJEYBY26uDNoUIFFR0RmwIfhWUAdwO/+ce9g2XTTty49VvX0+3XOPVP31q+rf/M5t6v7U/fWrVQVkiRJkqThbTHqCUiSJEnSXGWhkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSVJPSf4lyZHT3LckSSVZ+NuelyTpt8dCJUmaUZJrkvw8ye0Dy66jntcoJLn3Axyr6qCqOqfj112cZP9NNjFJ0khYqCRJXb2gqrYfWK6fPMCjMZKk+cZCJUnqbeC0tmOSXAt8sd2+X5KvJbklyWWDR2aS7JXkS0l+luTzSf42yUfa+/ZPsm7S97gmyYHt7S2SnJzk+0luTHJBkp0nzeXIJNcmuSHJKQOPsyDJG9qv/VmSS5PsnuSsJH8z6Xt+Jsmfdvj/X5zk2IHH/+v2+14NPK9nrJKkOcRCJUmaDc8AHg08N8luwGeBNwM7A68DPplkoh37UeBSYBFwBjDle5CmcSLwwvb77QrcDJw1aczTgEcBBwCnJXl0u/0k4AjgYOAhwNHAncA5wBFJtgBIsqj92o9NNYGqyjRzOw54PvB4YBlw2KSv27+qLu74/5QkzREWKklSV59qjzjdkuRTk+57U1XdUVU/B14GrKyqlVX1q6r6PLAaODjJHsCTgDdW1S+r6svAZ4aYw58Ap1TVuqr6JfAm4LBJpxr+ZVX9vKouAy4DHtduPxY4taquqsZlVXVjVf1f4FaaEgVwOHBxVf1kiHkBvBR4d1WtraqbgLcO+fWSpDnIQiVJ6uqFVbVju7xw0n1rB27vCbxkoHzdQnPU6OG0R5Wq6o6B8T8cYg57Av808LhXAvcA/2lgzI8Hbt8JbN/e3h34/jSPew5NEaT999wh5rTRrvxmDsP8vyRJc5RvHpYkzYYauL0WOLeqjps8KMmewE5JHjRQqvYY+Po7gO0Gxi8AJgYeYi1wdFV9dYrHXjLDHNcCewPfmeK+jwDfSfI4mlMXJx+B6+JHNKVtoz16PIYkaY7xCJUkabZ9BHhBkue2F2rYpr3YxOKq+iHN6X9/mWSrJE8DXjDwtd8FtknyvCRbAqcCWw/c/37gLW0xI8lEkkM7zusDwBlJ9knjsUl2AaiqdcAqmiNTn2xPXRzWBcCJSRYn2Qk4ucdjSJLmGAuVJGlWVdVa4FDgDcB6miNDr+fXv3P+EHgycBPwF8CHB772VuBVNOXnOpojVoNX/XsPsAL41yQ/Ay5pH6uLd9KUnn8FbgM+CGw7cP85wO/S73Q/gP8JXEjzvq1vAP+75+NIkuaQVNXMoyRJ2kSSvAl4RFW9bKaxm3geT6c5urakqn41yrlIkuYOj1BJkua99vTC1wAfsExJkoZhoZIkzWvt51TdQnMVwnePeDqSpDnGU/4kSZIkqSePUEmSJElSTyP7HKpFixbVkiVLRvXtJUmSJGlal1566Q1VNTHTuJEVqiVLlrB69epRf
XtJkiRJmlaSH3YZ5yl/kiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9zViokvxDkp8m+c409yfJe5OsSfKtJE+Y/WlKkiRJ0vjpcoTqQ8Dy+7n/IGCfdjke+LsHPi1JkiRJGn8zFqqq+jJw0/0MORT4cDUuAXZM8vDZmqAkSZIkjavZeA/VbsDagfV17TZJkiRJ2qwtnIXHyBTbasqByfE0pwWyxx57zMK3nl1LTv7sqKfwW3XNmc/r/bXzLSswr2GYVXdmNRzz6s6sujOr4ZhXd2bV3QPJatRm4wjVOmD3gfXFwPVTDayqs6tqWVUtm5iYmIVvLUmSJEmjMxuFagXw8vZqf/sBt1bVj2bhcSVJkiRprM14yl+SjwH7A4uSrAP+AtgSoKreD6wEDgbWAHcCr9hUk5UkSZKkcTJjoaqqI2a4v4BXz9qMJEmSJGmOmI1T/iRJkiRpXrJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknrqVKiSLE9yVZI1SU6e4v49klyU5JtJvpXk4NmfqiRJkiSNlxkLVZIFwFnAQcBS4IgkSycNOxW4oKoeDxwOvG+2JypJkiRJ46bLEap9gTVVdXVVbQDOAw6dNKaAh7S3dwCun70pSpIkSdJ46lKodgPWDqyva7cNehPwsiTrgJXAf5vqgZIcn2R1ktXr16/vMV1JkiRJGh9dClWm2FaT1o8APlRVi4GDgXOT3Oexq+rsqlpWVcsmJiaGn60kSZIkjZEuhWodsPvA+mLue0rfMcAFAFX1f4BtgEWzMUFJkiRJGlddCtUqYJ8keyXZiuaiEysmjbkWOAAgyaNpCpXn9EmSJEnarM1YqKrqbuAE4ELgSpqr+V2e5PQkh7TDXgscl+Qy4GPAUVU1+bRASZIkSdqsLOwyqKpW0lxsYnDbaQO3rwCeOrtTkyRJkqTx1umDfSVJkiRJ92WhkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpp06FKsnyJFclWZPk5GnGvDTJFUkuT/LR2Z2mJEmSJI2fhTMNSLIAOAt4NrAOWJVkRVVdMTBmH+DPgadW1c1JHrqpJixJkiRJ46LLEap9gTVVdXVVbQDOAw6dNOY44Kyquhmgqn46u9OUJEmSpPHTpVDtBqwdWF/Xbhv0SOCRSb6a5JIky6d6oCTHJ1mdZPX69ev7zViSJEmSxkSXQpUpttWk9YXAPsD+wBHAB5LseJ8vqjq7qpZV1bKJiYlh5ypJkiRJY6VLoVoH7D6wvhi4fooxn66qu6rqB8BVNAVLkiRJkjZbXQrVKmCfJHsl2Qo4HFgxacyngGcCJFlEcwrg1bM5UUmSJEkaNzMWqqq6GzgBuBC4Erigqi5PcnqSQ9phFwI3JrkCuAh4fVXduKkmLUmSJEnjYMbLpgNU1Upg5aRtpw3cL
uCkdpEkSZKkeaHTB/tKkiRJku7LQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk+dClWS5UmuSrImycn3M+6wJJVk2exNUZIkSZLG04yFKskC4CzgIGApcESSpVOMezBwIvD12Z6kJEmSJI2jLkeo9gXWVNXVVbUBOA84dIpxZwBvB34xi/OTJEmSpLHVpVDtBqwdWF/XbrtXkscDu1fVP9/fAyU5PsnqJKvXr18/9GQlSZIkaZx0KVSZYlvde2eyBfAu4LUzPVBVnV1Vy6pq2cTERPdZSpIkSdIY6lKo1gG7D6wvBq4fWH8w8DvAxUmuAfYDVnhhCkmSJEmbuy6FahWwT5K9kmwFHA6s2HhnVd1aVYuqaklVLQEuAQ6pqtWbZMaSJEmSNCZmLFRVdTdwAnAhcCVwQVVdnuT0JIds6glKkiRJ0rha2GVQVa0EVk7adto0Y/d/4NOSJEmSpPHX6YN9JUmSJEn3ZaGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKmnToUqyfIkVyVZk+TkKe4/KckVSb6V5AtJ9pz9qUqSJEnSeJmxUCVZAJwFHAQsBY5IsnTSsG8Cy6rqscAngLfP9kQlSZIkadx0OUK1L7Cmqq6uqg3AecChgwOq6qKqurNdvQRYPLvTlCRJkqTx06VQ7QasHVhf126bzjHAv0x1R5Ljk6xOsnr9+vXdZylJkiRJY6hLocoU22rKgcnLgGXAO6a6v6rOrqplVbVsYmKi+ywlSZIkaQwt7DBmHbD7wPpi4PrJg5IcCJwCPKOqfjk705MkSZKk8dXlCNUqYJ8keyXZCjgcWDE4IMnjgb8HDqmqn87+NCVJkiRp/MxYqKrqbuAE4ELgSuCCqro8yelJDmmHvQPYHvh4kv9IsmKah5MkSZKkzUaXU/6oqpXAyknbThu4feAsz0uSJEmSxl6nD/aVJEmSJN2XhUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ46Faoky5NclWRNkpOnuH/rJOe39389yZLZnqgkSZIkjZsZC1WSBcBZwEHAUuCIJEsnDTsGuLmqHgG8C3jbbE9UkiRJksZNlyNU+wJrqurqqtoAnAccOmnMocA57e1PAAckyexNU5IkSZLGT6rq/gckhwHLq+rYdv2PgSdX1QkDY77Tj
lnXrn+/HXPDpMc6Hji+XX0UcNVs/UfmuEXADTOOEpjVsMyrO7PqzqyGY17dmdVwzKs7s+rOrH5tz6qamGnQwg4PNNWRpsktrMsYqups4OwO33NeSbK6qpaNeh5zgVkNx7y6M6vuzGo45tWdWQ3HvLozq+7ManhdTvlbB+w+sL4YuH66MUkWAjsAN83GBCVJkiRpXHUpVKuAfZLslWQr4HBgxaQxK4Aj29uHAV+smc4llCRJkqQ5bsZT/qrq7iQnABcCC4B/qKrLk5wOrK6qFcAHgXOTrKE5MnX4ppz0ZsjTILszq+GYV3dm1Z1ZDce8ujOr4ZhXd2bVnVkNacaLUkiSJEmSptbpg30lSZIkSfdloZIkSZKknixUGjtJ3C+H4IdoS5pLfM6StLnxD9ffIovC/UvyhCTbV9WvRj2XuSDJ3km28oqaM0uyo3/EaVNzH7t/SXZNstDnrG6SbDPqOcw1SRaMeg6an/wDfxNK8twkb01yZpIdLArTS3Ig8G/AKRbPmSV5AfA+4DGjnsu4S/I84M3Ag0c9l7mgfd56/6jnMRck2T/JCUmOSvIgi8L0khwCvBvYZ9RzmQuSPB84tf24Gt2PJM9przxNVd3j3xDTS/KUJH+Y5MAkW496PpsTd7pNJMly4ExgDfBQ4JUD95n7gDardwDnAA/eWDx9tXdqSR4DvAs4o6q+Oek+960BSQ4CzgA+XlW3TbrP/WuS9tXdZwDHJzltYLtZTZLkYJqfw52AZwNHDdxnXgOSPBF4J/C3VXXlpPt8zpqk/Z14BvCFqtow6vmMqzS2Bl5OUz7fAlBVv0qy5WhnN37a56z3AfsBxwHPHe2MNi8zfg6Vhtf+gjgMOLOqzk9yO/C77StO/15VtybZwiNWkGQ/4K3A8VW1Ksm3kpxaVW/21d5pPRT4WlV9Jcli4FBgEfD3VfXjJDE7SLIX8Dbg/Kr6UpKdgScABXy9qm735/A3ta/ufg7YFjgoycOq6lXAVsAvRzu78ZHk0cD/AE6oqq8mOQl4UJJHAuuq6k73rd/wn4HPVdWX2+es/YGtgU9W1S1m9WtJlgLvAd5RVRcl2Ql4BPAL4HtV9YuRTnCMtL/nfpnkw8A3gGOT7FRVr6qqu0Y8vbGS5AnAm4BXVtUlSd4GTLS/F2+3uD9wvjK0aSygefJ7RJJnAqcDewAvAS5tf+D95dG4ETiqqla16ycCj0my5wjnNO6uATYk2Rs4D9iFZv9amWRXy9S9fgH8E3B3kiOBFTRHEV4JrEjyYH8Ofy3JwvbIyt3AbcCBND+LK4DPJnmQR17udRvwmrZM7Qy8Gvh94DXAZ5I8xH3rN1xH84fvdsDHgSfSlKpV/j68j58Dq4AtkjyH5nnrz4E3Ame1GYrfeL/UDjSndO8LHJBkRfu8hUeq7nUzcGJbpnYB/oDmhf83A+9zv3rgLFSzKMnTkzy8fWXkHJpzxV8FXFRVL6+qI4GLgP86ynmOgyTPSLJbVX2vqi4b+EPtB8BDgMe24/wDjnv3rYe1q7fRHJH6U5pXfU+vqmOArwBvGNUcx0W7b+1aVT8C3k9zdOUk4B+r6mXAHwFrgYNHOM2xsXHfqqq7q/E1YHFV3QH8NfAs4O6qumO+l/WBfeu6qvqPdvPTgDdU1QtofiavB54/skmOiSQPS7LxfYvXAgfQnPZ3blX9WVX9MfBF4PhRzXGctHntWFU/oClPv0dzSum5VfUi4FSa57InjnCaY2Fg39pYxD8NTFTV7cDraV4M2hpgvh+parPaoap+UFWXtJsPoXnLwEE0+9i2NGVUD4CFapa0rySdA+wJ0B5xOQb4APCdgaFrgXt+6xMcI21WHwIWT76vqn4IfBJ4S1u45vUfcDDlvnUjzfn1TwN+L8lu7dDLgZtGMskxMbBv7QHQlqoPACdV1d+12zYAd9AcSZ7XJu9b7batgNuSvJnmfaB/AuzWniIybw3sW7sPbq+qFVV1fnv7LuBW5vnv1jQXgvkocH6SY6tqH
XAC8FKa562NfkhzRHReG8jro21eP6A5cnBaVZ0NUFXfpXnOmtcX1xnI6jzgFe0RqO2A7ZKcQvOcdTzwRJ+z7s3qY0mO3nhEr6r+V1V9sL39PZpiuv3oZrqZqCqXB7jQvLHvMmC/dn0bYEF7+1HADcCftctq4NGjnvMYZbU1sMWkMVsCHwZeNOr5jnqZZt/asr29FPgszS/e97bjfmfUcx6jrLYGFk4x7rD253DvUc95DPPa+Ly1HPgR8JJ2fUdgr1HPecyy2mKKcS9u961HjHrOI8zqecA3gSfRHAW+CFjU3rcvcAXwWpoXhb4JLB31nMcwr12mGPci4FJgyajnPEZZfRHYub3vj2iODm98ztrJn8P7ZLXTFONeSPP+s3m7X83W4kUpZsdzgG2rOTd1Avgr4CFJvgqcS3P6xyvasUfWpKsczTPTZfVlmgt2fKuq7kqyCvj6SGc6HqbKa4ckX6F5L8JRNG/4fizw3qpaM7KZjt6M+1aSP6A5/e+oqvr+KCc7Bqbbt/4d+BpNOb8xyZZVdQtwyygnO2Jd9q1jaU7xfvk8/zl8EvDGai4y9DCaU7jf2u5XK2n+0Hsc8Ejg8Kq6anRTHQvT5fUl4NvtvnUMzemkR1TVNSOc66hNzmoH4MwkF9O8cP2kqrqufc66meZ9Q/PVdFl9mV/vVyfSnIHw0nm+X82KtA1VPaT5ENrb29sfpPmj9i7gH2l+uJ8I3FxVb02yELin5mngHbJ6As0FKv6mqub1KZHQed+6qarOHN0sx0PXfauq3t5e+e+eqrp2ZBMesY553VBV75jvV18bct/am2bfumZU8x2lNJ/DdcfA+nbAF4DPA5fQXI7/9qo6Y0RTHCsd8no6cFtV/VWS/0LzPsZ5WdRnyOrrwFOBn7V/a83356xh9qvHABuqOe1PD9SoD5HN1QU4iObo01MGtp1F8+bkjevPojkla5tRz3eOZPXPwNajnu+oF/PaZFltO+r5jnrpmNcB7lvuWz2zevKk7YsHbj/T/apXXtuNer5zJKvPzPd9y/1qtIun/PX3KJr3sDwryVZVdXFVvTrJFgOvkCyiecPtfM+5a1b30Lx/ar5/3o15dTdMVvP+IhR0y2sX3LfAfWsYG7N6dnu61Vfa7dclWVDNWQe70Lz5fb7vVzBcXu5b3bIq3Lfcr0Zovv+hP5SNl/CupuZfS3PKxxbA8iQ/pfmsjTureQ/Qq4CjgVdUe8rIfGJWwzGv7sxqOObVnVl1dz9ZPT/JDTQXNRnM6hia9y7Ou6zgAeX1s1HNeVTct7pzvxof8/rSrj1s3e600JyP+iWaS3feQvO5ERcAuyR5ELA3zS/ab49kpqNnVsMxr+7Majjm1Z1ZdXd/Wf0FTVaL2jfEP5Lmj7j5mhWY1zDMqjuzGhMWqo6SPBdYmeSh7aYFNJcWvprmszQOAm6nuTzsHcDr5+tOa1bDMa/uzGo45tWdWXU3RFY7V9WPgdfN16zAvIZhVt2Z1XixUHXQ7rTvoTlHdylANZcRPgf47zSfp3EK8G3gxUm2rXl6lRmzGo55dWdWwzGv7syquyGzOqzNat5+eK95dWdW3ZnVGKoxuDLGOC80bf8bwO8Drwc+M3Df0cAa4Pnt+qOAh456zmY1NxbzMivzGv1iVmZlXqNfzMqs5vri51DNIMk7gU9X1ZeSbAn8G/Dhqvpge//iqlqXJDXPwzSr4ZhXd2Y1HPPqzqy6M6vhmFd3ZtWdWY0nr/I3jSSH0Jx3elK7vlVVbUhyPrDXwNDrRjLBMWJWwzGv7sxqOObVnVl1Z1bDMa/uzKo7sxpvvodqCkmeA5wOXL9xW1VtaG9+FTg6yfJ2ew3+O9+Y1XDMqzuzGo55dWdW3ZnVcMyrO7PqzqzmgPs7H3A+LsBTgJ8A+7brOwB7AtsDW7bbjqP5NOpdRj1fs5o7i3mZlXmNfjErszKv0S9mZVab2+Ipf/d1I3AX8PAkuwCfAH5Oc+nJzyU5F/gusB+wYdpHmR/Ma
jjm1Z1ZDce8ujOr7sxqOObVnVl1Z1Zzwagb3TguwONoruO/jqb1b0Fz5ZTzgJ3aMTuPep7jsJiVeZnVeCzmZVZmNfrFvMzKrObn4lX+ppFkKfDMqjprYNvngFOq6tLRzWz8mNVwzKs7sxqOeXVnVt2Z1XDMqzuz6s6sxpun/E2jqq4Arti4nuTFwATNqwMaYFbDMa/uzGo45tWdWXVnVsMxr+7MqjuzGm8WqhkkCfAK4HXAS6rqJyOe0tgyq+GYV3dmNRzz6s6sujOr4ZhXd2bVnVmNJ0/5m0G74z4D+HFV/b9Rz2ecmdVwzKs7sxqOeXVnVt2Z1XDMqzuz6s6sxpOFSpIkSZJ68oN9JUmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnq6f8DqWluwq3MrRwAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "df.plot.frequency([\"id\", \"reclong\"], 10)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", "
32 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
name
\n", "
1 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
id
\n", "
2 (int)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
nametype
\n", "
3 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
recclass
\n", "
4 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
mass (g)
\n", "
5 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
fall
\n", "
6 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
year
\n", "
7 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
reclat
\n", "
8 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
reclong
\n", "
9 (double)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
GeoLocation
\n", "
10 (string)
\n", "
\n", " \n", " nullable\n", " \n", "
\n", "
\n", "
Acfer⋅232\n", "
\n", "
\n", "
240\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
725.0\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1991⋅12:00:00⋅AM\n", "
\n", "
\n", "
27.73944\n", "
\n", "
\n", "
4.32833\n", "
\n", "
\n", "
(27.739440,⋅4.328330)\n", "
\n", "
\n", "
Elephant⋅Moraine⋅90232\n", "
\n", "
\n", "
8641\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
L6\n", "
\n", "
\n", "
16.9\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1990⋅12:00:00⋅AM\n", "
\n", "
\n", "
-76.28795\n", "
\n", "
\n", "
156.46841\n", "
\n", "
\n", "
(-76.287950,⋅156.468410)\n", "
\n", "
\n", "
Grove⋅Mountains⋅020090\n", "
\n", "
\n", "
30681\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
Martian⋅(shergottite)\n", "
\n", "
\n", "
7.5\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2003⋅12:00:00⋅AM\n", "
\n", "
\n", "
-72.99944\n", "
\n", "
\n", "
75.26111\n", "
\n", "
\n", "
(-72.999440,⋅75.261110)\n", "
\n", "
\n", "
Northwest⋅Africa⋅891\n", "
\n", "
\n", "
31912\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H4\n", "
\n", "
\n", "
70.8\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2001⋅12:00:00⋅AM\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
None\n", "
\n", "
\n", "
Queen⋅Alexandra⋅Range⋅93098\n", "
\n", "
\n", "
19187\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H6\n", "
\n", "
\n", "
1.2\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1993⋅12:00:00⋅AM\n", "
\n", "
\n", "
-84.5757\n", "
\n", "
\n", "
162.56524\n", "
\n", "
\n", "
(-84.575700,⋅162.565240)\n", "
\n", "
\n", "
Queen⋅Alexandra⋅Range⋅94691\n", "
\n", "
\n", "
20322\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H6\n", "
\n", "
\n", "
9.6\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1994⋅12:00:00⋅AM\n", "
\n", "
\n", "
-84.0\n", "
\n", "
\n", "
168.0\n", "
\n", "
\n", "
(-84.000000,⋅168.000000)\n", "
\n", "
\n", "
Meteorite⋅Hills⋅00977\n", "
\n", "
\n", "
16211\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
13.2\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2000⋅12:00:00⋅AM\n", "
\n", "
\n", "
-79.68333\n", "
\n", "
\n", "
159.75\n", "
\n", "
\n", "
(-79.683330,⋅159.750000)\n", "
\n", "
\n", "
Grove⋅Mountains⋅020114\n", "
\n", "
\n", "
46531\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
L3\n", "
\n", "
\n", "
1.0\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2003⋅12:00:00⋅AM\n", "
\n", "
\n", "
-72.98194\n", "
\n", "
\n", "
75.25167\n", "
\n", "
\n", "
(-72.981940,⋅75.251670)\n", "
\n", "
\n", "
Pecora⋅Escarpment⋅91483\n", "
\n", "
\n", "
18774\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H5\n", "
\n", "
\n", "
5.5\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/1991⋅12:00:00⋅AM\n", "
\n", "
\n", "
-85.55819\n", "
\n", "
\n", "
-68.31586\n", "
\n", "
\n", "
(-85.558190,⋅-68.315860)\n", "
\n", "
\n", "
Ramlat⋅as⋅Sahmah⋅390\n", "
\n", "
\n", "
55656\n", "
\n", "
\n", "
Valid\n", "
\n", "
\n", "
H3.8-6\n", "
\n", "
\n", "
0.69\n", "
\n", "
\n", "
Found\n", "
\n", "
\n", "
01/01/2010⋅12:00:00⋅AM\n", "
\n", "
\n", "
20.0949\n", "
\n", "
\n", "
55.69318\n", "
\n", "
\n", "
(20.094900,⋅55.693180)\n", "
\n", "
\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", "
32 partition(s)
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.table()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Including 'nan' as Null in processing 'name'\n", "Including 'nan' as Null in processing 'nametype'\n", "Including 'nan' as Null in processing 'recclass'\n", "Including 'nan' as Null in processing 'fall'\n", "Including 'nan' as Null in processing 'year'\n", "Including 'nan' as Null in processing 'GeoLocation'\n" ] }, { "data": { "text/plain": [ "{'name': 0,\n", " 'id': 0,\n", " 'nametype': 0,\n", " 'recclass': 0,\n", " 'mass (g)': 131,\n", " 'fall': 0,\n", " 'year': 288,\n", " 'reclat': 7315,\n", " 'reclong': 7315,\n", " 'GeoLocation': 7315}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.count_na(\"*\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "ename": "IndentationError", "evalue": "unexpected indent (, line 12)", "output_type": "error", "traceback": [ "\u001b[1;36m File \u001b[1;32m\"\"\u001b[1;36m, line \u001b[1;32m12\u001b[0m\n\u001b[1;33m df.cols.dtypes()\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mIndentationError\u001b[0m\u001b[1;31m:\u001b[0m unexpected indent\n" ] } ], "source": [ "a = {'name': 0,\n", " 'id': 0,\n", " 'nametype': 0,\n", " 'recclass': 0,\n", " 'mass (g)': 131,\n", " 'fall': 0,\n", " 'year': 288,\n", " 'reclat': 7315,\n", " 'reclong': 7315,\n", " 'GeoLocation': 7315}\n", "\n", " df.cols.dtypes()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\"id\",\"mass (g)\",\"reclat\"]\n", "# We drop nulls because correlation cannot handle them\n", "df_not_nulls = df.rows.drop_na(cols)\n", "\n", "df_not_nulls.plot.correlation(cols)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ 
"{'cols': ['id', 'mass (g)', 'reclat'],\n", " 'data': array([[ 1. , -0.01794746, 0.27151272],\n", " [-0.01794746, 1. , 0.02908721],\n", " [ 0.27151272, 0.02908721, 1. ]])}" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_not_nulls.cols.correlation([\"id\",\"mass (g)\", \"reclat\"], output=\"array\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "jupytext": { "formats": "ipynb,py:light" }, "kernel_info": { "name": "python3" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" }, "nteract": { "version": "0.11.6" } }, "nbformat": 4, "nbformat_minor": 2 }