{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd\n", "import pandas_profiling" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input table" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the path of the input table\n", "path_A = datasets_dir + os.sep + 'dblp_demo.csv'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Metadata file is not present in the given path; proceeding to read the csv file.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleauthorsvenueyear
0l0Paradise: A Database System for GIS ApplicationsParadise TeamSIGMOD Conference1995
1l1A Query Language and Optimization Techniques for Unstructured DataGerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan SuciuSIGMOD Conference1996
2l2Turbo-charging Vertical Mining of Large DatabasesJayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav BhalotiaSIGMOD Conference2000
3l3Maintenance of Data Cubes and Summary Tables in a WarehouseInderpal Singh Mumick, Dallan Quass, Barinderpal Singh MumickSIGMOD Conference1997
4l4On Relational Support for XML Publishing: Beyond Sorting and TaggingRaghav Kaushik, Jeffrey F. Naughton, Surajit ChaudhuriSIGMOD Conference2003
\n", "
" ], "text/plain": [ " id title \\\n", "0 l0 Paradise: A Database System for GIS Applications \n", "1 l1 A Query Language and Optimization Techniques for Unstructured Data \n", "2 l2 Turbo-charging Vertical Mining of Large Databases \n", "3 l3 Maintenance of Data Cubes and Summary Tables in a Warehouse \n", "4 l4 On Relational Support for XML Publishing: Beyond Sorting and Tagging \n", "\n", " authors \\\n", "0 Paradise Team \n", "1 Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu \n", "2 Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia \n", "3 Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick \n", "4 Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri \n", "\n", " venue year \n", "0 SIGMOD Conference 1995 \n", "1 SIGMOD Conference 1996 \n", "2 SIGMOD Conference 2000 \n", "3 SIGMOD Conference 1997 \n", "4 SIGMOD Conference 2003 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='id')\n", "A.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Profiling" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "
\n", "

Overview

\n", "
\n", "
\n", "
\n", "

Dataset info

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of variables5
Number of observations1800
Total Missing (%)0.0%
Total size in memory70.4 KiB
Average record size in memory40.0 B
\n", "
\n", "
\n", "

Variables types

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Numeric1
Categorical3
Date0
Text (Unique)1
Rejected0
\n", "
\n", "
\n", "

Warnings

\n", "
  • authors has a high cardinality: 1703 distinct values Warning
  • title has a high cardinality: 1797 distinct values Warning
\n", "
\n", "
\n", "
\n", "

Variables

\n", "
\n", "
\n", "
\n", "

authors
\n", " Categorical\n", "

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Distinct count1703
Unique (%)94.6%
Missing (%)0.0%
Missing (n)0
\n", "
\n", "
\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "
Dan Suciu\n", "
\n", "  \n", "
\n", " 7\n", "
C. Mohan\n", "
\n", "  \n", "
\n", " 6\n", "
Andrew Eisenberg, Jim Melton\n", "
\n", "  \n", "
\n", " 5\n", "
Other values (1700)\n", "
\n", " 1782\n", "
\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
Dan Suciu70.4%\n", "
 
\n", "
C. Mohan60.3%\n", "
 
\n", "
Andrew Eisenberg, Jim Melton50.3%\n", "
 
\n", "
Xiaolei Qian50.3%\n", "
 
\n", "
Joseph M. Hellerstein40.2%\n", "
 
\n", "
Richard T. Snodgrass40.2%\n", "
 
\n", "
Praveen Seshadri30.2%\n", "
 
\n", "
H. V. Jagadish30.2%\n", "
 
\n", "
Nam Huyn30.2%\n", "
 
\n", "
Viswanath Poosala, Yannis E. Ioannidis30.2%\n", "
 
\n", "
Other values (1693)175797.6%\n", "
 
\n", "
\n", "
\n", "
\n", "
\n", "

id
\n", " Categorical, Unique\n", "

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
First 3 values
l415
l1574
l1364
\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Last 3 values
l492
l273
l92
\n", "\n", "
\n", "

First 10 values

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
l010.1%\n", "
 
\n", "
l110.1%\n", "
 
\n", "
l1010.1%\n", "
 
\n", "
l10010.1%\n", "
 
\n", "
l100010.1%\n", "
 
\n", "
\n", "

Last 10 values

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
l99510.1%\n", "
 
\n", "
l99610.1%\n", "
 
\n", "
l99710.1%\n", "
 
\n", "
l99810.1%\n", "
 
\n", "
l99910.1%\n", "
 
\n", "
\n", "
\n", "
\n", "
\n", "

title
\n", " Categorical\n", "

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Distinct count1797
Unique (%)99.8%
Missing (%)0.0%
Missing (n)0
\n", "
\n", "
\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "
Editorial\n", "
\n", "  \n", "
\n", " 2\n", "
Guest editorial\n", "
\n", "  \n", "
\n", " 2\n", "
Keynote Address\n", "
\n", "  \n", "
\n", " 2\n", "
Other values (1794)\n", "
\n", " 1794\n", "
\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
Editorial20.1%\n", "
 
\n", "
Guest editorial20.1%\n", "
 
\n", "
Keynote Address20.1%\n", "
 
\n", "
Integrating Modelling Systems for Environmental Management Information Systems10.1%\n", "
 
\n", "
Historical Queries Along Multiple Lines of Time Evolution10.1%\n", "
 
\n", "
Selectivity Estimation Without the Attribute Value Independence Assumption10.1%\n", "
 
\n", "
Analysis of existing databases at the logical level: the DBA companion project10.1%\n", "
 
\n", "
Using Versions in Update Transactions: Application to Integrity Checking10.1%\n", "
 
\n", "
Power efficient data gathering and aggregation in wireless sensor networks10.1%\n", "
 
\n", "
Instance-based attribute identification in database integration10.1%\n", "
 
\n", "
Other values (1787)178799.3%\n", "
 
\n", "
\n", "
\n", "
\n", "
\n", "

venue
\n", " Categorical\n", "

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Distinct count5
Unique (%)0.3%
Missing (%)0.0%
Missing (n)0
\n", "
\n", "
\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "
SIGMOD Conference\n", "
\n", " 654\n", "
\n", " \n", "
VLDB\n", "
\n", " 512\n", "
\n", " \n", "
SIGMOD Record\n", "
\n", " 381\n", "
\n", " \n", "
Other values (2)\n", "
\n", " 253\n", "
\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
SIGMOD Conference65436.3%\n", "
 
\n", "
VLDB51228.4%\n", "
 
\n", "
SIGMOD Record38121.2%\n", "
 
\n", "
VLDB J.1468.1%\n", "
 
\n", "
ACM Trans. Database Syst.1075.9%\n", "
 
\n", "
\n", "
\n", "
\n", "
\n", "

year
\n", " Numeric\n", "

\n", "
\n", "
\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Distinct count10
Unique (%)0.6%
Missing (%)0.0%
Missing (n)0
Infinite (%)0.0%
Infinite (n)0
\n", "\n", "
\n", "
\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Mean1998.4
Minimum1994
Maximum2003
Zeros (%)0.0%
\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "
\n", "
\n", "

Quantile statistics

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Minimum1994
5-th percentile1994
Q11996
Median1998
Q32001
95-th percentile2003
Maximum2003
Range9
Interquartile range5
\n", "
\n", "
\n", "

Descriptive statistics

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Standard deviation2.8231
Coef of variation0.0014127
Kurtosis-1.2004
Mean1998.4
MAD2.4525
Skewness-0.007014
Sum3597166
Variance7.97
Memory size14.1 KiB
\n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
200121812.1%\n", "
 
\n", "
199819410.8%\n", "
 
\n", "
200019110.6%\n", "
 
\n", "
199518810.4%\n", "
 
\n", "
199618210.1%\n", "
 
\n", "
199418210.1%\n", "
 
\n", "
19991769.8%\n", "
 
\n", "
19971649.1%\n", "
 
\n", "
20031548.6%\n", "
 
\n", "
20021518.4%\n", "
 
\n", "
\n", "
\n", "
\n", "

Minimum 5 values

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
199418210.1%\n", "
 
\n", "
199518810.4%\n", "
 
\n", "
199618210.1%\n", "
 
\n", "
19971649.1%\n", "
 
\n", "
199819410.8%\n", "
 
\n", "
\n", "

Maximum 5 values

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
19991769.8%\n", "
 
\n", "
200019110.6%\n", "
 
\n", "
200121812.1%\n", "
 
\n", "
20021518.4%\n", "
 
\n", "
20031548.6%\n", "
 
\n", "
\n", "
\n", "
\n", "
\n", "
\n", "
\n", "

Sample

\n", "
\n", "
\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleauthorsvenueyear
0l0Paradise: A Database System for GIS ApplicationsParadise TeamSIGMOD Conference1995
1l1A Query Language and Optimization Techniques for Unstructured DataGerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan SuciuSIGMOD Conference1996
2l2Turbo-charging Vertical Mining of Large DatabasesJayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav BhalotiaSIGMOD Conference2000
3l3Maintenance of Data Cubes and Summary Tables in a WarehouseInderpal Singh Mumick, Dallan Quass, Barinderpal Singh MumickSIGMOD Conference1997
4l4On Relational Support for XML Publishing: Beyond Sorting and TaggingRaghav Kaushik, Jeffrey F. Naughton, Surajit ChaudhuriSIGMOD Conference2003
\n", "
\n", "
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pandas_profiling.ProfileReport(A)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving the Data Profiling Report to an HTML File" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "pfr = pandas_profiling.ProfileReport(A)\n", "pfr.to_file(\"/tmp/example.html\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "
\n", "

Overview

\n", "
\n", "
\n", "
\n", "

Dataset info

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of variables5
Number of observations1800
Total Missing (%)0.0%
Total size in memory70.4 KiB
Average record size in memory40.0 B
\n", "
\n", "
\n", "

Variables types

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Numeric1
Categorical3
Date0
Text (Unique)1
Rejected0
\n", "
\n", "
\n", "

Warnings

\n", "
  • authors has a high cardinality: 1703 distinct values Warning
  • title has a high cardinality: 1797 distinct values Warning
\n", "
\n", "
\n", "
\n", "

Variables

\n", "
\n", "
\n", "
\n", "

authors
\n", " Categorical\n", "

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Distinct count1703
Unique (%)94.6%
Missing (%)0.0%
Missing (n)0
\n", "
\n", "
\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "
Dan Suciu\n", "
\n", "  \n", "
\n", " 7\n", "
C. Mohan\n", "
\n", "  \n", "
\n", " 6\n", "
Andrew Eisenberg, Jim Melton\n", "
\n", "  \n", "
\n", " 5\n", "
Other values (1700)\n", "
\n", " 1782\n", "
\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
Dan Suciu70.4%\n", "
 
\n", "
C. Mohan60.3%\n", "
 
\n", "
Andrew Eisenberg, Jim Melton50.3%\n", "
 
\n", "
Xiaolei Qian50.3%\n", "
 
\n", "
Joseph M. Hellerstein40.2%\n", "
 
\n", "
Richard T. Snodgrass40.2%\n", "
 
\n", "
Praveen Seshadri30.2%\n", "
 
\n", "
H. V. Jagadish30.2%\n", "
 
\n", "
Nam Huyn30.2%\n", "
 
\n", "
Viswanath Poosala, Yannis E. Ioannidis30.2%\n", "
 
\n", "
Other values (1693)175797.6%\n", "
 
\n", "
\n", "
\n", "
\n", "
\n", "

id
\n", " Categorical, Unique\n", "

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
First 3 values
l415
l1574
l1364
\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Last 3 values
l492
l273
l92
\n", "\n", "
\n", "

First 10 values

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
l010.1%\n", "
 
\n", "
l110.1%\n", "
 
\n", "
l1010.1%\n", "
 
\n", "
l10010.1%\n", "
 
\n", "
l100010.1%\n", "
 
\n", "
\n", "

Last 10 values

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
l99510.1%\n", "
 
\n", "
l99610.1%\n", "
 
\n", "
l99710.1%\n", "
 
\n", "
l99810.1%\n", "
 
\n", "
l99910.1%\n", "
 
\n", "
\n", "
\n", "
\n", "
\n", "

title
\n", " Categorical\n", "

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Distinct count1797
Unique (%)99.8%
Missing (%)0.0%
Missing (n)0
\n", "
\n", "
\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "
Editorial\n", "
\n", "  \n", "
\n", " 2\n", "
Guest editorial\n", "
\n", "  \n", "
\n", " 2\n", "
Keynote Address\n", "
\n", "  \n", "
\n", " 2\n", "
Other values (1794)\n", "
\n", " 1794\n", "
\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
Editorial20.1%\n", "
 
\n", "
Guest editorial20.1%\n", "
 
\n", "
Keynote Address20.1%\n", "
 
\n", "
Integrating Modelling Systems for Environmental Management Information Systems10.1%\n", "
 
\n", "
Historical Queries Along Multiple Lines of Time Evolution10.1%\n", "
 
\n", "
Selectivity Estimation Without the Attribute Value Independence Assumption10.1%\n", "
 
\n", "
Analysis of existing databases at the logical level: the DBA companion project10.1%\n", "
 
\n", "
Using Versions in Update Transactions: Application to Integrity Checking10.1%\n", "
 
\n", "
Power efficient data gathering and aggregation in wireless sensor networks10.1%\n", "
 
\n", "
Instance-based attribute identification in database integration10.1%\n", "
 
\n", "
Other values (1787)178799.3%\n", "
 
\n", "
\n", "
\n", "
\n", "
\n", "

venue
\n", " Categorical\n", "

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Distinct count5
Unique (%)0.3%
Missing (%)0.0%
Missing (n)0
\n", "
\n", "
\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "
SIGMOD Conference\n", "
\n", " 654\n", "
\n", " \n", "
VLDB\n", "
\n", " 512\n", "
\n", " \n", "
SIGMOD Record\n", "
\n", " 381\n", "
\n", " \n", "
Other values (2)\n", "
\n", " 253\n", "
\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
SIGMOD Conference65436.3%\n", "
 
\n", "
VLDB51228.4%\n", "
 
\n", "
SIGMOD Record38121.2%\n", "
 
\n", "
VLDB J.1468.1%\n", "
 
\n", "
ACM Trans. Database Syst.1075.9%\n", "
 
\n", "
\n", "
\n", "
\n", "
\n", "

year
\n", " Numeric\n", "

\n", "
\n", "
\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Distinct count10
Unique (%)0.6%
Missing (%)0.0%
Missing (n)0
Infinite (%)0.0%
Infinite (n)0
\n", "\n", "
\n", "
\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Mean1998.4
Minimum1994
Maximum2003
Zeros (%)0.0%
\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "
\n", "
\n", "

Quantile statistics

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Minimum1994
5-th percentile1994
Q11996
Median1998
Q32001
95-th percentile2003
Maximum2003
Range9
Interquartile range5
\n", "
\n", "
\n", "

Descriptive statistics

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Standard deviation2.8231
Coef of variation0.0014127
Kurtosis-1.2004
Mean1998.4
MAD2.4525
Skewness-0.007014
Sum3597166
Variance7.97
Memory size14.1 KiB
\n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
200121812.1%\n", "
 
\n", "
199819410.8%\n", "
 
\n", "
200019110.6%\n", "
 
\n", "
199518810.4%\n", "
 
\n", "
199618210.1%\n", "
 
\n", "
199418210.1%\n", "
 
\n", "
19991769.8%\n", "
 
\n", "
19971649.1%\n", "
 
\n", "
20031548.6%\n", "
 
\n", "
20021518.4%\n", "
 
\n", "
\n", "
\n", "
\n", "

Minimum 5 values

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
199418210.1%\n", "
 
\n", "
199518810.4%\n", "
 
\n", "
199618210.1%\n", "
 
\n", "
19971649.1%\n", "
 
\n", "
199819410.8%\n", "
 
\n", "
\n", "

Maximum 5 values

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", "
ValueCountFrequency (%) 
19991769.8%\n", "
 
\n", "
200019110.6%\n", "
 
\n", "
200121812.1%\n", "
 
\n", "
20021518.4%\n", "
 
\n", "
20031548.6%\n", "
 
\n", "
\n", "
\n", "
\n", "
\n", "
\n", "
\n", "

Sample

\n", "
\n", "
\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleauthorsvenueyear
0l0Paradise: A Database System for GIS ApplicationsParadise TeamSIGMOD Conference1995
1l1A Query Language and Optimization Techniques for Unstructured DataGerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan SuciuSIGMOD Conference1996
2l2Turbo-charging Vertical Mining of Large DatabasesJayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav BhalotiaSIGMOD Conference2000
3l3Maintenance of Data Cubes and Summary Tables in a WarehouseInderpal Singh Mumick, Dallan Quass, Barinderpal Singh MumickSIGMOD Conference1997
4l4On Relational Support for XML Publishing: Beyond Sorting and TaggingRaghav Kaushik, Jeffrey F. Naughton, Surajit ChaudhuriSIGMOD Conference2003
\n", "
\n", "
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pfr" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }