{
"cells": [
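{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Introduction\n",
"\n",
"This notebook profiles a sample table with `pandas_profiling`: it loads the table through `py_entitymatching`, renders the profile report inline, and saves the report to an HTML file. First, import the required packages."
]
},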
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Import py_entitymatching plus os, pandas, and pandas_profiling\n",
"import py_entitymatching as em\n",
"import os\n",
"import pandas as pd\n",
"import pandas_profiling"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, locate and read the sample input table (`dblp_demo.csv`) from the `datasets` directory of the py_entitymatching installation."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Get the datasets directory\n",
"datasets_dir = em.get_install_path() + os.sep + 'datasets'\n",
"\n",
"# Get the path of the input table\n",
"path_A = datasets_dir + os.sep + 'dblp_demo.csv'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Metadata file is not present in the given path; proceeding to read the csv file.\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" title | \n",
" authors | \n",
" venue | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" l0 | \n",
" Paradise: A Database System for GIS Applications | \n",
" Paradise Team | \n",
" SIGMOD Conference | \n",
" 1995 | \n",
"
\n",
" \n",
" 1 | \n",
" l1 | \n",
" A Query Language and Optimization Techniques for Unstructured Data | \n",
" Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu | \n",
" SIGMOD Conference | \n",
" 1996 | \n",
"
\n",
" \n",
" 2 | \n",
" l2 | \n",
" Turbo-charging Vertical Mining of Large Databases | \n",
" Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia | \n",
" SIGMOD Conference | \n",
" 2000 | \n",
"
\n",
" \n",
" 3 | \n",
" l3 | \n",
" Maintenance of Data Cubes and Summary Tables in a Warehouse | \n",
" Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick | \n",
" SIGMOD Conference | \n",
" 1997 | \n",
"
\n",
" \n",
" 4 | \n",
" l4 | \n",
" On Relational Support for XML Publishing: Beyond Sorting and Tagging | \n",
" Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri | \n",
" SIGMOD Conference | \n",
" 2003 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id title \\\n",
"0 l0 Paradise: A Database System for GIS Applications \n",
"1 l1 A Query Language and Optimization Techniques for Unstructured Data \n",
"2 l2 Turbo-charging Vertical Mining of Large Databases \n",
"3 l3 Maintenance of Data Cubes and Summary Tables in a Warehouse \n",
"4 l4 On Relational Support for XML Publishing: Beyond Sorting and Tagging \n",
"\n",
" authors \\\n",
"0 Paradise Team \n",
"1 Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu \n",
"2 Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia \n",
"3 Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick \n",
"4 Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri \n",
"\n",
" venue year \n",
"0 SIGMOD Conference 1995 \n",
"1 SIGMOD Conference 1996 \n",
"2 SIGMOD Conference 2000 \n",
"3 SIGMOD Conference 1997 \n",
"4 SIGMOD Conference 2003 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the CSV file and set 'id' as the key attribute\n",
"A = em.read_csv_metadata(path_A, key='id')\n",
"A.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Profiling"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"\n",
"\n",
"\n",
"
\n",
"
Overview
\n",
" \n",
"
\n",
"
\n",
"
Dataset info
\n",
"
\n",
" \n",
" \n",
" Number of variables | \n",
" 5 | \n",
"
\n",
" \n",
" Number of observations | \n",
" 1800 | \n",
"
\n",
" \n",
" Total Missing (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Total size in memory | \n",
" 70.4 KiB | \n",
"
\n",
" \n",
" Average record size in memory | \n",
" 40.0 B | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
Variables types
\n",
"
\n",
" \n",
" \n",
" Numeric | \n",
" 1 | \n",
"
\n",
" \n",
" Categorical | \n",
" 3 | \n",
"
\n",
" \n",
" Date | \n",
" 0 | \n",
"
\n",
" \n",
" Text (Unique) | \n",
" 1 | \n",
"
\n",
" \n",
" Rejected | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
Warnings
\n",
"
authors
has a high cardinality: 1703 distinct values Warningtitle
has a high cardinality: 1797 distinct values Warning
\n",
"
\n",
"
\n",
"
\n",
"
Variables
\n",
" \n",
"
\n",
"
\n",
"
authors
\n",
" Categorical\n",
"
\n",
"
\n",
"
\n",
" \n",
" Distinct count | \n",
" 1703 | \n",
"
\n",
" \n",
" Unique (%) | \n",
" 94.6% | \n",
"
\n",
" \n",
" Missing (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Missing (n) | \n",
" 0 | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
" Dan Suciu | \n",
" \n",
" \n",
" \n",
" \n",
" 7\n",
" | \n",
"
\n",
" C. Mohan | \n",
" \n",
" \n",
" \n",
" \n",
" 6\n",
" | \n",
"
\n",
" Andrew Eisenberg, Jim Melton | \n",
" \n",
" \n",
" \n",
" \n",
" 5\n",
" | \n",
"
\n",
" Other values (1700) | \n",
" \n",
" \n",
" 1782\n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" Dan Suciu | \n",
" 7 | \n",
" 0.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
" C. Mohan | \n",
" 6 | \n",
" 0.3% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Andrew Eisenberg, Jim Melton | \n",
" 5 | \n",
" 0.3% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Xiaolei Qian | \n",
" 5 | \n",
" 0.3% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Joseph M. Hellerstein | \n",
" 4 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Richard T. Snodgrass | \n",
" 4 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Praveen Seshadri | \n",
" 3 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" H. V. Jagadish | \n",
" 3 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Nam Huyn | \n",
" 3 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Viswanath Poosala, Yannis E. Ioannidis | \n",
" 3 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Other values (1693) | \n",
" 1757 | \n",
" 97.6% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
id
\n",
" Categorical, Unique\n",
"
\n",
"
\n",
" \n",
" \n",
" First 3 values | \n",
"
\n",
" \n",
" \n",
" \n",
" l415 | \n",
"
\n",
" \n",
" l1574 | \n",
"
\n",
" \n",
" l1364 | \n",
"
\n",
" \n",
"
\n",
"
\n",
" \n",
" \n",
" Last 3 values | \n",
"
\n",
" \n",
" \n",
" \n",
" l492 | \n",
"
\n",
" \n",
" l273 | \n",
"
\n",
" \n",
" l92 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
First 10 values
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" l0 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l1 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l10 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l100 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l1000 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
Last 10 values
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" l995 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l996 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l997 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l998 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l999 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
title
\n",
" Categorical\n",
"
\n",
"
\n",
"
\n",
" \n",
" Distinct count | \n",
" 1797 | \n",
"
\n",
" \n",
" Unique (%) | \n",
" 99.8% | \n",
"
\n",
" \n",
" Missing (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Missing (n) | \n",
" 0 | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
" Editorial | \n",
" \n",
" \n",
" \n",
" \n",
" 2\n",
" | \n",
"
\n",
" Guest editorial | \n",
" \n",
" \n",
" \n",
" \n",
" 2\n",
" | \n",
"
\n",
" Keynote Address | \n",
" \n",
" \n",
" \n",
" \n",
" 2\n",
" | \n",
"
\n",
" Other values (1794) | \n",
" \n",
" \n",
" 1794\n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" Editorial | \n",
" 2 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Guest editorial | \n",
" 2 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Keynote Address | \n",
" 2 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Integrating Modelling Systems for Environmental Management Information Systems | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Historical Queries Along Multiple Lines of Time Evolution | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Selectivity Estimation Without the Attribute Value Independence Assumption | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Analysis of existing databases at the logical level: the DBA companion project | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Using Versions in Update Transactions: Application to Integrity Checking | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Power efficient data gathering and aggregation in wireless sensor networks | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Instance-based attribute identification in database integration | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Other values (1787) | \n",
" 1787 | \n",
" 99.3% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
venue
\n",
" Categorical\n",
"
\n",
"
\n",
"
\n",
" \n",
" Distinct count | \n",
" 5 | \n",
"
\n",
" \n",
" Unique (%) | \n",
" 0.3% | \n",
"
\n",
" \n",
" Missing (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Missing (n) | \n",
" 0 | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
" SIGMOD Conference | \n",
" \n",
" \n",
" 654\n",
" \n",
" \n",
" | \n",
"
\n",
" VLDB | \n",
" \n",
" \n",
" 512\n",
" \n",
" \n",
" | \n",
"
\n",
" SIGMOD Record | \n",
" \n",
" \n",
" 381\n",
" \n",
" \n",
" | \n",
"
\n",
" Other values (2) | \n",
" \n",
" \n",
" 253\n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" SIGMOD Conference | \n",
" 654 | \n",
" 36.3% | \n",
" \n",
" \n",
" | \n",
"
\n",
" VLDB | \n",
" 512 | \n",
" 28.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
" SIGMOD Record | \n",
" 381 | \n",
" 21.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" VLDB J. | \n",
" 146 | \n",
" 8.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" ACM Trans. Database Syst. | \n",
" 107 | \n",
" 5.9% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
year
\n",
" Numeric\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
" Distinct count | \n",
" 10 | \n",
"
\n",
" \n",
" Unique (%) | \n",
" 0.6% | \n",
"
\n",
" \n",
" Missing (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Missing (n) | \n",
" 0 | \n",
"
\n",
" \n",
" Infinite (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Infinite (n) | \n",
" 0 | \n",
"
\n",
"
\n",
"\n",
"
\n",
"
\n",
"
\n",
"\n",
" \n",
" Mean | \n",
" 1998.4 | \n",
"
\n",
" \n",
" Minimum | \n",
" 1994 | \n",
"
\n",
" \n",
" Maximum | \n",
" 2003 | \n",
"
\n",
" \n",
" Zeros (%) | \n",
" 0.0% | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
"
\n",
"
\n",
"
Quantile statistics
\n",
"
\n",
" \n",
" Minimum | \n",
" 1994 | \n",
"
\n",
" \n",
" 5-th percentile | \n",
" 1994 | \n",
"
\n",
" \n",
" Q1 | \n",
" 1996 | \n",
"
\n",
" \n",
" Median | \n",
" 1998 | \n",
"
\n",
" \n",
" Q3 | \n",
" 2001 | \n",
"
\n",
" \n",
" 95-th percentile | \n",
" 2003 | \n",
"
\n",
" \n",
" Maximum | \n",
" 2003 | \n",
"
\n",
" \n",
" Range | \n",
" 9 | \n",
"
\n",
" \n",
" Interquartile range | \n",
" 5 | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
Descriptive statistics
\n",
"
\n",
" \n",
" Standard deviation | \n",
" 2.8231 | \n",
"
\n",
" \n",
" Coef of variation | \n",
" 0.0014127 | \n",
"
\n",
" \n",
" Kurtosis | \n",
" -1.2004 | \n",
"
\n",
" \n",
" Mean | \n",
" 1998.4 | \n",
"
\n",
" \n",
" MAD | \n",
" 2.4525 | \n",
"
\n",
" \n",
" Skewness | \n",
" -0.007014 | \n",
"
\n",
" \n",
" Sum | \n",
" 3597166 | \n",
"
\n",
" \n",
" Variance | \n",
" 7.97 | \n",
"
\n",
" \n",
" Memory size | \n",
" 14.1 KiB | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" 2001 | \n",
" 218 | \n",
" 12.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1998 | \n",
" 194 | \n",
" 10.8% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2000 | \n",
" 191 | \n",
" 10.6% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1995 | \n",
" 188 | \n",
" 10.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1996 | \n",
" 182 | \n",
" 10.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1994 | \n",
" 182 | \n",
" 10.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1999 | \n",
" 176 | \n",
" 9.8% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1997 | \n",
" 164 | \n",
" 9.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2003 | \n",
" 154 | \n",
" 8.6% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2002 | \n",
" 151 | \n",
" 8.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
Minimum 5 values
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" 1994 | \n",
" 182 | \n",
" 10.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1995 | \n",
" 188 | \n",
" 10.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1996 | \n",
" 182 | \n",
" 10.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1997 | \n",
" 164 | \n",
" 9.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1998 | \n",
" 194 | \n",
" 10.8% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
Maximum 5 values
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" 1999 | \n",
" 176 | \n",
" 9.8% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2000 | \n",
" 191 | \n",
" 10.6% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2001 | \n",
" 218 | \n",
" 12.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2002 | \n",
" 151 | \n",
" 8.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2003 | \n",
" 154 | \n",
" 8.6% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
Sample
\n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" title | \n",
" authors | \n",
" venue | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" l0 | \n",
" Paradise: A Database System for GIS Applications | \n",
" Paradise Team | \n",
" SIGMOD Conference | \n",
" 1995 | \n",
"
\n",
" \n",
" 1 | \n",
" l1 | \n",
" A Query Language and Optimization Techniques for Unstructured Data | \n",
" Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu | \n",
" SIGMOD Conference | \n",
" 1996 | \n",
"
\n",
" \n",
" 2 | \n",
" l2 | \n",
" Turbo-charging Vertical Mining of Large Databases | \n",
" Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia | \n",
" SIGMOD Conference | \n",
" 2000 | \n",
"
\n",
" \n",
" 3 | \n",
" l3 | \n",
" Maintenance of Data Cubes and Summary Tables in a Warehouse | \n",
" Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick | \n",
" SIGMOD Conference | \n",
" 1997 | \n",
"
\n",
" \n",
" 4 | \n",
" l4 | \n",
" On Relational Support for XML Publishing: Beyond Sorting and Tagging | \n",
" Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri | \n",
" SIGMOD Conference | \n",
" 2003 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pandas_profiling.ProfileReport(A)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Saving the Data Profiling Report to an HTML File"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pfr = pandas_profiling.ProfileReport(A)\n",
"pfr.to_file(\"/tmp/example.html\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"\n",
"\n",
"\n",
"
\n",
"
Overview
\n",
" \n",
"
\n",
"
\n",
"
Dataset info
\n",
"
\n",
" \n",
" \n",
" Number of variables | \n",
" 5 | \n",
"
\n",
" \n",
" Number of observations | \n",
" 1800 | \n",
"
\n",
" \n",
" Total Missing (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Total size in memory | \n",
" 70.4 KiB | \n",
"
\n",
" \n",
" Average record size in memory | \n",
" 40.0 B | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
Variables types
\n",
"
\n",
" \n",
" \n",
" Numeric | \n",
" 1 | \n",
"
\n",
" \n",
" Categorical | \n",
" 3 | \n",
"
\n",
" \n",
" Date | \n",
" 0 | \n",
"
\n",
" \n",
" Text (Unique) | \n",
" 1 | \n",
"
\n",
" \n",
" Rejected | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
Warnings
\n",
"
authors
has a high cardinality: 1703 distinct values Warningtitle
has a high cardinality: 1797 distinct values Warning
\n",
"
\n",
"
\n",
"
\n",
"
Variables
\n",
" \n",
"
\n",
"
\n",
"
authors
\n",
" Categorical\n",
"
\n",
"
\n",
"
\n",
" \n",
" Distinct count | \n",
" 1703 | \n",
"
\n",
" \n",
" Unique (%) | \n",
" 94.6% | \n",
"
\n",
" \n",
" Missing (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Missing (n) | \n",
" 0 | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
" Dan Suciu | \n",
" \n",
" \n",
" \n",
" \n",
" 7\n",
" | \n",
"
\n",
" C. Mohan | \n",
" \n",
" \n",
" \n",
" \n",
" 6\n",
" | \n",
"
\n",
" Andrew Eisenberg, Jim Melton | \n",
" \n",
" \n",
" \n",
" \n",
" 5\n",
" | \n",
"
\n",
" Other values (1700) | \n",
" \n",
" \n",
" 1782\n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" Dan Suciu | \n",
" 7 | \n",
" 0.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
" C. Mohan | \n",
" 6 | \n",
" 0.3% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Andrew Eisenberg, Jim Melton | \n",
" 5 | \n",
" 0.3% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Xiaolei Qian | \n",
" 5 | \n",
" 0.3% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Joseph M. Hellerstein | \n",
" 4 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Richard T. Snodgrass | \n",
" 4 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Praveen Seshadri | \n",
" 3 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" H. V. Jagadish | \n",
" 3 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Nam Huyn | \n",
" 3 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Viswanath Poosala, Yannis E. Ioannidis | \n",
" 3 | \n",
" 0.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Other values (1693) | \n",
" 1757 | \n",
" 97.6% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
id
\n",
" Categorical, Unique\n",
"
\n",
"
\n",
" \n",
" \n",
" First 3 values | \n",
"
\n",
" \n",
" \n",
" \n",
" l415 | \n",
"
\n",
" \n",
" l1574 | \n",
"
\n",
" \n",
" l1364 | \n",
"
\n",
" \n",
"
\n",
"
\n",
" \n",
" \n",
" Last 3 values | \n",
"
\n",
" \n",
" \n",
" \n",
" l492 | \n",
"
\n",
" \n",
" l273 | \n",
"
\n",
" \n",
" l92 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
First 10 values
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" l0 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l1 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l10 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l100 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l1000 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
Last 10 values
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" l995 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l996 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l997 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l998 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" l999 | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
title
\n",
" Categorical\n",
"
\n",
"
\n",
"
\n",
" \n",
" Distinct count | \n",
" 1797 | \n",
"
\n",
" \n",
" Unique (%) | \n",
" 99.8% | \n",
"
\n",
" \n",
" Missing (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Missing (n) | \n",
" 0 | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
" Editorial | \n",
" \n",
" \n",
" \n",
" \n",
" 2\n",
" | \n",
"
\n",
" Guest editorial | \n",
" \n",
" \n",
" \n",
" \n",
" 2\n",
" | \n",
"
\n",
" Keynote Address | \n",
" \n",
" \n",
" \n",
" \n",
" 2\n",
" | \n",
"
\n",
" Other values (1794) | \n",
" \n",
" \n",
" 1794\n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" Editorial | \n",
" 2 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Guest editorial | \n",
" 2 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Keynote Address | \n",
" 2 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Integrating Modelling Systems for Environmental Management Information Systems | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Historical Queries Along Multiple Lines of Time Evolution | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Selectivity Estimation Without the Attribute Value Independence Assumption | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Analysis of existing databases at the logical level: the DBA companion project | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Using Versions in Update Transactions: Application to Integrity Checking | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Power efficient data gathering and aggregation in wireless sensor networks | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Instance-based attribute identification in database integration | \n",
" 1 | \n",
" 0.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" Other values (1787) | \n",
" 1787 | \n",
" 99.3% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
venue
\n",
" Categorical\n",
"
\n",
"
\n",
"
\n",
" \n",
" Distinct count | \n",
" 5 | \n",
"
\n",
" \n",
" Unique (%) | \n",
" 0.3% | \n",
"
\n",
" \n",
" Missing (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Missing (n) | \n",
" 0 | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
" SIGMOD Conference | \n",
" \n",
" \n",
" 654\n",
" \n",
" \n",
" | \n",
"
\n",
" VLDB | \n",
" \n",
" \n",
" 512\n",
" \n",
" \n",
" | \n",
"
\n",
" SIGMOD Record | \n",
" \n",
" \n",
" 381\n",
" \n",
" \n",
" | \n",
"
\n",
" Other values (2) | \n",
" \n",
" \n",
" 253\n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" SIGMOD Conference | \n",
" 654 | \n",
" 36.3% | \n",
" \n",
" \n",
" | \n",
"
\n",
" VLDB | \n",
" 512 | \n",
" 28.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
" SIGMOD Record | \n",
" 381 | \n",
" 21.2% | \n",
" \n",
" \n",
" | \n",
"
\n",
" VLDB J. | \n",
" 146 | \n",
" 8.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" ACM Trans. Database Syst. | \n",
" 107 | \n",
" 5.9% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
year
\n",
" Numeric\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
" Distinct count | \n",
" 10 | \n",
"
\n",
" \n",
" Unique (%) | \n",
" 0.6% | \n",
"
\n",
" \n",
" Missing (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Missing (n) | \n",
" 0 | \n",
"
\n",
" \n",
" Infinite (%) | \n",
" 0.0% | \n",
"
\n",
" \n",
" Infinite (n) | \n",
" 0 | \n",
"
\n",
"
\n",
"\n",
"
\n",
"
\n",
"
\n",
"\n",
" \n",
" Mean | \n",
" 1998.4 | \n",
"
\n",
" \n",
" Minimum | \n",
" 1994 | \n",
"
\n",
" \n",
" Maximum | \n",
" 2003 | \n",
"
\n",
" \n",
" Zeros (%) | \n",
" 0.0% | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
"
\n",
"
\n",
"
Quantile statistics
\n",
"
\n",
" \n",
" Minimum | \n",
" 1994 | \n",
"
\n",
" \n",
" 5-th percentile | \n",
" 1994 | \n",
"
\n",
" \n",
" Q1 | \n",
" 1996 | \n",
"
\n",
" \n",
" Median | \n",
" 1998 | \n",
"
\n",
" \n",
" Q3 | \n",
" 2001 | \n",
"
\n",
" \n",
" 95-th percentile | \n",
" 2003 | \n",
"
\n",
" \n",
" Maximum | \n",
" 2003 | \n",
"
\n",
" \n",
" Range | \n",
" 9 | \n",
"
\n",
" \n",
" Interquartile range | \n",
" 5 | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
Descriptive statistics
\n",
"
\n",
" \n",
" Standard deviation | \n",
" 2.8231 | \n",
"
\n",
" \n",
" Coef of variation | \n",
" 0.0014127 | \n",
"
\n",
" \n",
" Kurtosis | \n",
" -1.2004 | \n",
"
\n",
" \n",
" Mean | \n",
" 1998.4 | \n",
"
\n",
" \n",
" MAD | \n",
" 2.4525 | \n",
"
\n",
" \n",
" Skewness | \n",
" -0.007014 | \n",
"
\n",
" \n",
" Sum | \n",
" 3597166 | \n",
"
\n",
" \n",
" Variance | \n",
" 7.97 | \n",
"
\n",
" \n",
" Memory size | \n",
" 14.1 KiB | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" 2001 | \n",
" 218 | \n",
" 12.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1998 | \n",
" 194 | \n",
" 10.8% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2000 | \n",
" 191 | \n",
" 10.6% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1995 | \n",
" 188 | \n",
" 10.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1996 | \n",
" 182 | \n",
" 10.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1994 | \n",
" 182 | \n",
" 10.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1999 | \n",
" 176 | \n",
" 9.8% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1997 | \n",
" 164 | \n",
" 9.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2003 | \n",
" 154 | \n",
" 8.6% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2002 | \n",
" 151 | \n",
" 8.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
Minimum 5 values
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" 1994 | \n",
" 182 | \n",
" 10.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1995 | \n",
" 188 | \n",
" 10.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1996 | \n",
" 182 | \n",
" 10.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1997 | \n",
" 164 | \n",
" 9.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 1998 | \n",
" 194 | \n",
" 10.8% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
Maximum 5 values
\n",
" \n",
"
\n",
" \n",
" \n",
" Value | \n",
" Count | \n",
" Frequency (%) | \n",
" | \n",
"
\n",
" \n",
" \n",
" 1999 | \n",
" 176 | \n",
" 9.8% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2000 | \n",
" 191 | \n",
" 10.6% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2001 | \n",
" 218 | \n",
" 12.1% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2002 | \n",
" 151 | \n",
" 8.4% | \n",
" \n",
" \n",
" | \n",
"
\n",
" 2003 | \n",
" 154 | \n",
" 8.6% | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
Sample
\n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" title | \n",
" authors | \n",
" venue | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" l0 | \n",
" Paradise: A Database System for GIS Applications | \n",
" Paradise Team | \n",
" SIGMOD Conference | \n",
" 1995 | \n",
"
\n",
" \n",
" 1 | \n",
" l1 | \n",
" A Query Language and Optimization Techniques for Unstructured Data | \n",
" Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu | \n",
" SIGMOD Conference | \n",
" 1996 | \n",
"
\n",
" \n",
" 2 | \n",
" l2 | \n",
" Turbo-charging Vertical Mining of Large Databases | \n",
" Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia | \n",
" SIGMOD Conference | \n",
" 2000 | \n",
"
\n",
" \n",
" 3 | \n",
" l3 | \n",
" Maintenance of Data Cubes and Summary Tables in a Warehouse | \n",
" Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick | \n",
" SIGMOD Conference | \n",
" 1997 | \n",
"
\n",
" \n",
" 4 | \n",
" l4 | \n",
" On Relational Support for XML Publishing: Beyond Sorting and Tagging | \n",
" Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri | \n",
" SIGMOD Conference | \n",
" 2003 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pfr"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}