{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, read the sample input table. The same CSV file is loaded twice, as `A` and `B`, so that each data exploration tool below works on its own copy." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path = datasets_dir + os.sep + 'dblp_demo.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Metadata file is not present in the given path; proceeding to read the csv file.\n", "Metadata file is not present in the given path; proceeding to read the csv file.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleauthorsvenueyear
0l0Paradise: A Database System for GIS ApplicationsParadise TeamSIGMOD Conference1995
1l1A Query Language and Optimization Techniques for Unstructured DataGerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan SuciuSIGMOD Conference1996
2l2Turbo-charging Vertical Mining of Large DatabasesJayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav BhalotiaSIGMOD Conference2000
3l3Maintenance of Data Cubes and Summary Tables in a WarehouseInderpal Singh Mumick, Dallan Quass, Barinderpal Singh MumickSIGMOD Conference1997
4l4On Relational Support for XML Publishing: Beyond Sorting and TaggingRaghav Kaushik, Jeffrey F. Naughton, Surajit ChaudhuriSIGMOD Conference2003
\n", "
" ], "text/plain": [ " id title \\\n", "0 l0 Paradise: A Database System for GIS Applications \n", "1 l1 A Query Language and Optimization Techniques for Unstructured Data \n", "2 l2 Turbo-charging Vertical Mining of Large Databases \n", "3 l3 Maintenance of Data Cubes and Summary Tables in a Warehouse \n", "4 l4 On Relational Support for XML Publishing: Beyond Sorting and Tagging \n", "\n", " authors \\\n", "0 Paradise Team \n", "1 Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu \n", "2 Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia \n", "3 Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick \n", "4 Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri \n", "\n", " venue year \n", "0 SIGMOD Conference 1995 \n", "1 SIGMOD Conference 1996 \n", "2 SIGMOD Conference 2000 \n", "3 SIGMOD Conference 1997 \n", "4 SIGMOD Conference 2003 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the same CSV file into two tables (A and B), using the 'id' column as the key attribute\n", "A = em.read_csv_metadata(path, key='id')\n", "B = em.read_csv_metadata(path, key='id')\n", "A.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Exploration\n", "\n", "This notebook will demonstrate using two different data exploration tools. 
OpenRefine is supported on Python 2.7 and 3.5, whereas pandastable is supported only on Python 3.5." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## OpenRefine" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Invoke the open refine gui for data exploration\n", "p = em.data_explore_openrefine(A, name='Table')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Save the project back to our dataframe\n", "# after calling export_pandas_frame, the openRefine project will be deleted automatically\n", "A = p.export_pandas_frame()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleauthorsvenueyear
0l0You can modify data if necessary using OpenRefineParadise TeamSIGMOD Conference1995
1l1A Query Language and Optimization Techniques for Unstructured DataGerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan SuciuSIGMOD Conference1996
2l2Turbo-charging Vertical Mining of Large DatabasesJayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav BhalotiaSIGMOD Conference2000
3l3Maintenance of Data Cubes and Summary Tables in a WarehouseInderpal Singh Mumick, Dallan Quass, Barinderpal Singh MumickSIGMOD Conference1997
4l4On Relational Support for XML Publishing: Beyond Sorting and TaggingRaghav Kaushik, Jeffrey F. Naughton, Surajit ChaudhuriSIGMOD Conference2003
\n", "
" ], "text/plain": [ " id title \\\n", "0 l0 You can modify data if necessary using OpenRefine \n", "1 l1 A Query Language and Optimization Techniques for Unstructured Data \n", "2 l2 Turbo-charging Vertical Mining of Large Databases \n", "3 l3 Maintenance of Data Cubes and Summary Tables in a Warehouse \n", "4 l4 On Relational Support for XML Publishing: Beyond Sorting and Tagging \n", "\n", " authors \\\n", "0 Paradise Team \n", "1 Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu \n", "2 Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia \n", "3 Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick \n", "4 Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri \n", "\n", " venue year \n", "0 SIGMOD Conference 1995 \n", "1 SIGMOD Conference 1996 \n", "2 SIGMOD Conference 2000 \n", "3 SIGMOD Conference 1997 \n", "4 SIGMOD Conference 2003 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pandastable" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Invoke the pandastable gui for data exploration\n", "# The process will be blocked until closing the GUI\n", "em.data_explore_pandastable(B)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleauthorsvenueyear
0l0You can modify data if necessary using pandastableParadise TeamSIGMOD Conference1995
1l1A Query Language and Optimization Techniques for Unstructured DataGerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan SuciuSIGMOD Conference1996
2l2Turbo-charging Vertical Mining of Large DatabasesJayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav BhalotiaSIGMOD Conference2000
3l3Maintenance of Data Cubes and Summary Tables in a WarehouseInderpal Singh Mumick, Dallan Quass, Barinderpal Singh MumickSIGMOD Conference1997
4l4On Relational Support for XML Publishing: Beyond Sorting and TaggingRaghav Kaushik, Jeffrey F. Naughton, Surajit ChaudhuriSIGMOD Conference2003
\n", "
" ], "text/plain": [ " id title \\\n", "0 l0 You can modify data if necessary using pandastable \n", "1 l1 A Query Language and Optimization Techniques for Unstructured Data \n", "2 l2 Turbo-charging Vertical Mining of Large Databases \n", "3 l3 Maintenance of Data Cubes and Summary Tables in a Warehouse \n", "4 l4 On Relational Support for XML Publishing: Beyond Sorting and Tagging \n", "\n", " authors \\\n", "0 Paradise Team \n", "1 Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu \n", "2 Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia \n", "3 Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick \n", "4 Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri \n", "\n", " venue year \n", "0 SIGMOD Conference 1995 \n", "1 SIGMOD Conference 1996 \n", "2 SIGMOD Conference 2000 \n", "3 SIGMOD Conference 1997 \n", "4 SIGMOD Conference 2003 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "B.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }