{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Introduction\n",
"This IPython notebook illustrates how to debug blocker output.\n",
"\n",
"First, we need to import the *py_entitymatching* package and other libraries as follows:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Import py_entitymatching package\n",
"import py_entitymatching as em\n",
"import os\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Then, read the (sample) input tables for blocking purposes."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Get the datasets directory\n",
"datasets_dir = em.get_install_path() + os.sep + 'datasets'\n",
"\n",
"# Get the paths of the input tables\n",
"path_A = datasets_dir + os.sep + 'person_table_A.csv'\n",
"path_B = datasets_dir + os.sep + 'person_table_B.csv'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Read the CSV files and set 'ID' as the key attribute\n",
"A = em.read_csv_metadata(path_A, key='ID')\n",
"B = em.read_csv_metadata(path_B, key='ID')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Debugging Blocker Output"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, block the input tables using a rule-based blocker."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# First get features that can be used\n",
"feature_table = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'_rule_0'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create rule-based blocker\n",
"rb = em.RuleBasedBlocker()\n",
"# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.8\n",
"rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.8'], feature_table)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"0% 100%\n",
"[##############################] | ETA: 00:00:00\n",
"Total time elapsed: 00:00:00\n"
]
}
],
"source": [
"E = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
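"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>_id</th>\n",
"      <th>ltable_ID</th>\n",
"      <th>rtable_ID</th>\n",
"      <th>ltable_name</th>\n",
"      <th>rtable_name</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>0</td>\n",
"      <td>a5</td>\n",
"      <td>b5</td>\n",
"      <td>Alphonse Kemper</td>\n",
"      <td>Alfons Kemper</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"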
],
"text/plain": [
" _id ltable_ID rtable_ID ltable_name rtable_name\n",
"0 0 a5 b5 Alphonse Kemper Alfons Kemper"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"E"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dbg = em.debug_blocker(E, A, B, output_size=5)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>_id</th>\n",
"      <th>similarity</th>\n",
"      <th>ltable_ID</th>\n",
"      <th>rtable_ID</th>\n",
"      <th>ltable_name</th>\n",
"      <th>ltable_address</th>\n",
"      <th>rtable_name</th>\n",
"      <th>rtable_address</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>0</td>\n",
"      <td>0.750000</td>\n",
"      <td>a2</td>\n",
"      <td>b3</td>\n",
"      <td>Michael Franklin</td>\n",
"      <td>1652 Stockton St, San Francisco</td>\n",
"      <td>Mike Franklin</td>\n",
"      <td>1652 Stockton St, San Francisco</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>1</td>\n",
"      <td>0.750000</td>\n",
"      <td>a3</td>\n",
"      <td>b2</td>\n",
"      <td>William Bridge</td>\n",
"      <td>3131 Webster St, San Francisco</td>\n",
"      <td>Bill Bridge</td>\n",
"      <td>3131 Webster St, San Francisco</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>2</td>\n",
"      <td>0.272727</td>\n",
"      <td>a4</td>\n",
"      <td>b2</td>\n",
"      <td>Binto George</td>\n",
"      <td>423 Powell St, San Francisco</td>\n",
"      <td>Bill Bridge</td>\n",
"      <td>3131 Webster St, San Francisco</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>3</td>\n",
"      <td>0.272727</td>\n",
"      <td>a4</td>\n",
"      <td>b3</td>\n",
"      <td>Binto George</td>\n",
"      <td>423 Powell St, San Francisco</td>\n",
"      <td>Mike Franklin</td>\n",
"      <td>1652 Stockton St, San Francisco</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>4</td>\n",
"      <td>0.272727</td>\n",
"      <td>a5</td>\n",
"      <td>b6</td>\n",
"      <td>Alphonse Kemper</td>\n",
"      <td>1702 Post Street, San Francisco</td>\n",
"      <td>Michael Brodie</td>\n",
"      <td>133 Clement Street, San Francisco</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" _id similarity ltable_ID rtable_ID ltable_name \\\n",
"0 0 0.750000 a2 b3 Michael Franklin \n",
"1 1 0.750000 a3 b2 William Bridge \n",
"2 2 0.272727 a4 b2 Binto George \n",
"3 3 0.272727 a4 b3 Binto George \n",
"4 4 0.272727 a5 b6 Alphonse Kemper \n",
"\n",
" ltable_address rtable_name \\\n",
"0 1652 Stockton St, San Francisco Mike Franklin \n",
"1 3131 Webster St, San Francisco Bill Bridge \n",
"2 423 Powell St, San Francisco Bill Bridge \n",
"3 423 Powell St, San Francisco Mike Franklin \n",
"4 1702 Post Street, San Francisco Michael Brodie \n",
"\n",
" rtable_address \n",
"0 1652 Stockton St, San Francisco \n",
"1 3131 Webster St, San Francisco \n",
"2 3131 Webster St, San Francisco \n",
"3 1652 Stockton St, San Francisco \n",
"4 133 Clement Street, San Francisco "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dbg"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'_rule_0'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create rule-based blocker --- NOTE: we are creating a new blocker !!!\n",
"rb = em.RuleBasedBlocker()\n",
"# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4\n",
"rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], feature_table)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"0% 100%\n",
"[##############################] | ETA: 00:00:00\n",
"Total time elapsed: 00:00:00\n"
]
}
],
"source": [
"E = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
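"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>_id</th>\n",
"      <th>ltable_ID</th>\n",
"      <th>rtable_ID</th>\n",
"      <th>ltable_name</th>\n",
"      <th>rtable_name</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>0</td>\n",
"      <td>a2</td>\n",
"      <td>b3</td>\n",
"      <td>Michael Franklin</td>\n",
"      <td>Mike Franklin</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>1</td>\n",
"      <td>a2</td>\n",
"      <td>b6</td>\n",
"      <td>Michael Franklin</td>\n",
"      <td>Michael Brodie</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>2</td>\n",
"      <td>a3</td>\n",
"      <td>b2</td>\n",
"      <td>William Bridge</td>\n",
"      <td>Bill Bridge</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>3</td>\n",
"      <td>a3</td>\n",
"      <td>b6</td>\n",
"      <td>William Bridge</td>\n",
"      <td>Michael Brodie</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>4</td>\n",
"      <td>a4</td>\n",
"      <td>b2</td>\n",
"      <td>Binto George</td>\n",
"      <td>Bill Bridge</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>5</th>\n",
"      <td>5</td>\n",
"      <td>a5</td>\n",
"      <td>b5</td>\n",
"      <td>Alphonse Kemper</td>\n",
"      <td>Alfons Kemper</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"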
],
"text/plain": [
" _id ltable_ID rtable_ID ltable_name rtable_name\n",
"0 0 a2 b3 Michael Franklin Mike Franklin\n",
"1 1 a2 b6 Michael Franklin Michael Brodie\n",
"2 2 a3 b2 William Bridge Bill Bridge\n",
"3 3 a3 b6 William Bridge Michael Brodie\n",
"4 4 a4 b2 Binto George Bill Bridge\n",
"5 5 a5 b5 Alphonse Kemper Alfons Kemper"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"E"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dbg = em.debug_blocker(E, A, B, output_size=5)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>_id</th>\n",
"      <th>similarity</th>\n",
"      <th>ltable_ID</th>\n",
"      <th>rtable_ID</th>\n",
"      <th>ltable_name</th>\n",
"      <th>ltable_address</th>\n",
"      <th>rtable_name</th>\n",
"      <th>rtable_address</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>0</td>\n",
"      <td>0.272727</td>\n",
"      <td>a3</td>\n",
"      <td>b1</td>\n",
"      <td>William Bridge</td>\n",
"      <td>3131 Webster St, San Francisco</td>\n",
"      <td>Mark Levene</td>\n",
"      <td>108 Clement St, San Francisco</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>1</td>\n",
"      <td>0.272727</td>\n",
"      <td>a3</td>\n",
"      <td>b3</td>\n",
"      <td>William Bridge</td>\n",
"      <td>3131 Webster St, San Francisco</td>\n",
"      <td>Mike Franklin</td>\n",
"      <td>1652 Stockton St, San Francisco</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>2</td>\n",
"      <td>0.272727</td>\n",
"      <td>a5</td>\n",
"      <td>b6</td>\n",
"      <td>Alphonse Kemper</td>\n",
"      <td>1702 Post Street, San Francisco</td>\n",
"      <td>Michael Brodie</td>\n",
"      <td>133 Clement Street, San Francisco</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>3</td>\n",
"      <td>0.272727</td>\n",
"      <td>a4</td>\n",
"      <td>b1</td>\n",
"      <td>Binto George</td>\n",
"      <td>423 Powell St, San Francisco</td>\n",
"      <td>Mark Levene</td>\n",
"      <td>108 Clement St, San Francisco</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>4</td>\n",
"      <td>0.272727</td>\n",
"      <td>a4</td>\n",
"      <td>b3</td>\n",
"      <td>Binto George</td>\n",
"      <td>423 Powell St, San Francisco</td>\n",
"      <td>Mike Franklin</td>\n",
"      <td>1652 Stockton St, San Francisco</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" _id similarity ltable_ID rtable_ID ltable_name \\\n",
"0 0 0.272727 a3 b1 William Bridge \n",
"1 1 0.272727 a3 b3 William Bridge \n",
"2 2 0.272727 a5 b6 Alphonse Kemper \n",
"3 3 0.272727 a4 b1 Binto George \n",
"4 4 0.272727 a4 b3 Binto George \n",
"\n",
" ltable_address rtable_name \\\n",
"0 3131 Webster St, San Francisco Mark Levene \n",
"1 3131 Webster St, San Francisco Mike Franklin \n",
"2 1702 Post Street, San Francisco Michael Brodie \n",
"3 423 Powell St, San Francisco Mark Levene \n",
"4 423 Powell St, San Francisco Mike Franklin \n",
"\n",
" rtable_address \n",
"0 108 Clement St, San Francisco \n",
"1 1652 Stockton St, San Francisco \n",
"2 133 Clement Street, San Francisco \n",
"3 108 Clement St, San Francisco \n",
"4 1652 Stockton St, San Francisco "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dbg"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 1
}