{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction\n", "This IPython notebook illustrates how to debug blocker output.\n", "\n", "First, we need to import *py_entitymatching* package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n", "path_B = datasets_dir + os.sep + 'person_table_B.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Metadata file is not present in the given path; proceeding to read the csv file.\n" ] } ], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Debugging Blocker Output" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, block using rule-based blocker" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# First get features that can be used\n", "feature_table = em.get_features_for_blocking(A, B)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'_rule_0'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create rule-based blocker\n", "rb = em.RuleBasedBlocker()\n", "# Add rule : block tuples if name_name_lev(ltuple, rtuple) < 0.8\n", "rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.8'], feature_table)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "0% 100%\n", "[##############################] | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00\n", "Total time elapsed: 00:00:00\n" ] } ], "source": [ "E = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idltable_IDrtable_IDltable_namertable_name
00a5b5Alphonse KemperAlfons Kemper
\n", "
" ], "text/plain": [ " _id ltable_ID rtable_ID ltable_name rtable_name\n", "0 0 a5 b5 Alphonse Kemper Alfons Kemper" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "E" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dbg = em.debug_blocker(E, A, B, output_size=5)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idsimilarityltable_IDrtable_IDltable_nameltable_addressrtable_namertable_address
000.750000a2b3Michael Franklin1652 Stockton St, San FranciscoMike Franklin1652 Stockton St, San Francisco
110.750000a3b2William Bridge3131 Webster St, San FranciscoBill Bridge3131 Webster St, San Francisco
220.272727a4b2Binto George423 Powell St, San FranciscoBill Bridge3131 Webster St, San Francisco
330.272727a4b3Binto George423 Powell St, San FranciscoMike Franklin1652 Stockton St, San Francisco
440.272727a5b6Alphonse Kemper1702 Post Street, San FranciscoMichael Brodie133 Clement Street, San Francisco
\n", "
" ], "text/plain": [ " _id similarity ltable_ID rtable_ID ltable_name \\\n", "0 0 0.750000 a2 b3 Michael Franklin \n", "1 1 0.750000 a3 b2 William Bridge \n", "2 2 0.272727 a4 b2 Binto George \n", "3 3 0.272727 a4 b3 Binto George \n", "4 4 0.272727 a5 b6 Alphonse Kemper \n", "\n", " ltable_address rtable_name \\\n", "0 1652 Stockton St, San Francisco Mike Franklin \n", "1 3131 Webster St, San Francisco Bill Bridge \n", "2 423 Powell St, San Francisco Bill Bridge \n", "3 423 Powell St, San Francisco Mike Franklin \n", "4 1702 Post Street, San Francisco Michael Brodie \n", "\n", " rtable_address \n", "0 1652 Stockton St, San Francisco \n", "1 3131 Webster St, San Francisco \n", "2 3131 Webster St, San Francisco \n", "3 1652 Stockton St, San Francisco \n", "4 133 Clement Street, San Francisco " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dbg" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'_rule_0'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create rule-based blocker --- NOTE: we are creating a new blocker !!!\n", "rb = em.RuleBasedBlocker()\n", "# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4\n", "rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], feature_table)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "0% 100%\n", "[##############################] | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00\n", "Total time elapsed: 00:00:00\n" ] } ], "source": [ "E = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idltable_IDrtable_IDltable_namertable_name
00a2b3Michael FranklinMike Franklin
11a2b6Michael FranklinMichael Brodie
22a3b2William BridgeBill Bridge
33a3b6William BridgeMichael Brodie
44a4b2Binto GeorgeBill Bridge
55a5b5Alphonse KemperAlfons Kemper
\n", "
" ], "text/plain": [ " _id ltable_ID rtable_ID ltable_name rtable_name\n", "0 0 a2 b3 Michael Franklin Mike Franklin\n", "1 1 a2 b6 Michael Franklin Michael Brodie\n", "2 2 a3 b2 William Bridge Bill Bridge\n", "3 3 a3 b6 William Bridge Michael Brodie\n", "4 4 a4 b2 Binto George Bill Bridge\n", "5 5 a5 b5 Alphonse Kemper Alfons Kemper" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "E" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dbg = em.debug_blocker(E, A, B, output_size=5)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idsimilarityltable_IDrtable_IDltable_nameltable_addressrtable_namertable_address
000.272727a3b1William Bridge3131 Webster St, San FranciscoMark Levene108 Clement St, San Francisco
110.272727a3b3William Bridge3131 Webster St, San FranciscoMike Franklin1652 Stockton St, San Francisco
220.272727a5b6Alphonse Kemper1702 Post Street, San FranciscoMichael Brodie133 Clement Street, San Francisco
330.272727a4b1Binto George423 Powell St, San FranciscoMark Levene108 Clement St, San Francisco
440.272727a4b3Binto George423 Powell St, San FranciscoMike Franklin1652 Stockton St, San Francisco
\n", "
" ], "text/plain": [ " _id similarity ltable_ID rtable_ID ltable_name \\\n", "0 0 0.272727 a3 b1 William Bridge \n", "1 1 0.272727 a3 b3 William Bridge \n", "2 2 0.272727 a5 b6 Alphonse Kemper \n", "3 3 0.272727 a4 b1 Binto George \n", "4 4 0.272727 a4 b3 Binto George \n", "\n", " ltable_address rtable_name \\\n", "0 3131 Webster St, San Francisco Mark Levene \n", "1 3131 Webster St, San Francisco Mike Franklin \n", "2 1702 Post Street, San Francisco Michael Brodie \n", "3 423 Powell St, San Francisco Mark Levene \n", "4 423 Powell St, San Francisco Mike Franklin \n", "\n", " rtable_address \n", "0 108 Clement St, San Francisco \n", "1 1652 Stockton St, San Francisco \n", "2 133 Clement Street, San Francisco \n", "3 108 Clement St, San Francisco \n", "4 1652 Stockton St, San Francisco " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dbg" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }