{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction\n", "\n", "This IPython notebook illustrates how to use multiple blockers and combine the results.\n", "\n", "First, we need to import *py_entitymatching* package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n", "path_B = datasets_dir + os.sep + 'person_table_B.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Combining Multiple Blockers" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Blocking plan :\n", "\n", "#A, B --overlap blocking--> candset --attr-equiv-block--> candset |\n", "# | \n", "#A, B ------------rule-based-blocking--------------------> candset |----union--->candset\n", "# |\n", "#A, B -----------black-box-blocking----------------------> candset |\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idltable_IDrtable_IDltable_nameltable_birth_yearrtable_namertable_birth_year
00a3b2William Bridge1986Bill Bridge1986
11a2b3Michael Franklin1988Mike Franklin1988
22a5b5Alphonse Kemper1984Alfons Kemper1984
33a2b6Michael Franklin1988Michael Brodie1987
\n", "
" ], "text/plain": [ " _id ltable_ID rtable_ID ltable_name ltable_birth_year \\\n", "0 0 a3 b2 William Bridge 1986 \n", "1 1 a2 b3 Michael Franklin 1988 \n", "2 2 a5 b5 Alphonse Kemper 1984 \n", "3 3 a2 b6 Michael Franklin 1988 \n", "\n", " rtable_name rtable_birth_year \n", "0 Bill Bridge 1986 \n", "1 Mike Franklin 1988 \n", "2 Alfons Kemper 1984 \n", "3 Michael Brodie 1987 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Overlap blocking over input tables\n", "ob = em.OverlapBlocker()\n", "# block using name\n", "C = ob.block_tables(A, B, 'name', 'name', word_level=True, overlap_size=1, \n", " l_output_attrs=['name', 'birth_year'], \n", " r_output_attrs=['name', 'birth_year'],\n", " show_progress=False)\n", "C" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idltable_IDrtable_IDltable_nameltable_birth_yearrtable_namertable_birth_year
00a3b2William Bridge1986Bill Bridge1986
11a2b3Michael Franklin1988Mike Franklin1988
22a5b5Alphonse Kemper1984Alfons Kemper1984
33a2b6Michael Franklin1988Michael Brodie1987
\n", "
" ], "text/plain": [ " _id ltable_ID rtable_ID ltable_name ltable_birth_year \\\n", "0 0 a3 b2 William Bridge 1986 \n", "1 1 a2 b3 Michael Franklin 1988 \n", "2 2 a5 b5 Alphonse Kemper 1984 \n", "3 3 a2 b6 Michael Franklin 1988 \n", "\n", " rtable_name rtable_birth_year \n", "0 Bill Bridge 1986 \n", "1 Mike Franklin 1988 \n", "2 Alfons Kemper 1984 \n", "3 Michael Brodie 1987 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Overlap blocking over input tables\n", "ob = em.OverlapBlocker()\n", "# block using name\n", "C = ob.block_tables(A, B, 'name', 'name', word_level=True, overlap_size=1, \n", " l_output_attrs=['name', 'birth_year'], \n", " r_output_attrs=['name', 'birth_year'],\n", " show_progress=False)\n", "C" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Attribute equivalence blocking: block C using birth_year\n", "ab = em.AttrEquivalenceBlocker()\n", "D = ab.block_candset(C, 'birth_year', 'birth_year', show_progress=False)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idltable_IDrtable_IDltable_nameltable_birth_yearrtable_namertable_birth_year
00a3b2William Bridge1986Bill Bridge1986
11a2b3Michael Franklin1988Mike Franklin1988
22a5b5Alphonse Kemper1984Alfons Kemper1984
\n", "
" ], "text/plain": [ " _id ltable_ID rtable_ID ltable_name ltable_birth_year \\\n", "0 0 a3 b2 William Bridge 1986 \n", "1 1 a2 b3 Michael Franklin 1988 \n", "2 2 a5 b5 Alphonse Kemper 1984 \n", "\n", " rtable_name rtable_birth_year \n", "0 Bill Bridge 1986 \n", "1 Mike Franklin 1988 \n", "2 Alfons Kemper 1984 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "D" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Rule-based blocking over input tables\n", "# first get features that can be used\n", "feature_table = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'_rule_0'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create rule-based blocker\n", "rb = em.RuleBasedBlocker()\n", "# Add rule : block tuples if name_name_lev(ltuple, rtuple) < 0.4\n", "rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], feature_table)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "E = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'], show_progress=False)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Apply black box blocker\n", "# Create black box blocker\n", "bb = em.BlackBoxBlocker()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# define a black box function. \n", "# The blocker function should drop tuple pairs whose last name do not match\n", "# The function has to do the following steps\n", "# 1) Get name attributes from each of the tuples\n", "# 2) Split name attribute to get last name\n", "# 3) if last names donot match return True" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def my_function(x, y):\n", " # x, y will be of type pandas series\n", " \n", " # get name attribute\n", " x_name = x['name']\n", " y_name = y['name']\n", " # get last names\n", " x_name = x_name.split(' ')[1]\n", " y_name = y_name.split(' ')[1]\n", " # check if last names match\n", " if x_name != y_name:\n", " return True\n", " else:\n", " return False" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "bb.set_black_box_function(my_function)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "F = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'], show_progress=False)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idltable_IDrtable_IDltable_namertable_name
00a2b3Michael FranklinMike Franklin
11a3b2William BridgeBill Bridge
22a5b5Alphonse KemperAlfons Kemper
\n", "
" ], "text/plain": [ " _id ltable_ID rtable_ID ltable_name rtable_name\n", "0 0 a2 b3 Michael Franklin Mike Franklin\n", "1 1 a3 b2 William Bridge Bill Bridge\n", "2 2 a5 b5 Alphonse Kemper Alfons Kemper" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "F" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# Combine all the blocker outputs\n", "G = em.combine_blocker_outputs_via_union([D, E, F])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idltable_IDrtable_IDltable_nameltable_birth_yearrtable_namertable_birth_year
00a2b3Michael Franklin1988Mike Franklin1988
11a2b6Michael Franklin1988Michael Brodie1987
22a3b2William Bridge1986Bill Bridge1986
33a3b6William Bridge1986Michael Brodie1987
44a4b2Binto George1987Bill Bridge1986
55a5b5Alphonse Kemper1984Alfons Kemper1984
\n", "
" ], "text/plain": [ " _id ltable_ID rtable_ID ltable_name ltable_birth_year \\\n", "0 0 a2 b3 Michael Franklin 1988 \n", "1 1 a2 b6 Michael Franklin 1988 \n", "2 2 a3 b2 William Bridge 1986 \n", "3 3 a3 b6 William Bridge 1986 \n", "4 4 a4 b2 Binto George 1987 \n", "5 5 a5 b5 Alphonse Kemper 1984 \n", "\n", " rtable_name rtable_birth_year \n", "0 Mike Franklin 1988 \n", "1 Michael Brodie 1987 \n", "2 Bill Bridge 1986 \n", "3 Michael Brodie 1987 \n", "4 Bill Bridge 1986 \n", "5 Alfons Kemper 1984 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "G" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 4547208976\n", "rtable(obj.id): 4546836464\n", "key: _id\n", "fk_rtable: rtable_ID\n", "fk_ltable: ltable_ID\n", "ltable(obj.id): 4546835680\n" ] } ], "source": [ "em.show_properties(G)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 1 }