# Introduction
This IPython notebook illustrates how to perform blocking using a rule-based blocker.

First, we need to import the *py_entitymatching* package and other libraries as follows:

In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd



Then, read the (sample) input tables for blocking purposes.

In [2]:
# Get the datasets directory located under the py_entitymatching install path
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Get the paths of the input tables; os.path.join inserts the platform
# separator and avoids a doubled separator if a component already ends with one
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')

In [3]:
# Load both CSV files (A first, then B), registering 'ID' as the key attribute of each
A, B = (em.read_csv_metadata(csv_path, key='ID') for csv_path in (path_A, path_B))

In [4]:
# Preview the first rows of table A
A.head()

Unnamed: 0,ID,name,birth_year,hourly_wage,address,zipcode
0,a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107
1,a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122
2,a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107
3,a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122
4,a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122


In [5]:
# Preview the first rows of table B
B.head()

Unnamed: 0,ID,name,birth_year,hourly_wage,address,zipcode
0,b1,Mark Levene,1987,29.5,"108 Clement St, San Francisco",94107
1,b2,Bill Bridge,1986,32.0,"3131 Webster St, San Francisco",94107
2,b3,Mike Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122
3,b4,Joseph Kuan,1982,26.0,"108 South Park, San Francisco",94122
4,b5,Alfons Kemper,1984,35.0,"170 Post St, Apt 4, San Francisco",94122


# Generating Features for Blocking

In [6]:
# Generate the feature table used for blocking from tables A and B;
# blocking rules added later refer to these features by name
block_f = em.get_features_for_blocking(A, B)

In [22]:
# Display the generated feature table (one row per feature)
block_f

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,ID_ID_lev_dist,ID,ID,,,lev_dist,<function ID_ID_lev_dist at 0x103fae7b8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,ID_ID_lev_sim,ID,ID,,,lev_sim,<function ID_ID_lev_sim at 0x114358268>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,ID_ID_jar,ID,ID,,,jaro,<function ID_ID_jar at 0x1143581e0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,ID_ID_jwn,ID,ID,,,jaro_winkler,<function ID_ID_jwn at 0x114358400>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
4,ID_ID_exm,ID,ID,,,exact_match,<function ID_ID_exm at 0x114358598>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
5,ID_ID_jac_qgm_3_qgm_3,ID,ID,qgm_3,qgm_3,jaccard,<function ID_ID_jac_qgm_3_qgm_3 at 0x114358620>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
6,name_name_jac_qgm_3_qgm_3,name,name,qgm_3,qgm_3,jaccard,<function name_name_jac_qgm_3_qgm_3 at 0x1143586a8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
7,name_name_cos_dlm_dc0_dlm_dc0,name,name,dlm_dc0,dlm_dc0,cosine,<function name_name_cos_dlm_dc0_dlm_dc0 at 0x114358730>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
8,name_name_jac_dlm_dc0_dlm_dc0,name,name,dlm_dc0,dlm_dc0,jaccard,<function name_name_jac_dlm_dc0_dlm_dc0 at 0x1143587b8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
9,name_name_mel,name,name,,,monge_elkan,<function name_name_mel at 0x114358840>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


In [10]:
# Attribute correspondences between A and B. Computed through the public
# helper rather than the underscore-prefixed module attribute em._block_c,
# so this cell does not depend on state left behind by an earlier call.
em.get_attr_corres(A, B)['corres']

[('ID', 'ID'),
 ('name', 'name'),
 ('birth_year', 'birth_year'),
 ('hourly_wage', 'hourly_wage'),
 ('address', 'address'),
 ('zipcode', 'zipcode')]

In [31]:
# Attribute types inferred for table A. Computed through the public helper
# rather than the underscore-prefixed module attribute em._atypes1, so this
# cell does not depend on state left behind by an earlier call.
atypes_A = em.get_attr_types(A)
atypes_A['birth_year'], atypes_A['hourly_wage'], atypes_A['name'], atypes_A['zipcode']

('numeric', 'numeric', 'str_bt_1w_5w', 'numeric')

In [32]:
# Attribute types inferred for table B. Computed through the public helper
# rather than the underscore-prefixed module attribute em._atypes2, so this
# cell does not depend on state left behind by an earlier call.
atypes_B = em.get_attr_types(B)
atypes_B['birth_year'], atypes_B['hourly_wage'], atypes_B['name'], atypes_B['zipcode']

('numeric', 'numeric', 'str_bt_1w_5w', 'numeric')

# Different Ways to Block Using Rule Based Blocker

There are three different ways to do rule-based blocking:

1. Block two tables to produce a `candidate set` of tuple pairs.
2. Block a `candidate set` of tuple pairs to typically produce a reduced candidate set of tuple pairs.
3. Block two tuples to check if a tuple pair would get blocked.

## Block Tables to Produce a Candidate Set of Tuple Pairs

In [16]:
# Create a rule-based blocker and register a single rule with it
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
# (the feature name in the rule string is resolved against block_f)
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)

'_rule_0'

In [20]:
# Apply the blocker to tables A and B, carrying the name and address
# columns of both tables into the resulting candidate set
C = rb.block_tables(
    A,
    B,
    l_output_attrs=['name', 'address'],
    r_output_attrs=['name', 'address'],
    show_progress=False,
)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:00


In [21]:
# Preview the first rows of the candidate set
C.head()

Unnamed: 0,_id,ltable_ID,rtable_ID,ltable_name,ltable_address,rtable_name,rtable_address
0,0,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco"
1,1,a2,b6,Michael Franklin,"1652 Stockton St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco"
2,2,a3,b2,William Bridge,"3131 Webster St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco"
3,3,a3,b6,William Bridge,"3131 Webster St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco"
4,4,a4,b2,Binto George,"423 Powell St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco"


## Block Candidate Set

In [28]:
# Start from a fresh blocker so that only the birth-year rule applies
rb = em.RuleBasedBlocker()
# Add rule : block tuples if birth_year_birth_year_exm(ltuple, rtuple) == 0
rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)

'_rule_0'

In [29]:
# Apply the blocker to the existing candidate set C to obtain a reduced set D
D = rb.block_candset(C, show_progress=False)

0%  100%
[      ]0%  100%
[######] | ETA: 00:00:00
Total time elapsed: 00:00:00


In [30]:
# Preview the first rows of the reduced candidate set
D.head()

Unnamed: 0,_id,ltable_ID,rtable_ID,ltable_name,ltable_address,rtable_name,rtable_address
0,0,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco"
2,2,a3,b2,William Bridge,"3131 Webster St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco"
5,5,a5,b5,Alphonse Kemper,"1702 Post Street, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco"


## Block Two Tuples to Check If a Tuple Pair Would Get Blocked

In [33]:
# Show the row of A labelled 0 as a one-row DataFrame.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
A.loc[[0]]

Unnamed: 0,ID,name,birth_year,hourly_wage,address,zipcode
0,a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107


In [34]:
# Show the row of B labelled 1 as a one-row DataFrame.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
B.loc[[1]]

Unnamed: 0,ID,name,birth_year,hourly_wage,address,zipcode
1,b2,Bill Bridge,1986,32.0,"3131 Webster St, San Francisco",94107


In [36]:
# Create a fresh blocker and register two rules with it
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)
# Add rule : block tuples if birth_year_birth_year_exm(ltuple, rtuple) == 0
rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)

'_rule_1'

In [38]:
# Check whether the tuple pair (row 0 of A, row 1 of B) gets blocked by the
# rules registered in rb. Row 1 of B is used so that the pair matches the
# B tuple displayed in the preceding cell.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
status = rb.block_tuples(A.loc[0], B.loc[1])
print(status)

True
