{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction\n", "This IPython notebook illustrates how to perform blocking using a rule-based blocker.\n", "\n", "First, we need to import the *py_entitymatching* package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n", "path_B = datasets_dir + os.sep + 'person_table_B.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDnamebirth_yearhourly_wageaddresszipcode
0a1Kevin Smith198930.0607 From St, San Francisco94107
1a2Michael Franklin198827.51652 Stockton St, San Francisco94122
2a3William Bridge198632.03131 Webster St, San Francisco94107
3a4Binto George198732.5423 Powell St, San Francisco94122
4a5Alphonse Kemper198435.01702 Post Street, San Francisco94122
\n", "
" ], "text/plain": [ " ID name birth_year hourly_wage \\\n", "0 a1 Kevin Smith 1989 30.0 \n", "1 a2 Michael Franklin 1988 27.5 \n", "2 a3 William Bridge 1986 32.0 \n", "3 a4 Binto George 1987 32.5 \n", "4 a5 Alphonse Kemper 1984 35.0 \n", "\n", " address zipcode \n", "0 607 From St, San Francisco 94107 \n", "1 1652 Stockton St, San Francisco 94122 \n", "2 3131 Webster St, San Francisco 94107 \n", "3 423 Powell St, San Francisco 94122 \n", "4 1702 Post Street, San Francisco 94122 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDnamebirth_yearhourly_wageaddresszipcode
0b1Mark Levene198729.5108 Clement St, San Francisco94107
1b2Bill Bridge198632.03131 Webster St, San Francisco94107
2b3Mike Franklin198827.51652 Stockton St, San Francisco94122
3b4Joseph Kuan198226.0108 South Park, San Francisco94122
4b5Alfons Kemper198435.0170 Post St, Apt 4, San Francisco94122
\n", "
" ], "text/plain": [ " ID name birth_year hourly_wage \\\n", "0 b1 Mark Levene 1987 29.5 \n", "1 b2 Bill Bridge 1986 32.0 \n", "2 b3 Mike Franklin 1988 27.5 \n", "3 b4 Joseph Kuan 1982 26.0 \n", "4 b5 Alfons Kemper 1984 35.0 \n", "\n", " address zipcode \n", "0 108 Clement St, San Francisco 94107 \n", "1 3131 Webster St, San Francisco 94107 \n", "2 1652 Stockton St, San Francisco 94122 \n", "3 108 South Park, San Francisco 94122 \n", "4 170 Post St, Apt 4, San Francisco 94122 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "B.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generating Features for Blocking" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "block_f = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
0ID_ID_lev_distIDIDNoneNonelev_dist<function ID_ID_lev_dist at 0x1191d6f28>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
1ID_ID_lev_simIDIDNoneNonelev_sim<function ID_ID_lev_sim at 0x1192432f0>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
2ID_ID_jarIDIDNoneNonejaro<function ID_ID_jar at 0x119243400>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
3ID_ID_jwnIDIDNoneNonejaro_winkler<function ID_ID_jwn at 0x119243378>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
4ID_ID_exmIDIDNoneNoneexact_match<function ID_ID_exm at 0x119243488>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
5ID_ID_jac_qgm_3_qgm_3IDIDqgm_3qgm_3jaccard<function ID_ID_jac_qgm_3_qgm_3 at 0x119243510>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
6name_name_jac_qgm_3_qgm_3namenameqgm_3qgm_3jaccard<function name_name_jac_qgm_3_qgm_3 at 0x119243598>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
7name_name_cos_dlm_dc0_dlm_dc0namenamedlm_dc0dlm_dc0cosine<function name_name_cos_dlm_dc0_dlm_dc0 at 0x119243620>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
8name_name_jac_dlm_dc0_dlm_dc0namenamedlm_dc0dlm_dc0jaccard<function name_name_jac_dlm_dc0_dlm_dc0 at 0x1192436a8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
9name_name_melnamenameNoneNonemonge_elkan<function name_name_mel at 0x119243730>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
10name_name_lev_distnamenameNoneNonelev_dist<function name_name_lev_dist at 0x1192437b8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
11name_name_lev_simnamenameNoneNonelev_sim<function name_name_lev_sim at 0x119243840>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
12name_name_nmwnamenameNoneNoneneedleman_wunsch<function name_name_nmw at 0x1192438c8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
13name_name_swnamenameNoneNonesmith_waterman<function name_name_sw at 0x119243950>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
14birth_year_birth_year_exmbirth_yearbirth_yearNoneNoneexact_match<function birth_year_birth_year_exm at 0x1192439d8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
15birth_year_birth_year_anmbirth_yearbirth_yearNoneNoneabs_norm<function birth_year_birth_year_anm at 0x119243a60>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
16birth_year_birth_year_lev_distbirth_yearbirth_yearNoneNonelev_dist<function birth_year_birth_year_lev_dist at 0x119243ae8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
17birth_year_birth_year_lev_simbirth_yearbirth_yearNoneNonelev_sim<function birth_year_birth_year_lev_sim at 0x119243b70>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
18hourly_wage_hourly_wage_exmhourly_wagehourly_wageNoneNoneexact_match<function hourly_wage_hourly_wage_exm at 0x119243bf8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
19hourly_wage_hourly_wage_anmhourly_wagehourly_wageNoneNoneabs_norm<function hourly_wage_hourly_wage_anm at 0x119243c80>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
20hourly_wage_hourly_wage_lev_disthourly_wagehourly_wageNoneNonelev_dist<function hourly_wage_hourly_wage_lev_dist at 0x119243d08>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
21hourly_wage_hourly_wage_lev_simhourly_wagehourly_wageNoneNonelev_sim<function hourly_wage_hourly_wage_lev_sim at 0x119243d90>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
22zipcode_zipcode_exmzipcodezipcodeNoneNoneexact_match<function zipcode_zipcode_exm at 0x119243e18>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
23zipcode_zipcode_anmzipcodezipcodeNoneNoneabs_norm<function zipcode_zipcode_anm at 0x119243ea0>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
24zipcode_zipcode_lev_distzipcodezipcodeNoneNonelev_dist<function zipcode_zipcode_lev_dist at 0x119243f28>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
25zipcode_zipcode_lev_simzipcodezipcodeNoneNonelev_sim<function zipcode_zipcode_lev_sim at 0x119254048>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute \\\n", "0 ID_ID_lev_dist ID ID \n", "1 ID_ID_lev_sim ID ID \n", "2 ID_ID_jar ID ID \n", "3 ID_ID_jwn ID ID \n", "4 ID_ID_exm ID ID \n", "5 ID_ID_jac_qgm_3_qgm_3 ID ID \n", "6 name_name_jac_qgm_3_qgm_3 name name \n", "7 name_name_cos_dlm_dc0_dlm_dc0 name name \n", "8 name_name_jac_dlm_dc0_dlm_dc0 name name \n", "9 name_name_mel name name \n", "10 name_name_lev_dist name name \n", "11 name_name_lev_sim name name \n", "12 name_name_nmw name name \n", "13 name_name_sw name name \n", "14 birth_year_birth_year_exm birth_year birth_year \n", "15 birth_year_birth_year_anm birth_year birth_year \n", "16 birth_year_birth_year_lev_dist birth_year birth_year \n", "17 birth_year_birth_year_lev_sim birth_year birth_year \n", "18 hourly_wage_hourly_wage_exm hourly_wage hourly_wage \n", "19 hourly_wage_hourly_wage_anm hourly_wage hourly_wage \n", "20 hourly_wage_hourly_wage_lev_dist hourly_wage hourly_wage \n", "21 hourly_wage_hourly_wage_lev_sim hourly_wage hourly_wage \n", "22 zipcode_zipcode_exm zipcode zipcode \n", "23 zipcode_zipcode_anm zipcode zipcode \n", "24 zipcode_zipcode_lev_dist zipcode zipcode \n", "25 zipcode_zipcode_lev_sim zipcode zipcode \n", "\n", " left_attr_tokenizer right_attr_tokenizer simfunction \\\n", "0 None None lev_dist \n", "1 None None lev_sim \n", "2 None None jaro \n", "3 None None jaro_winkler \n", "4 None None exact_match \n", "5 qgm_3 qgm_3 jaccard \n", "6 qgm_3 qgm_3 jaccard \n", "7 dlm_dc0 dlm_dc0 cosine \n", "8 dlm_dc0 dlm_dc0 jaccard \n", "9 None None monge_elkan \n", "10 None None lev_dist \n", "11 None None lev_sim \n", "12 None None needleman_wunsch \n", "13 None None smith_waterman \n", "14 None None exact_match \n", "15 None None abs_norm \n", "16 None None lev_dist \n", "17 None None lev_sim \n", "18 None None exact_match \n", "19 None None abs_norm \n", "20 None None lev_dist \n", "21 None None lev_sim \n", "22 None None exact_match \n", "23 None None abs_norm \n", "24 
None None lev_dist \n", "25 None None lev_sim \n", "\n", " function \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "5 \n", "6 \n", "7 \n", "8 \n", "9 \n", "10 \n", "11 \n", "12 \n", "13 \n", "14 \n", "15 \n", "16 \n", "17 \n", "18 \n", "19 \n", "20 \n", "21 \n", "22 \n", "23 \n", "24 \n", "25 \n", "\n", " function_source \\\n", "0 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "1 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "2 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "3 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "4 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "5 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "6 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "7 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "8 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "9 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "10 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "11 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "12 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "13 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "14 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... 
\n", "15 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "16 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "17 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "18 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "19 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "20 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "21 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "22 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "23 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "24 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "25 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... 
\n", "\n", " is_auto_generated \n", "0 True \n", "1 True \n", "2 True \n", "3 True \n", "4 True \n", "5 True \n", "6 True \n", "7 True \n", "8 True \n", "9 True \n", "10 True \n", "11 True \n", "12 True \n", "13 True \n", "14 True \n", "15 True \n", "16 True \n", "17 True \n", "18 True \n", "19 True \n", "20 True \n", "21 True \n", "22 True \n", "23 True \n", "24 True \n", "25 True " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "block_f" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('ID', 'ID'),\n", " ('name', 'name'),\n", " ('birth_year', 'birth_year'),\n", " ('hourly_wage', 'hourly_wage'),\n", " ('address', 'address'),\n", " ('zipcode', 'zipcode')]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "em._block_c['corres']" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('numeric', 'numeric', 'str_bt_1w_5w', 'numeric')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "em._atypes1['birth_year'], em._atypes1['hourly_wage'], em._atypes1['name'], em._atypes1['zipcode']" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('numeric', 'numeric', 'str_bt_1w_5w', 'numeric')" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "em._atypes2['birth_year'], em._atypes2['hourly_wage'], em._atypes2['name'], em._atypes2['zipcode']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Different Ways to Block Using Rule Based Blocker" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are three different ways to do rule-based blocking:\n", "\n", "1. Block two tables to produce a `candidate set` of tuple pairs.\n", "2. 
Block a `candidate set` of tuple pairs to produce a (typically) reduced candidate set of tuple pairs.\n", "3. Block two tuples to check if a tuple pair would get blocked." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Block Tables to Produce a Candidate Set of Tuple Pairs" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'_rule_0'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rb = em.RuleBasedBlocker()\n", "# Add rule: block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4\n", "rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "C = rb.block_tables(A, B, l_output_attrs=['name', 'address'], r_output_attrs=['name', 'address'], show_progress=False)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idltable_IDrtable_IDltable_nameltable_addressrtable_namertable_address
00a2b3Michael Franklin1652 Stockton St, San FranciscoMike Franklin1652 Stockton St, San Francisco
11a2b6Michael Franklin1652 Stockton St, San FranciscoMichael Brodie133 Clement Street, San Francisco
22a3b2William Bridge3131 Webster St, San FranciscoBill Bridge3131 Webster St, San Francisco
33a3b6William Bridge3131 Webster St, San FranciscoMichael Brodie133 Clement Street, San Francisco
44a4b2Binto George423 Powell St, San FranciscoBill Bridge3131 Webster St, San Francisco
\n", "
" ], "text/plain": [ " _id ltable_ID rtable_ID ltable_name ltable_address \\\n", "0 0 a2 b3 Michael Franklin 1652 Stockton St, San Francisco \n", "1 1 a2 b6 Michael Franklin 1652 Stockton St, San Francisco \n", "2 2 a3 b2 William Bridge 3131 Webster St, San Francisco \n", "3 3 a3 b6 William Bridge 3131 Webster St, San Francisco \n", "4 4 a4 b2 Binto George 423 Powell St, San Francisco \n", "\n", " rtable_name rtable_address \n", "0 Mike Franklin 1652 Stockton St, San Francisco \n", "1 Michael Brodie 133 Clement Street, San Francisco \n", "2 Bill Bridge 3131 Webster St, San Francisco \n", "3 Michael Brodie 133 Clement Street, San Francisco \n", "4 Bill Bridge 3131 Webster St, San Francisco " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "C.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Block Candidate Set" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'_rule_0'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rb = em.RuleBasedBlocker()\n", "rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "D = rb.block_candset(C, show_progress=False)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idltable_IDrtable_IDltable_nameltable_addressrtable_namertable_address
00a2b3Michael Franklin1652 Stockton St, San FranciscoMike Franklin1652 Stockton St, San Francisco
22a3b2William Bridge3131 Webster St, San FranciscoBill Bridge3131 Webster St, San Francisco
55a5b5Alphonse Kemper1702 Post Street, San FranciscoAlfons Kemper170 Post St, Apt 4, San Francisco
\n", "
" ], "text/plain": [ " _id ltable_ID rtable_ID ltable_name ltable_address \\\n", "0 0 a2 b3 Michael Franklin 1652 Stockton St, San Francisco \n", "2 2 a3 b2 William Bridge 3131 Webster St, San Francisco \n", "5 5 a5 b5 Alphonse Kemper 1702 Post Street, San Francisco \n", "\n", " rtable_name rtable_address \n", "0 Mike Franklin 1652 Stockton St, San Francisco \n", "2 Bill Bridge 3131 Webster St, San Francisco \n", "5 Alfons Kemper 170 Post St, Apt 4, San Francisco " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "D.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Block Two Tuples to Check If a Tuple Pair Would Get Blocked" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDnamebirth_yearhourly_wageaddresszipcode
0a1Kevin Smith198930.0607 From St, San Francisco94107
\n", "
" ], "text/plain": [ " ID name birth_year hourly_wage address \\\n", "0 a1 Kevin Smith 1989 30.0 607 From St, San Francisco \n", "\n", " zipcode \n", "0 94107 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A.loc[[0]]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDnamebirth_yearhourly_wageaddresszipcode
1b2Bill Bridge198632.03131 Webster St, San Francisco94107
\n", "
" ], "text/plain": [ " ID name birth_year hourly_wage address \\\n", "1 b2 Bill Bridge 1986 32.0 3131 Webster St, San Francisco \n", "\n", " zipcode \n", "1 94107 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "B.loc[[1]]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'_rule_1'" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rb = em.RuleBasedBlocker()\n", "# Add rule: block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4\n", "rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)\n", "rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "True\n" ] } ], "source": [ "status = rb.block_tuples(A.loc[0], B.loc[0])\n", "print(status)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 4, "nbformat_minor": 1 }