{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This IPython notebook illustrates how to perform blocking using the Overlap blocker.\n", "\n", "First, we need to import the *py_entitymatching* package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n", "path_B = datasets_dir + os.sep + 'person_table_B.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | ID | \n", "name | \n", "birth_year | \n", "hourly_wage | \n", "address | \n", "zipcode | \n", "
---|---|---|---|---|---|---|
0 | \n", "a1 | \n", "Kevin Smith | \n", "1989 | \n", "30.0 | \n", "607 From St, San Francisco | \n", "94107 | \n", "
1 | \n", "a2 | \n", "Michael Franklin | \n", "1988 | \n", "27.5 | \n", "1652 Stockton St, San Francisco | \n", "94122 | \n", "
2 | \n", "a3 | \n", "William Bridge | \n", "1986 | \n", "32.0 | \n", "3131 Webster St, San Francisco | \n", "94107 | \n", "
3 | \n", "a4 | \n", "Binto George | \n", "1987 | \n", "32.5 | \n", "423 Powell St, San Francisco | \n", "94122 | \n", "
4 | \n", "a5 | \n", "Alphonse Kemper | \n", "1984 | \n", "35.0 | \n", "1702 Post Street, San Francisco | \n", "94122 | \n", "
\n", " | _id | \n", "ltable_ID | \n", "rtable_ID | \n", "ltable_name | \n", "ltable_birth_year | \n", "ltable_address | \n", "rtable_name | \n", "rtable_birth_year | \n", "rtable_address | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "a1 | \n", "b1 | \n", "Kevin Smith | \n", "1989 | \n", "607 From St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
1 | \n", "1 | \n", "a2 | \n", "b1 | \n", "Michael Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
2 | \n", "2 | \n", "a3 | \n", "b1 | \n", "William Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
3 | \n", "3 | \n", "a4 | \n", "b1 | \n", "Binto George | \n", "1987 | \n", "423 Powell St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
4 | \n", "4 | \n", "a1 | \n", "b2 | \n", "Kevin Smith | \n", "1989 | \n", "607 From St, San Francisco | \n", "Bill Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "
\n", " | _id | \n", "ltable_ID | \n", "rtable_ID | \n", "ltable_name | \n", "ltable_birth_year | \n", "ltable_address | \n", "rtable_name | \n", "rtable_birth_year | \n", "rtable_address | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "a1 | \n", "b1 | \n", "Kevin Smith | \n", "1989 | \n", "607 From St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
1 | \n", "1 | \n", "a2 | \n", "b1 | \n", "Michael Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
2 | \n", "2 | \n", "a3 | \n", "b1 | \n", "William Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
3 | \n", "3 | \n", "a4 | \n", "b1 | \n", "Binto George | \n", "1987 | \n", "423 Powell St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
4 | \n", "4 | \n", "a5 | \n", "b1 | \n", "Alphonse Kemper | \n", "1984 | \n", "1702 Post Street, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
\n", " | _id | \n", "ltable_ID | \n", "rtable_ID | \n", "ltable_name | \n", "ltable_birth_year | \n", "ltable_address | \n", "rtable_name | \n", "rtable_birth_year | \n", "rtable_address | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "a1 | \n", "b1 | \n", "Kevin Smith | \n", "1989 | \n", "607 From St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
1 | \n", "1 | \n", "a2 | \n", "b1 | \n", "Michael Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
2 | \n", "2 | \n", "a3 | \n", "b1 | \n", "William Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
3 | \n", "3 | \n", "a4 | \n", "b1 | \n", "Binto George | \n", "1987 | \n", "423 Powell St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
4 | \n", "4 | \n", "a1 | \n", "b2 | \n", "Kevin Smith | \n", "1989 | \n", "607 From St, San Francisco | \n", "Bill Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "
\n", " | _id | \n", "ltable_ID | \n", "rtable_ID | \n", "ltable_name | \n", "ltable_birth_year | \n", "ltable_address | \n", "rtable_name | \n", "rtable_birth_year | \n", "rtable_address | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "a1 | \n", "b1 | \n", "Kevin Smith | \n", "1989 | \n", "607 From St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
1 | \n", "1 | \n", "a2 | \n", "b1 | \n", "Michael Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
2 | \n", "2 | \n", "a3 | \n", "b1 | \n", "William Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
3 | \n", "3 | \n", "a4 | \n", "b1 | \n", "Binto George | \n", "1987 | \n", "423 Powell St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
4 | \n", "4 | \n", "a1 | \n", "b2 | \n", "Kevin Smith | \n", "1989 | \n", "607 From St, San Francisco | \n", "Bill Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "
\n", " | _id | \n", "ltable_ID | \n", "rtable_ID | \n", "ltable_name | \n", "ltable_birth_year | \n", "ltable_address | \n", "rtable_name | \n", "rtable_birth_year | \n", "rtable_address | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "a2 | \n", "b1 | \n", "Michael Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
1 | \n", "1 | \n", "a3 | \n", "b1 | \n", "William Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
2 | \n", "2 | \n", "a4 | \n", "b1 | \n", "Binto George | \n", "1987 | \n", "423 Powell St, San Francisco | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
3 | \n", "3 | \n", "a2 | \n", "b2 | \n", "Michael Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "Bill Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "
4 | \n", "4 | \n", "a3 | \n", "b2 | \n", "William Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "Bill Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "
5 | \n", "5 | \n", "a4 | \n", "b2 | \n", "Binto George | \n", "1987 | \n", "423 Powell St, San Francisco | \n", "Bill Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "
6 | \n", "6 | \n", "a2 | \n", "b3 | \n", "Michael Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "Mike Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "
7 | \n", "7 | \n", "a3 | \n", "b3 | \n", "William Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "Mike Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "
8 | \n", "8 | \n", "a4 | \n", "b3 | \n", "Binto George | \n", "1987 | \n", "423 Powell St, San Francisco | \n", "Mike Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "
9 | \n", "9 | \n", "a2 | \n", "b5 | \n", "Michael Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "Alfons Kemper | \n", "1984 | \n", "170 Post St, Apt 4, San Francisco | \n", "
10 | \n", "10 | \n", "a3 | \n", "b5 | \n", "William Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "Alfons Kemper | \n", "1984 | \n", "170 Post St, Apt 4, San Francisco | \n", "
11 | \n", "11 | \n", "a4 | \n", "b5 | \n", "Binto George | \n", "1987 | \n", "423 Powell St, San Francisco | \n", "Alfons Kemper | \n", "1984 | \n", "170 Post St, Apt 4, San Francisco | \n", "
12 | \n", "12 | \n", "a5 | \n", "b5 | \n", "Alphonse Kemper | \n", "1984 | \n", "1702 Post Street, San Francisco | \n", "Alfons Kemper | \n", "1984 | \n", "170 Post St, Apt 4, San Francisco | \n", "
13 | \n", "13 | \n", "a5 | \n", "b6 | \n", "Alphonse Kemper | \n", "1984 | \n", "1702 Post Street, San Francisco | \n", "Michael Brodie | \n", "1987 | \n", "133 Clement Street, San Francisco | \n", "
0 | \n", "14 | \n", "a1 | \n", "b1 | \n", "Kevin Smith | \n", "1989 | \n", "NaN | \n", "Mark Levene | \n", "1987 | \n", "108 Clement St, San Francisco | \n", "
1 | \n", "15 | \n", "a1 | \n", "b2 | \n", "Kevin Smith | \n", "1989 | \n", "NaN | \n", "Bill Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "
2 | \n", "16 | \n", "a1 | \n", "b3 | \n", "Kevin Smith | \n", "1989 | \n", "NaN | \n", "Mike Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "
3 | \n", "17 | \n", "a1 | \n", "b4 | \n", "Kevin Smith | \n", "1989 | \n", "NaN | \n", "Joseph Kuan | \n", "1982 | \n", "108 South Park, San Francisco | \n", "
4 | \n", "18 | \n", "a1 | \n", "b5 | \n", "Kevin Smith | \n", "1989 | \n", "NaN | \n", "Alfons Kemper | \n", "1984 | \n", "170 Post St, Apt 4, San Francisco | \n", "
5 | \n", "19 | \n", "a1 | \n", "b6 | \n", "Kevin Smith | \n", "1989 | \n", "NaN | \n", "Michael Brodie | \n", "1987 | \n", "133 Clement Street, San Francisco | \n", "
\n", " | _id | \n", "ltable_ID | \n", "rtable_ID | \n", "ltable_name | \n", "ltable_birth_year | \n", "ltable_address | \n", "rtable_name | \n", "rtable_birth_year | \n", "rtable_address | \n", "
---|---|---|---|---|---|---|---|---|---|
6 | \n", "6 | \n", "a3 | \n", "b2 | \n", "William Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "Bill Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "
9 | \n", "9 | \n", "a2 | \n", "b3 | \n", "Michael Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "Mike Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "
16 | \n", "16 | \n", "a5 | \n", "b5 | \n", "Alphonse Kemper | \n", "1984 | \n", "1702 Post Street, San Francisco | \n", "Alfons Kemper | \n", "1984 | \n", "170 Post St, Apt 4, San Francisco | \n", "
\n", " | _id | \n", "ltable_ID | \n", "rtable_ID | \n", "ltable_name | \n", "ltable_birth_year | \n", "ltable_address | \n", "rtable_name | \n", "rtable_birth_year | \n", "rtable_address | \n", "
---|---|---|---|---|---|---|---|---|---|
6 | \n", "6 | \n", "a3 | \n", "b2 | \n", "William Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "Bill Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "
7 | \n", "7 | \n", "a4 | \n", "b2 | \n", "Binto George | \n", "1987 | \n", "423 Powell St, San Francisco | \n", "Bill Bridge | \n", "1986 | \n", "3131 Webster St, San Francisco | \n", "
8 | \n", "8 | \n", "a1 | \n", "b3 | \n", "Kevin Smith | \n", "1989 | \n", "607 From St, San Francisco | \n", "Mike Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "
9 | \n", "9 | \n", "a2 | \n", "b3 | \n", "Michael Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "Mike Franklin | \n", "1988 | \n", "1652 Stockton St, San Francisco | \n", "
16 | \n", "16 | \n", "a5 | \n", "b5 | \n", "Alphonse Kemper | \n", "1984 | \n", "1702 Post Street, San Francisco | \n", "Alfons Kemper | \n", "1984 | \n", "170 Post St, Apt 4, San Francisco | \n", "
\n", " | ID | \n", "name | \n", "birth_year | \n", "hourly_wage | \n", "address | \n", "zipcode | \n", "
---|---|---|---|---|---|---|
0 | \n", "a1 | \n", "Kevin Smith | \n", "1989 | \n", "30.0 | \n", "607 From St, San Francisco | \n", "94107 | \n", "
\n", " | ID | \n", "name | \n", "birth_year | \n", "hourly_wage | \n", "address | \n", "zipcode | \n", "
---|---|---|---|---|---|---|
0 | \n", "b1 | \n", "Mark Levene | \n", "1987 | \n", "29.5 | \n", "108 Clement St, San Francisco | \n", "94107 | \n", "