{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This IPython notebook illustrates how to perform blocking using the Overlap blocker.\n", "\n", "First, we need to import the *py_entitymatching* package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Standard library first, then third-party packages\n", "import os\n", "\n", "import pandas as pd\n", "import py_entitymatching as em" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Datasets directory under the py_entitymatching install path\n", "datasets_dir = os.path.join(em.get_install_path(), 'datasets')\n", "\n", "# Paths of the two input tables; os.path.join inserts the\n", "# platform-specific separator for us\n", "path_A = os.path.join(datasets_dir, 'person_table_A.csv')\n", "path_B = os.path.join(datasets_dir, 'person_table_B.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read both CSV files, setting 'ID' as the key attribute of each table\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | ID | \n", "name | \n", "birth_year | \n", "hourly_wage | \n", "address | \n", "zipcode | \n", "
---|---|---|---|---|---|---|
0 | \n", "a1 | \n", "Kevin Smith | \n", "1989 | \n", "30.0 | \n", "607 From St, San Francisco | \n", "94107 | \n", "
1 | \n", "a2 | \n", "Michael Franklin | \n", "1988 | \n", "27.5 | \n", "1652 Stockton St, San Francisco | \n", "94122 | \n", "
2 | \n", "a3 | \n", "William Bridge | \n", "1986 | \n", "32.0 | \n", "3131 Webster St, San Francisco | \n", "94107 | \n", "
3 | \n", "a4 | \n", "Binto George | \n", "1987 | \n", "32.5 | \n", "423 Powell St, San Francisco | \n", "94122 | \n", "
4 | \n", "a5 | \n", "Alphonse Kemper | \n", "1984 | \n", "35.0 | \n", "1702 Post Street, San Francisco | \n", "94122 | \n", "