{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import py_entitymatching package\n",
    "import py_entitymatching as em\n",
    "import os\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, read the (sample) input tables for blocking purposes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Locate the 'datasets' directory under the path returned by em.get_install_path().\n",
    "# os.path.join inserts the platform separator itself, so no manual os.sep\n",
    "# concatenation is needed and a trailing separator on the base path is harmless.\n",
    "datasets_dir = os.path.join(em.get_install_path(), 'datasets')\n",
    "\n",
    "# Build the paths of the two input tables inside that directory\n",
    "path_A = os.path.join(datasets_dir, 'person_table_A.csv')\n",
    "path_B = os.path.join(datasets_dir, 'person_table_B.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read the CSV files and set 'ID' as the key attribute\n",
    "A = em.read_csv_metadata(path_A, key='ID')\n",
    "B = em.read_csv_metadata(path_B, key='ID')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The table shows the corresponding attributes along with their respective types. Please confirm that the information has been correctly inferred. If you would like to skip this validation process in the future, please set the flag validate_inferred_attr_types equal to false.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "
\n", " | Left Attribute | \n", "Right Attribute | \n", "Left Attribute Type | \n", "Right Attribute Type | \n", "Example Features | \n", "
---|---|---|---|---|---|
0 | \n", "ID | \n", "ID | \n", "short string (1 word) | \n", "short string (1 word) | \n", "Levenshtein Distance; Levenshtein Similarity | \n", "
1 | \n", "name | \n", "name | \n", "short string (1 word to 5 words) | \n", "short string (1 word to 5 words) | \n", "Jaccard Similarity [3-grams, 3-grams]; Cosine Similarity [Space Delimiter, Space Delimiter] | \n", "
2 | \n", "birth_year | \n", "birth_year | \n", "numeric | \n", "numeric | \n", "Exact Match; Absolute Norm | \n", "
3 | \n", "hourly_wage | \n", "hourly_wage | \n", "numeric | \n", "numeric | \n", "Exact Match; Absolute Norm | \n", "
4 | \n", "address | \n", "address | \n", "short string (1 word to 5 words) | \n", "medium string (5 words to 10 words) | \n", "Not Applicable: Types do not match | \n", "
5 | \n", "zipcode | \n", "zipcode | \n", "numeric | \n", "numeric | \n", "Exact Match; Absolute Norm | \n", "