{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction\n", "This IPython notebook illustrates how to generate features for blocking/matching manually.\n", "\n", "First, we need to import *py_entitymatching* package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes." ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n", "path_B = datasets_dir + os.sep + 'person_table_B.csv'" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generating Features for Manually" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Attribute Types" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "atypes1 = em.get_attr_types(A)\n", "atypes2 = em.get_attr_types(B)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dict_keys(['ID', '_table', 'birth_year', 'hourly_wage', 'address', 'name', 'zipcode'])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atypes1.keys()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('numeric', 'numeric', 'str_bt_1w_5w', 'str_bt_1w_5w', 'numeric')" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atypes1['birth_year'], atypes1['hourly_wage'], atypes1['address'], atypes1['name'], atypes1['zipcode']" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('numeric', 'numeric', 'str_bt_5w_10w', 'str_bt_1w_5w', 'numeric')" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atypes2['birth_year'], atypes2['hourly_wage'], atypes2['address'], atypes2['name'], atypes2['zipcode']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Attribute Correspondences" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "block_c = em.get_attr_corres(A, B)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dict_keys(['rtable', 'ltable', 'corres'])" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "block_c.keys()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(4635705184, 4635705184, 4635959984, 4635959984)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"id(A), id(block_c['ltable']), id(B), id(block_c['rtable'])" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('ID', 'ID'),\n", " ('name', 'name'),\n", " ('birth_year', 'birth_year'),\n", " ('hourly_wage', 'hourly_wage'),\n", " ('address', 'address'),\n", " ('zipcode', 'zipcode')]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "block_c['corres']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Tokenizers" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# for blocking\n", "tok = em.get_tokenizers_for_blocking()\n", "# for matching\n", "# tok = em.get_tokenizers_for_matching()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'alphabetic': ,\n", " 'alphanumeric': ,\n", " 'dlm_dc0': .tok_delim>,\n", " 'qgm_2': .tok_qgram>,\n", " 'qgm_3': .tok_qgram>,\n", " 'wspace': }" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Similarity Functions" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# for blocking\n", "sim = em.get_sim_funs_for_blocking()\n", "\n", "# for matching\n", "# sim = em.get_sim_funs_for_matching()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'abs_norm': ,\n", " 'affine': ,\n", " 'cosine': ,\n", " 'dice': ,\n", " 'exact_match': ,\n", " 'hamming_dist': ,\n", " 'hamming_sim': ,\n", " 'jaccard': ,\n", " 'jaro': ,\n", " 'jaro_winkler': ,\n", " 'lev_dist': ,\n", " 'lev_sim': ,\n", " 'monge_elkan': ,\n", " 'needleman_wunsch': ,\n", " 'overlap_coeff': ,\n", " 'rel_diff': ,\n", " 'smith_waterman': }" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sim" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Features" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [], "source": [ "feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
0ID_ID_lev_distIDIDNoneNonelev_dist<function ID_ID_lev_dist at 0x11452b378>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
1ID_ID_lev_simIDIDNoneNonelev_sim<function ID_ID_lev_sim at 0x114515d08>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
2ID_ID_jarIDIDNoneNonejaro<function ID_ID_jar at 0x11452b158>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
3ID_ID_jwnIDIDNoneNonejaro_winkler<function ID_ID_jwn at 0x11452b048>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
4ID_ID_exmIDIDNoneNoneexact_match<function ID_ID_exm at 0x11452b400>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute left_attr_tokenizer \\\n", "0 ID_ID_lev_dist ID ID None \n", "1 ID_ID_lev_sim ID ID None \n", "2 ID_ID_jar ID ID None \n", "3 ID_ID_jwn ID ID None \n", "4 ID_ID_exm ID ID None \n", "\n", " right_attr_tokenizer simfunction \\\n", "0 None lev_dist \n", "1 None lev_sim \n", "2 None jaro \n", "3 None jaro_winkler \n", "4 None exact_match \n", "\n", " function \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "\n", " function_source \\\n", "0 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "1 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "2 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "3 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "4 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "\n", " is_auto_generated \n", "0 True \n", "1 True \n", "2 True \n", "3 True \n", "4 True " ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.head()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(feature_table)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }