{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction\n", "This IPython notebook illustrates how to manually generate features for blocking/matching, including how to update the inferred attribute types and attribute correspondences beforehand.\n", "\n", "First, we need to import the *py_entitymatching* package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n", "path_B = datasets_dir + os.sep + 'person_table_B.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Attribute Types" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "atypes1 = em.get_attr_types(A)\n", "atypes2 = em.get_attr_types(B)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['ID', 'zipcode', '_table', 'name', 'hourly_wage', 'address', 'birth_year'])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atypes1.keys()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('numeric', 'numeric', 
'str_bt_1w_5w', 'str_bt_1w_5w', 'numeric')" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atypes1['birth_year'], atypes1['hourly_wage'], atypes1['address'], atypes1['name'], atypes1['zipcode']" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('numeric', 'numeric', 'str_bt_5w_10w', 'str_bt_1w_5w', 'numeric')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atypes2['birth_year'], atypes2['hourly_wage'], atypes2['address'], atypes2['name'], atypes2['zipcode']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Updating Attribute Types\n", "\n", "The inferred type of `address` differs between the two tables (`str_bt_1w_5w` in A, `str_bt_5w_10w` in B), so we set it to the same type in both tables before generating features." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('str_bt_1w_5w', 'str_bt_5w_10w')" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atypes1['address'], atypes2['address']" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "atypes1['address'] = 'str_bt_1w_5w'\n", "atypes2['address'] = 'str_bt_1w_5w'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Attribute Correspondences" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "block_c = em.get_attr_corres(A, B)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['corres', 'rtable', 'ltable'])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "block_c.keys()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4509225032, 4509225032, 4509225816, 4509225816)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "id(A), id(block_c['ltable']), id(B), id(block_c['rtable'])" ] }, 
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('ID', 'ID'),\n", " ('name', 'name'),\n", " ('birth_year', 'birth_year'),\n", " ('hourly_wage', 'hourly_wage'),\n", " ('address', 'address'),\n", " ('zipcode', 'zipcode')]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "block_c['corres']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Updating Attribute Correspondences\n", "\n", "Here we overwrite the list of correspondences, dropping the `('ID', 'ID')` pair and keeping the rest." ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "block_c['corres'] = [('name', 'name'),\n", " ('birth_year', 'birth_year'),\n", " ('hourly_wage', 'hourly_wage'),\n", " ('address', 'address'),\n", " ('zipcode', 'zipcode')]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Tokenizers" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Tokenizers for blocking\n", "tok = em.get_tokenizers_for_blocking() \n", "# For matching, use the following line instead\n", "#tok = em.get_tokenizers_for_matching() " ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'alphabetic': ,\n", " 'alphanumeric': ,\n", " 'dlm_dc0': .tok_delim>,\n", " 'qgm_2': .tok_qgram>,\n", " 'qgm_3': .tok_qgram>,\n", " 'wspace': }" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Similarity Functions" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Similarity functions for blocking\n", "sim = em.get_sim_funs_for_blocking()\n", "\n", "# For matching, use the following line instead\n", "#sim = em.get_sim_funs_for_matching()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'abs_norm': ,\n", " 'affine': ,\n", " 'cosine': ,\n", " 'dice': ,\n", " 'exact_match': ,\n", 
'hamming_dist': ,\n", " 'hamming_sim': ,\n", " 'jaccard': ,\n", " 'jaro': ,\n", " 'jaro_winkler': ,\n", " 'lev_dist': ,\n", " 'lev_sim': ,\n", " 'monge_elkan': ,\n", " 'needleman_wunsch': ,\n", " 'overlap_coeff': ,\n", " 'rel_diff': ,\n", " 'smith_waterman': }" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sim" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Features" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
16address_address_jac_qgm_3_qgm_3addressaddressqgm_3qgm_3jaccard<function address_address_jac_qgm_3_qgm_3 at 0x10f959c80>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
17address_address_cos_dlm_dc0_dlm_dc0addressaddressdlm_dc0dlm_dc0cosine<function address_address_cos_dlm_dc0_dlm_dc0 at 0x10f959d08>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
18address_address_jac_dlm_dc0_dlm_dc0addressaddressdlm_dc0dlm_dc0jaccard<function address_address_jac_dlm_dc0_dlm_dc0 at 0x10f959d90>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
19address_address_meladdressaddressNoneNonemonge_elkan<function address_address_mel at 0x10f959e18>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
20address_address_lev_distaddressaddressNoneNonelev_dist<function address_address_lev_dist at 0x10f959ea0>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
21address_address_lev_simaddressaddressNoneNonelev_sim<function address_address_lev_sim at 0x10f959f28>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
22address_address_nmwaddressaddressNoneNoneneedleman_wunsch<function address_address_nmw at 0x10f9bb048>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
23address_address_swaddressaddressNoneNonesmith_waterman<function address_address_sw at 0x10f9bb0d0>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute \\\n", "16 address_address_jac_qgm_3_qgm_3 address address \n", "17 address_address_cos_dlm_dc0_dlm_dc0 address address \n", "18 address_address_jac_dlm_dc0_dlm_dc0 address address \n", "19 address_address_mel address address \n", "20 address_address_lev_dist address address \n", "21 address_address_lev_sim address address \n", "22 address_address_nmw address address \n", "23 address_address_sw address address \n", "\n", " left_attr_tokenizer right_attr_tokenizer simfunction \\\n", "16 qgm_3 qgm_3 jaccard \n", "17 dlm_dc0 dlm_dc0 cosine \n", "18 dlm_dc0 dlm_dc0 jaccard \n", "19 None None monge_elkan \n", "20 None None lev_dist \n", "21 None None lev_sim \n", "22 None None needleman_wunsch \n", "23 None None smith_waterman \n", "\n", " function \\\n", "16 \n", "17 \n", "18 \n", "19 \n", "20 \n", "21 \n", "22 \n", "23 \n", "\n", " function_source \\\n", "16 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "17 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "18 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "19 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "20 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "21 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "22 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "23 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... 
\n", "\n", " is_auto_generated \n", "16 True \n", "17 True \n", "18 True \n", "19 True \n", "20 True \n", "21 True \n", "22 True \n", "23 True " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table[feature_table.left_attribute == 'address']" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(feature_table)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 1 }