{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n", "path_B = datasets_dir + os.sep + 'person_table_B.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The table shows the corresponding attributes along with their respective types. Please confirm that the information has been correctly inferred. If you would like to skip this validation process in the future, please set the flag validate_inferred_attr_types equal to false.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Left AttributeRight AttributeLeft Attribute TypeRight Attribute TypeExample Features
0IDIDshort string (1 word)short string (1 word)Levenshtein Distance; Levenshtein Similarity
1namenameshort string (1 word to 5 words)short string (1 word to 5 words)Jaccard Similarity [3-grams, 3-grams]; Cosine Similarity [Space Delimiter, Space Delimiter]
2birth_yearbirth_yearnumericnumericExact Match; Absolute Norm
3hourly_wagehourly_wagenumericnumericExact Match; Absolute Norm
4addressaddressshort string (1 word to 5 words)medium string (5 words to 10 words)Not Applicable: Types do not match
5zipcodezipcodenumericnumericExact Match; Absolute Norm
\n", "
" ], "text/plain": [ " Left Attribute Right Attribute Left Attribute Type \\\n", "0 ID ID short string (1 word) \n", "1 name name short string (1 word to 5 words) \n", "2 birth_year birth_year numeric \n", "3 hourly_wage hourly_wage numeric \n", "4 address address short string (1 word to 5 words) \n", "5 zipcode zipcode numeric \n", "\n", " Right Attribute Type \\\n", "0 short string (1 word) \n", "1 short string (1 word to 5 words) \n", "2 numeric \n", "3 numeric \n", "4 medium string (5 words to 10 words) \n", "5 numeric \n", "\n", " Example Features \n", "0 Levenshtein Distance; Levenshtein Similarity \n", "1 Jaccard Similarity [3-grams, 3-grams]; Cosine Similarity [Space Delimiter, Space Delimiter] \n", "2 Exact Match; Absolute Norm \n", "3 Exact Match; Absolute Norm \n", "4 Not Applicable: Types do not match \n", "5 Exact Match; Absolute Norm " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Do you want to proceed? (y/n):y\n" ] } ], "source": [ "# Calling get_features_for_blocking or get_features_for_matching without setting the flag \n", "# validate_inferred_attr_types to false will result in a validation process that shows the\n", "# user a table containing all of the inferred attribute correspondence and inferred types.\n", "\n", "# Get features (for blocking)\n", "feature_table = em.get_features_for_blocking(A, B)\n", "\n", "# Get features (for matching)\n", "# feature_table = em.get_features_for_blocking(A, B)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Adding Features Declaratively" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add a feature to do Jaccard on title + authors and add it to F\n", "\n", "# Create a feature declaratively\n", "sim = em.get_sim_funs_for_matching()\n", "tok = em.get_tokenizers_for_matching()\n", "feature_string = \"\"\"jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()), \n", " wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))\"\"\"\n", "feature = em.get_feature_fn(feature_string, sim, tok)\n", "\n", "# Add feature to F\n", "em.add_feature(feature_table, 'jac_ws_name_address', feature)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 ID_ID_lev_dist\n", "1 ID_ID_lev_sim\n", "2 ID_ID_jar\n", "3 ID_ID_jwn\n", "4 ID_ID_exm\n", "5 ID_ID_jac_qgm_3_qgm_3\n", "6 name_name_jac_qgm_3_qgm_3\n", "7 name_name_cos_dlm_dc0_dlm_dc0\n", "8 name_name_jac_dlm_dc0_dlm_dc0\n", "9 name_name_mel\n", "10 name_name_lev_dist\n", "11 name_name_lev_sim\n", "12 name_name_nmw\n", "13 name_name_sw\n", "14 birth_year_birth_year_exm\n", "15 birth_year_birth_year_anm\n", "16 birth_year_birth_year_lev_dist\n", "17 birth_year_birth_year_lev_sim\n", "18 hourly_wage_hourly_wage_exm\n", "19 hourly_wage_hourly_wage_anm\n", "20 hourly_wage_hourly_wage_lev_dist\n", "21 hourly_wage_hourly_wage_lev_sim\n", "22 zipcode_zipcode_exm\n", "23 zipcode_zipcode_anm\n", "24 zipcode_zipcode_lev_dist\n", "25 zipcode_zipcode_lev_sim\n", "26 jac_ws_name_address\n", "Name: feature_name, dtype: object" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.feature_name" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import fuzzywuzzy.StringMatcher as fz" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6666666666666666" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fz.ratio('xyz', 'ayz')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Create a feature declaratively\n", "sim = em.get_sim_funs_for_matching()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sim['fz_ratio'] = fz.ratio" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'abs_norm': ,\n", " 'affine': ,\n", " 'cosine': ,\n", " 'dice': ,\n", " 'exact_match': ,\n", " 'fz_ratio': ,\n", " 'hamming_dist': ,\n", " 'hamming_sim': ,\n", " 'jaccard': ,\n", " 'jaro': ,\n", " 'jaro_winkler': ,\n", " 'lev_dist': ,\n", " 'lev_sim': ,\n", " 'monge_elkan': ,\n", " 'needleman_wunsch': ,\n", " 'overlap_coeff': ,\n", " 'rel_diff': ,\n", " 'smith_waterman': }" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sim" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_string = \"\"\"fz_ratio((ltuple['name'] + ' ' + ltuple['address']).lower(), \n", " (rtuple['name'] + ' ' + rtuple['address']).lower())\"\"\"\n", "feature = em.get_feature_fn(feature_string, sim, tok)\n", "\n", "# Add feature to F\n", "em.add_feature(feature_table, 'fzratio_name_address', feature)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 ID_ID_lev_dist\n", "1 ID_ID_lev_sim\n", "2 ID_ID_jar\n", "3 ID_ID_jwn\n", "4 ID_ID_exm\n", "5 ID_ID_jac_qgm_3_qgm_3\n", "6 name_name_jac_qgm_3_qgm_3\n", "7 name_name_cos_dlm_dc0_dlm_dc0\n", "8 name_name_jac_dlm_dc0_dlm_dc0\n", "9 name_name_mel\n", "10 name_name_lev_dist\n", "11 name_name_lev_sim\n", "12 name_name_nmw\n", "13 name_name_sw\n", "14 birth_year_birth_year_exm\n", "15 birth_year_birth_year_anm\n", "16 birth_year_birth_year_lev_dist\n", "17 birth_year_birth_year_lev_sim\n", "18 hourly_wage_hourly_wage_exm\n", "19 hourly_wage_hourly_wage_anm\n", "20 hourly_wage_hourly_wage_lev_dist\n", "21 hourly_wage_hourly_wage_lev_sim\n", "22 zipcode_zipcode_exm\n", "23 zipcode_zipcode_anm\n", "24 zipcode_zipcode_lev_dist\n", "25 zipcode_zipcode_lev_sim\n", "26 jac_ws_name_address\n", "27 fzratio_name_address\n", "Name: feature_name, dtype: object" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.feature_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Adding Blackbox Features" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import fuzzywuzzy.StringMatcher as fz" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def my_feature(ltuple, rtuple):\n", " return(ltuple['name'], rtuple['name'])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "feature_table = em.get_features_for_blocking(A, B)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on function add_blackbox_feature in module py_entitymatching.feature.addfeatures:\n", "\n", "add_blackbox_feature(feature_table, feature_name, feature_function)\n", " Adds a black box feature to the feature table.\n", " \n", " Args:\n", " feature_table (DataFrame): The input DataFrame (typically a feature\n", " table) to which the feature must be added.\n", " feature_name (string): The name that should be given to the feature.\n", " feature_function (Python function): A Python function for the black box\n", " feature.\n", " \n", " Returns:\n", " A Boolean value of True is returned if the addition was successful.\n", " \n", " Raises:\n", " AssertionError: If the input `feature_table` is not of type\n", " DataFrame.\n", " AssertionError: If the input `feature_name` is not of type\n", " string.\n", " AssertionError: If the `feature_table` does not have necessary columns\n", " such as 'feature_name', 'left_attribute', 'right_attribute',\n", " 'left_attr_tokenizer',\n", " 'right_attr_tokenizer', 'simfunction', 'function', and\n", " 'function_source' in the DataFrame.\n", " AssertionError: If the `feature_name` is already present in the\n", " feature table.\n", " \n", " Examples:\n", " >>> import py_entitymatching as em\n", " >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')\n", " >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')\n", " >>> block_f = em.get_features_for_blocking(A, B)\n", " >>> def age_diff(ltuple, rtuple):\n", " >>> # assume that the tuples have age attribute and values are valid numbers.\n", " >>> return ltuple['age'] - rtuple['age']\n", " >>> status = em.add_blackbox_feature(block_f, 'age_difference', age_diff)\n", "\n" ] } ], "source": [ "help(em.add_blackbox_feature)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "em.add_blackbox_feature(feature_table, 'blackbox_fz_ratio_name', my_feature)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 ID_ID_lev_dist\n", "1 ID_ID_lev_sim\n", "2 ID_ID_jar\n", "3 ID_ID_jwn\n", "4 ID_ID_exm\n", "5 ID_ID_jac_qgm_3_qgm_3\n", "6 name_name_jac_qgm_3_qgm_3\n", "7 name_name_cos_dlm_dc0_dlm_dc0\n", "8 name_name_jac_dlm_dc0_dlm_dc0\n", "9 name_name_mel\n", "10 name_name_lev_dist\n", "11 name_name_lev_sim\n", "12 name_name_nmw\n", "13 name_name_sw\n", "14 birth_year_birth_year_exm\n", "15 birth_year_birth_year_anm\n", "16 birth_year_birth_year_lev_dist\n", "17 birth_year_birth_year_lev_sim\n", "18 hourly_wage_hourly_wage_exm\n", "19 hourly_wage_hourly_wage_anm\n", "20 hourly_wage_hourly_wage_lev_dist\n", "21 hourly_wage_hourly_wage_lev_sim\n", "22 zipcode_zipcode_exm\n", "23 zipcode_zipcode_anm\n", "24 zipcode_zipcode_lev_dist\n", "25 zipcode_zipcode_lev_sim\n", "26 blackbox_fz_ratio_name\n", "Name: feature_name, dtype: object" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.feature_name" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 1 }