{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n", "path_B = datasets_dir + os.sep + 'person_table_B.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get features (for blocking)\n", "feature_table = em.get_features_for_blocking(A, B)\n", "\n", "# Get features (for matching)\n", "# feature_table = em.get_features_for_blocking(A, B)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Adding Features Declaratively" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add a feature to do Jaccard on title + authors and add it to F\n", "\n", "# Create a feature declaratively\n", "sim = em.get_sim_funs_for_matching()\n", "tok = em.get_tokenizers_for_matching()\n", "feature_string = \"\"\"jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()), \n", " wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))\"\"\"\n", "feature = em.get_feature_fn(feature_string, sim, tok)\n", "\n", "# Add feature to F\n", "em.add_feature(feature_table, 'jac_ws_name_address', feature)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 ID_ID_lev_dist\n", "1 ID_ID_lev_sim\n", "2 ID_ID_jar\n", "3 ID_ID_jwn\n", "4 ID_ID_exm\n", "5 ID_ID_jac_qgm_3_qgm_3\n", "6 name_name_jac_qgm_3_qgm_3\n", "7 name_name_cos_dlm_dc0_dlm_dc0\n", "8 name_name_jac_dlm_dc0_dlm_dc0\n", "9 name_name_mel\n", "10 name_name_lev_dist\n", "11 name_name_lev_sim\n", "12 name_name_nmw\n", "13 name_name_sw\n", "14 birth_year_birth_year_exm\n", "15 birth_year_birth_year_anm\n", "16 birth_year_birth_year_lev_dist\n", "17 birth_year_birth_year_lev_sim\n", "18 hourly_wage_hourly_wage_exm\n", "19 hourly_wage_hourly_wage_anm\n", "20 hourly_wage_hourly_wage_lev_dist\n", "21 hourly_wage_hourly_wage_lev_sim\n", "22 zipcode_zipcode_exm\n", "23 zipcode_zipcode_anm\n", "24 zipcode_zipcode_lev_dist\n", "25 zipcode_zipcode_lev_sim\n", "26 jac_ws_name_address\n", "Name: feature_name, dtype: object" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.feature_name" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import fuzzywuzzy.StringMatcher as fz" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.6666666666666666" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fz.ratio('xyz', 'ayz')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Create a feature declaratively\n", "sim = em.get_sim_funs_for_matching()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sim['fz_ratio'] = fz.ratio" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'abs_norm': ,\n", " 'affine': ,\n", " 'cosine': ,\n", " 'dice': ,\n", " 'exact_match': ,\n", " 'fz_ratio': ,\n", " 'hamming_dist': ,\n", " 'hamming_sim': ,\n", " 'jaccard': ,\n", " 'jaro': ,\n", " 'jaro_winkler': ,\n", " 'lev_dist': ,\n", " 'lev_sim': ,\n", " 'monge_elkan': ,\n", " 'needleman_wunsch': ,\n", " 'overlap_coeff': ,\n", " 'rel_diff': ,\n", " 'smith_waterman': }" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sim" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_string = \"\"\"fz_ratio((ltuple['name'] + ' ' + ltuple['address']).lower(), \n", " (rtuple['name'] + ' ' + rtuple['address']).lower())\"\"\"\n", "feature = em.get_feature_fn(feature_string, sim, tok)\n", "\n", "# Add feature to F\n", "em.add_feature(feature_table, 'fzratio_name_address', feature)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 ID_ID_lev_dist\n", "1 ID_ID_lev_sim\n", "2 ID_ID_jar\n", "3 ID_ID_jwn\n", "4 ID_ID_exm\n", "5 ID_ID_jac_qgm_3_qgm_3\n", "6 name_name_jac_qgm_3_qgm_3\n", "7 name_name_cos_dlm_dc0_dlm_dc0\n", "8 name_name_jac_dlm_dc0_dlm_dc0\n", "9 name_name_mel\n", "10 name_name_lev_dist\n", "11 name_name_lev_sim\n", "12 name_name_nmw\n", "13 name_name_sw\n", "14 birth_year_birth_year_exm\n", "15 birth_year_birth_year_anm\n", "16 birth_year_birth_year_lev_dist\n", "17 birth_year_birth_year_lev_sim\n", "18 hourly_wage_hourly_wage_exm\n", "19 hourly_wage_hourly_wage_anm\n", "20 hourly_wage_hourly_wage_lev_dist\n", "21 hourly_wage_hourly_wage_lev_sim\n", "22 zipcode_zipcode_exm\n", "23 zipcode_zipcode_anm\n", "24 zipcode_zipcode_lev_dist\n", "25 zipcode_zipcode_lev_sim\n", "26 jac_ws_name_address\n", "27 fzratio_name_address\n", "Name: feature_name, dtype: object" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.feature_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Adding Blackbox Features" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import fuzzywuzzy.StringMatcher as fz" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def my_feature(ltuple, rtuple):\n", " return(ltuple['name'], rtuple['name'])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "feature_table = em.get_features_for_blocking(A, B)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on function add_blackbox_feature in module py_entitymatching.feature.addfeatures:\n", "\n", "add_blackbox_feature(feature_table, feature_name, feature_function)\n", " Adds a black box feature to the feature table.\n", " \n", " Args:\n", " feature_table (DataFrame): The input DataFrame (typically a feature\n", " table) to which the feature must be added.\n", " feature_name (string): The name that should be given to the feature.\n", " feature_function (Python function): A Python function for the black box\n", " feature.\n", " \n", " Returns:\n", " A Boolean value of True is returned if the addition was successful.\n", " \n", " Raises:\n", " AssertionError: If the input `feature_table` is not of type\n", " DataFrame.\n", " AssertionError: If the input `feature_name` is not of type\n", " string.\n", " AssertionError: If the `feature_table` does not have necessary columns\n", " such as 'feature_name', 'left_attribute', 'right_attribute',\n", " 'left_attr_tokenizer',\n", " 'right_attr_tokenizer', 'simfunction', 'function', and\n", " 'function_source' in the DataFrame.\n", " AssertionError: If the `feature_name` is already present in the\n", " feature table.\n", " \n", " Examples:\n", " >>> import py_entitymatching as em\n", " >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')\n", " >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')\n", " >>> block_f = em.get_features_for_blocking(A, B)\n", " >>> def age_diff(ltuple, rtuple):\n", " >>> # assume that the tuples have age attribute and values are valid numbers.\n", " >>> return ltuple['age'] - rtuple['age']\n", " >>> status = em.add_blackbox_feature(block_f, 'age_difference', age_diff)\n", "\n" ] } ], "source": [ "help(em.add_blackbox_feature)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "em.add_blackbox_feature(feature_table, 'blackbox_fz_ratio_name', my_feature)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 ID_ID_lev_dist\n", "1 ID_ID_lev_sim\n", "2 ID_ID_jar\n", "3 ID_ID_jwn\n", "4 ID_ID_exm\n", "5 ID_ID_jac_qgm_3_qgm_3\n", "6 name_name_jac_qgm_3_qgm_3\n", "7 name_name_cos_dlm_dc0_dlm_dc0\n", "8 name_name_jac_dlm_dc0_dlm_dc0\n", "9 name_name_mel\n", "10 name_name_lev_dist\n", "11 name_name_lev_sim\n", "12 name_name_nmw\n", "13 name_name_sw\n", "14 birth_year_birth_year_exm\n", "15 birth_year_birth_year_anm\n", "16 birth_year_birth_year_lev_dist\n", "17 birth_year_birth_year_lev_sim\n", "18 hourly_wage_hourly_wage_exm\n", "19 hourly_wage_hourly_wage_anm\n", "20 hourly_wage_hourly_wage_lev_dist\n", "21 hourly_wage_hourly_wage_lev_sim\n", "22 zipcode_zipcode_exm\n", "23 zipcode_zipcode_anm\n", "24 zipcode_zipcode_lev_dist\n", "25 zipcode_zipcode_lev_sim\n", "26 blackbox_fz_ratio_name\n", "Name: feature_name, dtype: object" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.feature_name" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3.0 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }