{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import py_entitymatching package\n",
    "import py_entitymatching as em\n",
    "import os\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, read the (sample) input tables for blocking purposes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Get the datasets directory\n",
    "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n",
    "\n",
    "# Get the paths of the input tables\n",
    "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n",
    "path_B = datasets_dir + os.sep + 'person_table_B.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read the CSV files and set 'ID' as the key attribute\n",
    "A = em.read_csv_metadata(path_A, key='ID')\n",
    "B = em.read_csv_metadata(path_B, key='ID')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The table shows the corresponding attributes along with their respective types. Please confirm that the information  has been correctly inferred. If you would like to skip this validation process in the future, please set the flag    validate_inferred_attr_types equal to false.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Left Attribute</th>\n",
       "      <th>Right Attribute</th>\n",
       "      <th>Left Attribute Type</th>\n",
       "      <th>Right Attribute Type</th>\n",
       "      <th>Example Features</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ID</td>\n",
       "      <td>ID</td>\n",
       "      <td>short string (1 word)</td>\n",
       "      <td>short string (1 word)</td>\n",
       "      <td>Levenshtein Distance; Levenshtein Similarity</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>name</td>\n",
       "      <td>name</td>\n",
       "      <td>short string (1 word to 5 words)</td>\n",
       "      <td>short string (1 word to 5 words)</td>\n",
       "      <td>Jaccard Similarity [3-grams, 3-grams]; Cosine Similarity [Space Delimiter, Space Delimiter]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>birth_year</td>\n",
       "      <td>birth_year</td>\n",
       "      <td>numeric</td>\n",
       "      <td>numeric</td>\n",
       "      <td>Exact Match; Absolute Norm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>hourly_wage</td>\n",
       "      <td>hourly_wage</td>\n",
       "      <td>numeric</td>\n",
       "      <td>numeric</td>\n",
       "      <td>Exact Match; Absolute Norm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>address</td>\n",
       "      <td>address</td>\n",
       "      <td>short string (1 word to 5 words)</td>\n",
       "      <td>medium string (5 words to 10 words)</td>\n",
       "      <td>Not Applicable: Types do not match</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>zipcode</td>\n",
       "      <td>zipcode</td>\n",
       "      <td>numeric</td>\n",
       "      <td>numeric</td>\n",
       "      <td>Exact Match; Absolute Norm</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Left Attribute Right Attribute               Left Attribute Type  \\\n",
       "0             ID              ID             short string (1 word)   \n",
       "1           name            name  short string (1 word to 5 words)   \n",
       "2     birth_year      birth_year                           numeric   \n",
       "3    hourly_wage     hourly_wage                           numeric   \n",
       "4        address         address  short string (1 word to 5 words)   \n",
       "5        zipcode         zipcode                           numeric   \n",
       "\n",
       "                  Right Attribute Type  \\\n",
       "0                short string (1 word)   \n",
       "1     short string (1 word to 5 words)   \n",
       "2                              numeric   \n",
       "3                              numeric   \n",
       "4  medium string (5 words to 10 words)   \n",
       "5                              numeric   \n",
       "\n",
       "                                                                              Example Features  \n",
       "0                                                 Levenshtein Distance; Levenshtein Similarity  \n",
       "1  Jaccard Similarity [3-grams, 3-grams]; Cosine Similarity [Space Delimiter, Space Delimiter]  \n",
       "2                                                                   Exact Match; Absolute Norm  \n",
       "3                                                                   Exact Match; Absolute Norm  \n",
       "4                                                           Not Applicable: Types do not match  \n",
       "5                                                                   Exact Match; Absolute Norm  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Do you want to proceed? (y/n):y\n"
     ]
    }
   ],
   "source": [
    "# Calling get_features_for_blocking or get_features_for_matching without setting the flag \n",
    "# validate_inferred_attr_types to false will result in a validation process that shows the\n",
    "# user a table containing all of the inferred attribute correspondence and inferred types.\n",
    "\n",
    "# Get features (for blocking)\n",
    "feature_table = em.get_features_for_blocking(A, B)\n",
    "\n",
    "# Get features (for matching)\n",
    "# feature_table = em.get_features_for_blocking(A, B)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Adding Features Declaratively"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Add a feature to do Jaccard on title + authors and add it to F\n",
    "\n",
    "# Create a feature declaratively\n",
    "sim = em.get_sim_funs_for_matching()\n",
    "tok = em.get_tokenizers_for_matching()\n",
    "feature_string = \"\"\"jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()), \n",
    "                            wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))\"\"\"\n",
    "feature = em.get_feature_fn(feature_string, sim, tok)\n",
    "\n",
    "# Add feature to F\n",
    "em.add_feature(feature_table, 'jac_ws_name_address', feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                       ID_ID_lev_dist\n",
       "1                        ID_ID_lev_sim\n",
       "2                            ID_ID_jar\n",
       "3                            ID_ID_jwn\n",
       "4                            ID_ID_exm\n",
       "5                ID_ID_jac_qgm_3_qgm_3\n",
       "6            name_name_jac_qgm_3_qgm_3\n",
       "7        name_name_cos_dlm_dc0_dlm_dc0\n",
       "8        name_name_jac_dlm_dc0_dlm_dc0\n",
       "9                        name_name_mel\n",
       "10                  name_name_lev_dist\n",
       "11                   name_name_lev_sim\n",
       "12                       name_name_nmw\n",
       "13                        name_name_sw\n",
       "14           birth_year_birth_year_exm\n",
       "15           birth_year_birth_year_anm\n",
       "16      birth_year_birth_year_lev_dist\n",
       "17       birth_year_birth_year_lev_sim\n",
       "18         hourly_wage_hourly_wage_exm\n",
       "19         hourly_wage_hourly_wage_anm\n",
       "20    hourly_wage_hourly_wage_lev_dist\n",
       "21     hourly_wage_hourly_wage_lev_sim\n",
       "22                 zipcode_zipcode_exm\n",
       "23                 zipcode_zipcode_anm\n",
       "24            zipcode_zipcode_lev_dist\n",
       "25             zipcode_zipcode_lev_sim\n",
       "26                 jac_ws_name_address\n",
       "Name: feature_name, dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_table.feature_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import fuzzywuzzy.StringMatcher as fz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6666666666666666"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fz.ratio('xyz', 'ayz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Create a feature declaratively\n",
    "sim = em.get_sim_funs_for_matching()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sim['fz_ratio'] = fz.ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'abs_norm': <function py_entitymatching.feature.simfunctions.abs_norm>,\n",
       " 'affine': <function py_entitymatching.feature.simfunctions.affine>,\n",
       " 'cosine': <function py_entitymatching.feature.simfunctions.cosine>,\n",
       " 'dice': <function py_entitymatching.feature.simfunctions.dice>,\n",
       " 'exact_match': <function py_entitymatching.feature.simfunctions.exact_match>,\n",
       " 'fz_ratio': <function Levenshtein._levenshtein.ratio>,\n",
       " 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>,\n",
       " 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>,\n",
       " 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>,\n",
       " 'jaro': <function py_entitymatching.feature.simfunctions.jaro>,\n",
       " 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>,\n",
       " 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>,\n",
       " 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>,\n",
       " 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>,\n",
       " 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>,\n",
       " 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>,\n",
       " 'rel_diff': <function py_entitymatching.feature.simfunctions.rel_diff>,\n",
       " 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_string = \"\"\"fz_ratio((ltuple['name'] + ' ' + ltuple['address']).lower(), \n",
    "                            (rtuple['name'] + ' ' + rtuple['address']).lower())\"\"\"\n",
    "feature = em.get_feature_fn(feature_string, sim, tok)\n",
    "\n",
    "# Add feature to F\n",
    "em.add_feature(feature_table, 'fzratio_name_address', feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                       ID_ID_lev_dist\n",
       "1                        ID_ID_lev_sim\n",
       "2                            ID_ID_jar\n",
       "3                            ID_ID_jwn\n",
       "4                            ID_ID_exm\n",
       "5                ID_ID_jac_qgm_3_qgm_3\n",
       "6            name_name_jac_qgm_3_qgm_3\n",
       "7        name_name_cos_dlm_dc0_dlm_dc0\n",
       "8        name_name_jac_dlm_dc0_dlm_dc0\n",
       "9                        name_name_mel\n",
       "10                  name_name_lev_dist\n",
       "11                   name_name_lev_sim\n",
       "12                       name_name_nmw\n",
       "13                        name_name_sw\n",
       "14           birth_year_birth_year_exm\n",
       "15           birth_year_birth_year_anm\n",
       "16      birth_year_birth_year_lev_dist\n",
       "17       birth_year_birth_year_lev_sim\n",
       "18         hourly_wage_hourly_wage_exm\n",
       "19         hourly_wage_hourly_wage_anm\n",
       "20    hourly_wage_hourly_wage_lev_dist\n",
       "21     hourly_wage_hourly_wage_lev_sim\n",
       "22                 zipcode_zipcode_exm\n",
       "23                 zipcode_zipcode_anm\n",
       "24            zipcode_zipcode_lev_dist\n",
       "25             zipcode_zipcode_lev_sim\n",
       "26                 jac_ws_name_address\n",
       "27                fzratio_name_address\n",
       "Name: feature_name, dtype: object"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_table.feature_name"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Adding Blackbox Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import fuzzywuzzy.StringMatcher as fz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def my_feature(ltuple, rtuple):\n",
    "    return(ltuple['name'], rtuple['name'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_table = em.get_features_for_blocking(A, B)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on function add_blackbox_feature in module py_entitymatching.feature.addfeatures:\n",
      "\n",
      "add_blackbox_feature(feature_table, feature_name, feature_function)\n",
      "    Adds a black box feature to the feature table.\n",
      "    \n",
      "    Args:\n",
      "        feature_table (DataFrame): The input DataFrame (typically a feature\n",
      "            table) to which the feature must be added.\n",
      "        feature_name (string): The name that should be given to the feature.\n",
      "        feature_function (Python function): A Python function for the black box\n",
      "            feature.\n",
      "    \n",
      "    Returns:\n",
      "        A Boolean value of True is returned if the addition was successful.\n",
      "    \n",
      "    Raises:\n",
      "        AssertionError: If the input `feature_table` is not of type\n",
      "            DataFrame.\n",
      "        AssertionError: If the input `feature_name` is not of type\n",
      "            string.\n",
      "        AssertionError: If the `feature_table` does not have necessary columns\n",
      "            such as 'feature_name', 'left_attribute', 'right_attribute',\n",
      "            'left_attr_tokenizer',\n",
      "            'right_attr_tokenizer', 'simfunction', 'function', and\n",
      "            'function_source' in the DataFrame.\n",
      "        AssertionError: If the `feature_name` is already present in the\n",
      "            feature table.\n",
      "    \n",
      "    Examples:\n",
      "        >>> import py_entitymatching as em\n",
      "        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')\n",
      "        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')\n",
      "        >>> block_f = em.get_features_for_blocking(A, B)\n",
      "        >>> def age_diff(ltuple, rtuple):\n",
      "        >>>     # assume that the tuples have age attribute and values are valid numbers.\n",
      "        >>>   return ltuple['age'] - rtuple['age']\n",
      "        >>> status = em.add_blackbox_feature(block_f, 'age_difference', age_diff)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "help(em.add_blackbox_feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "em.add_blackbox_feature(feature_table, 'blackbox_fz_ratio_name', my_feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                       ID_ID_lev_dist\n",
       "1                        ID_ID_lev_sim\n",
       "2                            ID_ID_jar\n",
       "3                            ID_ID_jwn\n",
       "4                            ID_ID_exm\n",
       "5                ID_ID_jac_qgm_3_qgm_3\n",
       "6            name_name_jac_qgm_3_qgm_3\n",
       "7        name_name_cos_dlm_dc0_dlm_dc0\n",
       "8        name_name_jac_dlm_dc0_dlm_dc0\n",
       "9                        name_name_mel\n",
       "10                  name_name_lev_dist\n",
       "11                   name_name_lev_sim\n",
       "12                       name_name_nmw\n",
       "13                        name_name_sw\n",
       "14           birth_year_birth_year_exm\n",
       "15           birth_year_birth_year_anm\n",
       "16      birth_year_birth_year_lev_dist\n",
       "17       birth_year_birth_year_lev_sim\n",
       "18         hourly_wage_hourly_wage_exm\n",
       "19         hourly_wage_hourly_wage_anm\n",
       "20    hourly_wage_hourly_wage_lev_dist\n",
       "21     hourly_wage_hourly_wage_lev_sim\n",
       "22                 zipcode_zipcode_exm\n",
       "23                 zipcode_zipcode_anm\n",
       "24            zipcode_zipcode_lev_dist\n",
       "25             zipcode_zipcode_lev_sim\n",
       "26              blackbox_fz_ratio_name\n",
       "Name: feature_name, dtype: object"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_table.feature_name"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}