{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Import py_entitymatching package\n",
    "import py_entitymatching as em\n",
    "import os\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, read the (sample) input tables for blocking purposes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Get the datasets directory\n",
    "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n",
    "\n",
    "# Get the paths of the input tables\n",
    "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n",
    "path_B = datasets_dir + os.sep + 'person_table_B.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read the CSV files and set 'ID' as the key attribute\n",
    "A = em.read_csv_metadata(path_A, key='ID')\n",
    "B = em.read_csv_metadata(path_B, key='ID')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Get features (for blocking)\n",
    "feature_table = em.get_features_for_blocking(A, B)\n",
    "\n",
    "# Get features (for matching)\n",
    "# feature_table = em.get_features_for_blocking(A, B)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Adding Features Declaratively"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Add a feature to do Jaccard on title + authors and add it to F\n",
    "\n",
    "# Create a feature declaratively\n",
    "sim = em.get_sim_funs_for_matching()\n",
    "tok = em.get_tokenizers_for_matching()\n",
    "feature_string = \"\"\"jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()), \n",
    "                            wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))\"\"\"\n",
    "feature = em.get_feature_fn(feature_string, sim, tok)\n",
    "\n",
    "# Add feature to F\n",
    "em.add_feature(feature_table, 'jac_ws_name_address', feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                       ID_ID_lev_dist\n",
       "1                        ID_ID_lev_sim\n",
       "2                            ID_ID_jar\n",
       "3                            ID_ID_jwn\n",
       "4                            ID_ID_exm\n",
       "5                ID_ID_jac_qgm_3_qgm_3\n",
       "6            name_name_jac_qgm_3_qgm_3\n",
       "7        name_name_cos_dlm_dc0_dlm_dc0\n",
       "8        name_name_jac_dlm_dc0_dlm_dc0\n",
       "9                        name_name_mel\n",
       "10                  name_name_lev_dist\n",
       "11                   name_name_lev_sim\n",
       "12                       name_name_nmw\n",
       "13                        name_name_sw\n",
       "14           birth_year_birth_year_exm\n",
       "15           birth_year_birth_year_anm\n",
       "16      birth_year_birth_year_lev_dist\n",
       "17       birth_year_birth_year_lev_sim\n",
       "18         hourly_wage_hourly_wage_exm\n",
       "19         hourly_wage_hourly_wage_anm\n",
       "20    hourly_wage_hourly_wage_lev_dist\n",
       "21     hourly_wage_hourly_wage_lev_sim\n",
       "22                 zipcode_zipcode_exm\n",
       "23                 zipcode_zipcode_anm\n",
       "24            zipcode_zipcode_lev_dist\n",
       "25             zipcode_zipcode_lev_sim\n",
       "26                 jac_ws_name_address\n",
       "Name: feature_name, dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_table.feature_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import fuzzywuzzy.StringMatcher as fz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6666666666666666"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fz.ratio('xyz', 'ayz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Create a feature declaratively\n",
    "sim = em.get_sim_funs_for_matching()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sim['fz_ratio'] = fz.ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'abs_norm': <function py_entitymatching.feature.simfunctions.abs_norm>,\n",
       " 'affine': <function py_entitymatching.feature.simfunctions.affine>,\n",
       " 'cosine': <function py_entitymatching.feature.simfunctions.cosine>,\n",
       " 'dice': <function py_entitymatching.feature.simfunctions.dice>,\n",
       " 'exact_match': <function py_entitymatching.feature.simfunctions.exact_match>,\n",
       " 'fz_ratio': <function Levenshtein._levenshtein.ratio>,\n",
       " 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>,\n",
       " 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>,\n",
       " 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>,\n",
       " 'jaro': <function py_entitymatching.feature.simfunctions.jaro>,\n",
       " 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>,\n",
       " 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>,\n",
       " 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>,\n",
       " 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>,\n",
       " 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>,\n",
       " 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>,\n",
       " 'rel_diff': <function py_entitymatching.feature.simfunctions.rel_diff>,\n",
       " 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_string = \"\"\"fz_ratio((ltuple['name'] + ' ' + ltuple['address']).lower(), \n",
    "                            (rtuple['name'] + ' ' + rtuple['address']).lower())\"\"\"\n",
    "feature = em.get_feature_fn(feature_string, sim, tok)\n",
    "\n",
    "# Add feature to F\n",
    "em.add_feature(feature_table, 'fzratio_name_address', feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                       ID_ID_lev_dist\n",
       "1                        ID_ID_lev_sim\n",
       "2                            ID_ID_jar\n",
       "3                            ID_ID_jwn\n",
       "4                            ID_ID_exm\n",
       "5                ID_ID_jac_qgm_3_qgm_3\n",
       "6            name_name_jac_qgm_3_qgm_3\n",
       "7        name_name_cos_dlm_dc0_dlm_dc0\n",
       "8        name_name_jac_dlm_dc0_dlm_dc0\n",
       "9                        name_name_mel\n",
       "10                  name_name_lev_dist\n",
       "11                   name_name_lev_sim\n",
       "12                       name_name_nmw\n",
       "13                        name_name_sw\n",
       "14           birth_year_birth_year_exm\n",
       "15           birth_year_birth_year_anm\n",
       "16      birth_year_birth_year_lev_dist\n",
       "17       birth_year_birth_year_lev_sim\n",
       "18         hourly_wage_hourly_wage_exm\n",
       "19         hourly_wage_hourly_wage_anm\n",
       "20    hourly_wage_hourly_wage_lev_dist\n",
       "21     hourly_wage_hourly_wage_lev_sim\n",
       "22                 zipcode_zipcode_exm\n",
       "23                 zipcode_zipcode_anm\n",
       "24            zipcode_zipcode_lev_dist\n",
       "25             zipcode_zipcode_lev_sim\n",
       "26                 jac_ws_name_address\n",
       "27                fzratio_name_address\n",
       "Name: feature_name, dtype: object"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_table.feature_name"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Adding Blackbox Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import fuzzywuzzy.StringMatcher as fz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def my_feature(ltuple, rtuple):\n",
    "    return(ltuple['name'], rtuple['name'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "feature_table = em.get_features_for_blocking(A, B)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on function add_blackbox_feature in module py_entitymatching.feature.addfeatures:\n",
      "\n",
      "add_blackbox_feature(feature_table, feature_name, feature_function)\n",
      "    Adds a black box feature to the feature table.\n",
      "    \n",
      "    Args:\n",
      "        feature_table (DataFrame): The input DataFrame (typically a feature\n",
      "            table) to which the feature must be added.\n",
      "        feature_name (string): The name that should be given to the feature.\n",
      "        feature_function (Python function): A Python function for the black box\n",
      "            feature.\n",
      "    \n",
      "    Returns:\n",
      "        A Boolean value of True is returned if the addition was successful.\n",
      "    \n",
      "    Raises:\n",
      "        AssertionError: If the input `feature_table` is not of type\n",
      "            DataFrame.\n",
      "        AssertionError: If the input `feature_name` is not of type\n",
      "            string.\n",
      "        AssertionError: If the `feature_table` does not have necessary columns\n",
      "            such as 'feature_name', 'left_attribute', 'right_attribute',\n",
      "            'left_attr_tokenizer',\n",
      "            'right_attr_tokenizer', 'simfunction', 'function', and\n",
      "            'function_source' in the DataFrame.\n",
      "        AssertionError: If the `feature_name` is already present in the\n",
      "            feature table.\n",
      "    \n",
      "    Examples:\n",
      "        >>> import py_entitymatching as em\n",
      "        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')\n",
      "        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')\n",
      "        >>> block_f = em.get_features_for_blocking(A, B)\n",
      "        >>> def age_diff(ltuple, rtuple):\n",
      "        >>>     # assume that the tuples have age attribute and values are valid numbers.\n",
      "        >>>   return ltuple['age'] - rtuple['age']\n",
      "        >>> status = em.add_blackbox_feature(block_f, 'age_difference', age_diff)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "help(em.add_blackbox_feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "em.add_blackbox_feature(feature_table, 'blackbox_fz_ratio_name', my_feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                       ID_ID_lev_dist\n",
       "1                        ID_ID_lev_sim\n",
       "2                            ID_ID_jar\n",
       "3                            ID_ID_jwn\n",
       "4                            ID_ID_exm\n",
       "5                ID_ID_jac_qgm_3_qgm_3\n",
       "6            name_name_jac_qgm_3_qgm_3\n",
       "7        name_name_cos_dlm_dc0_dlm_dc0\n",
       "8        name_name_jac_dlm_dc0_dlm_dc0\n",
       "9                        name_name_mel\n",
       "10                  name_name_lev_dist\n",
       "11                   name_name_lev_sim\n",
       "12                       name_name_nmw\n",
       "13                        name_name_sw\n",
       "14           birth_year_birth_year_exm\n",
       "15           birth_year_birth_year_anm\n",
       "16      birth_year_birth_year_lev_dist\n",
       "17       birth_year_birth_year_lev_sim\n",
       "18         hourly_wage_hourly_wage_exm\n",
       "19         hourly_wage_hourly_wage_anm\n",
       "20    hourly_wage_hourly_wage_lev_dist\n",
       "21     hourly_wage_hourly_wage_lev_sim\n",
       "22                 zipcode_zipcode_exm\n",
       "23                 zipcode_zipcode_anm\n",
       "24            zipcode_zipcode_lev_dist\n",
       "25             zipcode_zipcode_lev_sim\n",
       "26              blackbox_fz_ratio_name\n",
       "Name: feature_name, dtype: object"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_table.feature_name"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3.0
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}