{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction\n",
    "This IPython notebook illustrates how to update attribute types and generate features for blocking/matching manually.\n",
    "\n",
    "First, we need to import *py_entitymatching* package and other libraries as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Import py_entitymatching package\n",
    "import py_entitymatching as em\n",
    "import os\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, read the (sample) input tables for blocking purposes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Get the datasets directory\n",
    "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n",
    "\n",
    "# Get the paths of the input tables\n",
    "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n",
    "path_B = datasets_dir + os.sep + 'person_table_B.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read the CSV files and set 'ID' as the key attribute\n",
    "A = em.read_csv_metadata(path_A, key='ID')\n",
    "B = em.read_csv_metadata(path_B, key='ID')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Getting Attribute Types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "atypes1 = em.get_attr_types(A)\n",
    "atypes2 = em.get_attr_types(B)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['ID', 'zipcode', '_table', 'name', 'hourly_wage', 'address', 'birth_year'])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "atypes1.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('numeric', 'numeric', 'str_bt_1w_5w', 'str_bt_1w_5w', 'numeric')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "atypes1['birth_year'], atypes1['hourly_wage'], atypes1['address'], atypes1['name'], atypes1['zipcode']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('numeric', 'numeric', 'str_bt_5w_10w', 'str_bt_1w_5w', 'numeric')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "atypes2['birth_year'], atypes2['hourly_wage'], atypes2['address'], atypes2['name'], atypes2['zipcode']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Updating Attribute Types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('str_bt_1w_5w', 'str_bt_5w_10w')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "atypes1['address'], atypes2['address']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "atypes1['address'] = 'str_bt_1w_5w'\n",
    "atypes2['address'] = 'str_bt_1w_5w'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Getting Attribute Correspondences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "block_c = em.get_attr_corres(A, B)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['corres', 'rtable', 'ltable'])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "block_c.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4509225032, 4509225032, 4509225816, 4509225816)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "id(A), id(block_c['ltable']), id(B), id(block_c['rtable'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('ID', 'ID'),\n",
       " ('name', 'name'),\n",
       " ('birth_year', 'birth_year'),\n",
       " ('hourly_wage', 'hourly_wage'),\n",
       " ('address', 'address'),\n",
       " ('zipcode', 'zipcode')]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "block_c['corres']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Updating Attribute Correspondences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "block_c['corres'] = [('name', 'name'),\n",
    " ('birth_year', 'birth_year'),\n",
    " ('hourly_wage', 'hourly_wage'),\n",
    " ('address', 'address'),\n",
    " ('zipcode', 'zipcode')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Getting Tokenizers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# for blocking\n",
    "tok = em.get_tokenizers_for_blocking() \n",
    "# for matching \n",
    "#tok = em.get_tokenizers_for_matching() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'alphabetic': <function py_entitymatching.feature.tokenizers.tok_alphabetic>,\n",
       " 'alphanumeric': <function py_entitymatching.feature.tokenizers.tok_alphanumeric>,\n",
       " 'dlm_dc0': <function py_entitymatching.feature.tokenizers._make_tok_delim.<locals>.tok_delim>,\n",
       " 'qgm_2': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>,\n",
       " 'qgm_3': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>,\n",
       " 'wspace': <function py_entitymatching.feature.tokenizers.tok_wspace>}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Getting Similarity Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#for blocking\n",
    "sim = em.get_sim_funs_for_blocking()\n",
    "\n",
    "#for matching\n",
    "#sim = em.get_sim_funs_for_matching()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'abs_norm': <function py_entitymatching.feature.simfunctions.abs_norm>,\n",
       " 'affine': <function py_entitymatching.feature.simfunctions.affine>,\n",
       " 'cosine': <function py_entitymatching.feature.simfunctions.cosine>,\n",
       " 'dice': <function py_entitymatching.feature.simfunctions.dice>,\n",
       " 'exact_match': <function py_entitymatching.feature.simfunctions.exact_match>,\n",
       " 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>,\n",
       " 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>,\n",
       " 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>,\n",
       " 'jaro': <function py_entitymatching.feature.simfunctions.jaro>,\n",
       " 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>,\n",
       " 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>,\n",
       " 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>,\n",
       " 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>,\n",
       " 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>,\n",
       " 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>,\n",
       " 'rel_diff': <function py_entitymatching.feature.simfunctions.rel_diff>,\n",
       " 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sim"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Getting Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feature_name</th>\n",
       "      <th>left_attribute</th>\n",
       "      <th>right_attribute</th>\n",
       "      <th>left_attr_tokenizer</th>\n",
       "      <th>right_attr_tokenizer</th>\n",
       "      <th>simfunction</th>\n",
       "      <th>function</th>\n",
       "      <th>function_source</th>\n",
       "      <th>is_auto_generated</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>address_address_jac_qgm_3_qgm_3</td>\n",
       "      <td>address</td>\n",
       "      <td>address</td>\n",
       "      <td>qgm_3</td>\n",
       "      <td>qgm_3</td>\n",
       "      <td>jaccard</td>\n",
       "      <td>&lt;function address_address_jac_qgm_3_qgm_3 at 0x10f959c80&gt;</td>\n",
       "      <td>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>address_address_cos_dlm_dc0_dlm_dc0</td>\n",
       "      <td>address</td>\n",
       "      <td>address</td>\n",
       "      <td>dlm_dc0</td>\n",
       "      <td>dlm_dc0</td>\n",
       "      <td>cosine</td>\n",
       "      <td>&lt;function address_address_cos_dlm_dc0_dlm_dc0 at 0x10f959d08&gt;</td>\n",
       "      <td>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>address_address_jac_dlm_dc0_dlm_dc0</td>\n",
       "      <td>address</td>\n",
       "      <td>address</td>\n",
       "      <td>dlm_dc0</td>\n",
       "      <td>dlm_dc0</td>\n",
       "      <td>jaccard</td>\n",
       "      <td>&lt;function address_address_jac_dlm_dc0_dlm_dc0 at 0x10f959d90&gt;</td>\n",
       "      <td>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>address_address_mel</td>\n",
       "      <td>address</td>\n",
       "      <td>address</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>monge_elkan</td>\n",
       "      <td>&lt;function address_address_mel at 0x10f959e18&gt;</td>\n",
       "      <td>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>address_address_lev_dist</td>\n",
       "      <td>address</td>\n",
       "      <td>address</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>lev_dist</td>\n",
       "      <td>&lt;function address_address_lev_dist at 0x10f959ea0&gt;</td>\n",
       "      <td>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>address_address_lev_sim</td>\n",
       "      <td>address</td>\n",
       "      <td>address</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>lev_sim</td>\n",
       "      <td>&lt;function address_address_lev_sim at 0x10f959f28&gt;</td>\n",
       "      <td>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>address_address_nmw</td>\n",
       "      <td>address</td>\n",
       "      <td>address</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>needleman_wunsch</td>\n",
       "      <td>&lt;function address_address_nmw at 0x10f9bb048&gt;</td>\n",
       "      <td>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>address_address_sw</td>\n",
       "      <td>address</td>\n",
       "      <td>address</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>smith_waterman</td>\n",
       "      <td>&lt;function address_address_sw at 0x10f9bb0d0&gt;</td>\n",
       "      <td>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           feature_name left_attribute right_attribute  \\\n",
       "16      address_address_jac_qgm_3_qgm_3        address         address   \n",
       "17  address_address_cos_dlm_dc0_dlm_dc0        address         address   \n",
       "18  address_address_jac_dlm_dc0_dlm_dc0        address         address   \n",
       "19                  address_address_mel        address         address   \n",
       "20             address_address_lev_dist        address         address   \n",
       "21              address_address_lev_sim        address         address   \n",
       "22                  address_address_nmw        address         address   \n",
       "23                   address_address_sw        address         address   \n",
       "\n",
       "   left_attr_tokenizer right_attr_tokenizer       simfunction  \\\n",
       "16               qgm_3                qgm_3           jaccard   \n",
       "17             dlm_dc0              dlm_dc0            cosine   \n",
       "18             dlm_dc0              dlm_dc0           jaccard   \n",
       "19                None                 None       monge_elkan   \n",
       "20                None                 None          lev_dist   \n",
       "21                None                 None           lev_sim   \n",
       "22                None                 None  needleman_wunsch   \n",
       "23                None                 None    smith_waterman   \n",
       "\n",
       "                                                         function  \\\n",
       "16      <function address_address_jac_qgm_3_qgm_3 at 0x10f959c80>   \n",
       "17  <function address_address_cos_dlm_dc0_dlm_dc0 at 0x10f959d08>   \n",
       "18  <function address_address_jac_dlm_dc0_dlm_dc0 at 0x10f959d90>   \n",
       "19                  <function address_address_mel at 0x10f959e18>   \n",
       "20             <function address_address_lev_dist at 0x10f959ea0>   \n",
       "21              <function address_address_lev_sim at 0x10f959f28>   \n",
       "22                  <function address_address_nmw at 0x10f9bb048>   \n",
       "23                   <function address_address_sw at 0x10f9bb0d0>   \n",
       "\n",
       "                                                                                        function_source  \\\n",
       "16  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...   \n",
       "17  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...   \n",
       "18  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...   \n",
       "19  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...   \n",
       "20  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...   \n",
       "21  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...   \n",
       "22  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...   \n",
       "23  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...   \n",
       "\n",
       "   is_auto_generated  \n",
       "16              True  \n",
       "17              True  \n",
       "18              True  \n",
       "19              True  \n",
       "20              True  \n",
       "21              True  \n",
       "22              True  \n",
       "23              True  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_table[feature_table.left_attribute == 'address']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.frame.DataFrame"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(feature_table)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}