{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction\n", "This IPython notebook illustrates how to generate features for blocking/matching manually.\n", "\n", "First, we need to import *py_entitymatching* package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes." ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n", "path_B = datasets_dir + os.sep + 'person_table_B.csv'" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generating Features for Manually" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Attribute Types" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "atypes1 = em.get_attr_types(A)\n", "atypes2 = em.get_attr_types(B)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dict_keys(['ID', '_table', 'birth_year', 'hourly_wage', 'address', 'name', 'zipcode'])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atypes1.keys()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('numeric', 'numeric', 'str_bt_1w_5w', 'str_bt_1w_5w', 'numeric')" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atypes1['birth_year'], atypes1['hourly_wage'], atypes1['address'], atypes1['name'], atypes1['zipcode']" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('numeric', 'numeric', 'str_bt_5w_10w', 'str_bt_1w_5w', 'numeric')" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atypes2['birth_year'], atypes2['hourly_wage'], atypes2['address'], atypes2['name'], atypes2['zipcode']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Attribute Correspondences" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "block_c = em.get_attr_corres(A, B)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dict_keys(['rtable', 'ltable', 'corres'])" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "block_c.keys()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(4635705184, 4635705184, 4635959984, 4635959984)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"id(A), id(block_c['ltable']), id(B), id(block_c['rtable'])" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('ID', 'ID'),\n", " ('name', 'name'),\n", " ('birth_year', 'birth_year'),\n", " ('hourly_wage', 'hourly_wage'),\n", " ('address', 'address'),\n", " ('zipcode', 'zipcode')]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "block_c['corres']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Tokenizers" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# for blocking\n", "tok = em.get_tokenizers_for_blocking()\n", "# for matching\n", "# tok = em.get_tokenizers_for_matching()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'alphabetic': ,\n", " 'alphanumeric': ,\n", " 'dlm_dc0': .tok_delim>,\n", " 'qgm_2': .tok_qgram>,\n", " 'qgm_3': .tok_qgram>,\n", " 'wspace': }" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Similarity Functions" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# for blocking\n", "sim = em.get_sim_funs_for_blocking()\n", "\n", "# for matching\n", "# sim = em.get_sim_funs_for_matching()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'abs_norm': ,\n", " 'affine': ,\n", " 'cosine': ,\n", " 'dice': ,\n", " 'exact_match': ,\n", " 'hamming_dist': ,\n", " 'hamming_sim': ,\n", " 'jaccard': ,\n", " 'jaro': ,\n", " 'jaro_winkler': ,\n", " 'lev_dist': ,\n", " 'lev_sim': ,\n", " 'monge_elkan': ,\n", " 'needleman_wunsch': ,\n", " 'overlap_coeff': ,\n", " 'rel_diff': ,\n", " 'smith_waterman': }" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sim" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting Features" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [], "source": [ "feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
0ID_ID_lev_distIDIDNoneNonelev_dist<function ID_ID_lev_dist at 0x11452b378>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
1ID_ID_lev_simIDIDNoneNonelev_sim<function ID_ID_lev_sim at 0x114515d08>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
2ID_ID_jarIDIDNoneNonejaro<function ID_ID_jar at 0x11452b158>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
3ID_ID_jwnIDIDNoneNonejaro_winkler<function ID_ID_jwn at 0x11452b048>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
4ID_ID_exmIDIDNoneNoneexact_match<function ID_ID_exm at 0x11452b400>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute left_attr_tokenizer \\\n", "0 ID_ID_lev_dist ID ID None \n", "1 ID_ID_lev_sim ID ID None \n", "2 ID_ID_jar ID ID None \n", "3 ID_ID_jwn ID ID None \n", "4 ID_ID_exm ID ID None \n", "\n", " right_attr_tokenizer simfunction \\\n", "0 None lev_dist \n", "1 None lev_sim \n", "2 None jaro \n", "3 None jaro_winkler \n", "4 None exact_match \n", "\n", " function \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "\n", " function_source \\\n", "0 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "1 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "2 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "3 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "4 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "\n", " is_auto_generated \n", "0 True \n", "1 True \n", "2 True \n", "3 True \n", "4 True " ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.head()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(feature_table)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }