{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction\n", "\n", "This IPython notebook illustrates how to refine the results of matching using triggers.\n", "\n", "First, we need to import the py_entitymatching package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Import the py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for matching purposes." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "path_A = datasets_dir + os.sep + 'dblp_demo.csv'\n", "path_B = datasets_dir + os.sep + 'acm_demo.csv'\n", "path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": {
"text/plain": [ " _id ltable_id rtable_id \\\n", "0 0 l1223 r498 \n", "1 1 l1563 r1285 \n", "2 2 l1514 r1348 \n", "3 3 l206 r1641 \n", "4 4 l1589 r495 \n", "\n", " ltable_title \\\n", "0 Dynamic Information Visualization \n", "1 Dynamic Load Balancing in Hierarchical Parallel Database Systems \n", "2 Query Processing and Optimization in Oracle Rdb \n", "3 An Asymptotically Optimal Multiversion B-Tree \n", "4 Evaluating Probabilistic Queries over Imprecise Data \n", "\n", " ltable_authors \\\n", "0 Yannis E. Ioannidis \n", "1 Luc Bouganim, Daniela Florescu, Patrick Valduriez \n", "2 Gennady Antoshenkov, Mohamed Ziauddin \n", "3 Thomas Ohler, Peter Widmayer, Bruno Becker, Stephan Gschwind, Bernhard Seeger \n", "4 Reynold Cheng, Dmitri V. Kalashnikov, Sunil Prabhakar \n", "\n", " ltable_year \\\n", "0 1996 \n", "1 1996 \n", "2 1996 \n", "3 1996 \n", "4 2003 \n", "\n", " rtable_title \\\n", "0 Dynamic information visualization \n", "1 Dynamic Load Balancing in Hierarchical Parallel Database Systems \n", "2 prospector: a content-based multimedia server for massively parallel architectures \n", "3 A complete temporal relational algebra \n", "4 Evaluating probabilistic queries over imprecise data \n", "\n", " rtable_authors \\\n", "0 Yannis E. Ioannidis \n", "1 Luc Bouganim, Daniela Florescu, Patrick Valduriez \n", "2 S. Choo, W. O'Connell, G. Linerman, H. Chen, K. Ganapathy, A. Biliris, E. Panagos, D. Schrader \n", "3 Debabrata Dey, Terence M. Barron, Veda C. Storey \n", "4 Reynold Cheng, Dmitri V. Kalashnikov, Sunil Prabhakar \n", "\n", " rtable_year label \n", "0 1996 1 \n", "1 1996 1 \n", "2 1996 0 \n", "3 1996 0 \n", "4 2003 1 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A = em.read_csv_metadata(path_A, key='id')\n", "B = em.read_csv_metadata(path_B, key='id')\n", "\n", "# Load the pre-labeled data\n", "S = em.read_csv_metadata(path_labeled_data, \n", "                         key='_id',\n", "                         ltable=A, rtable=B, \n", "                         fk_ltable='ltable_id', fk_rtable='rtable_id')\n", "S.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Use an ML Matcher to Get Predictions\n", "\n", "Here we will purposely create a decision tree matcher that ignores several informative features, so that we can later show how triggers can be used to refine the model's predictions." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Split S into I and J\n", "IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)\n", "I = IJ['train']\n", "J = IJ['test']" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Create a Decision Tree Matcher\n", "dt = em.DTMatcher(name='DecisionTree', random_state=0)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": {
"text/plain": [ "                           feature_name left_attribute right_attribute \\\n", "0                        id_id_lev_dist             id              id \n", "1                         id_id_lev_sim             id              id \n", "2                             id_id_jar             id              id \n", "3                             id_id_jwn             id              id \n", "4                             id_id_exm             id              id \n", "5                 id_id_jac_qgm_3_qgm_3             id              id \n", "6           title_title_jac_qgm_3_qgm_3          title           title \n", "7       title_title_cos_dlm_dc0_dlm_dc0          title           title \n", "8                       title_title_mel          title           title \n", "9                  title_title_lev_dist          title           title \n", "10                  title_title_lev_sim          title           title \n", "11      authors_authors_jac_qgm_3_qgm_3        authors         authors \n", "12  authors_authors_cos_dlm_dc0_dlm_dc0        authors         authors \n", "13                  authors_authors_mel        authors         authors \n", "14             authors_authors_lev_dist        authors         authors \n", "15              authors_authors_lev_sim        authors         authors \n", "16                        year_year_exm           year            year \n", "17                        year_year_anm           year            year \n", "18                   year_year_lev_dist           year            year \n", "19                    year_year_lev_sim           year            year \n", "\n", "   left_attr_tokenizer right_attr_tokenizer  simfunction \\\n", "0                 None                 None     lev_dist \n", "1                 None                 None      lev_sim \n", "2                 None                 None         jaro \n", "3                 None                 None  jaro_winkler \n", "4                 None                 None  exact_match \n", "5                qgm_3                qgm_3      jaccard \n", "6                qgm_3                qgm_3      jaccard \n", "7              dlm_dc0              dlm_dc0       cosine \n", "8                 None                 None  monge_elkan \n", "9                 None                 None     lev_dist \n", "10                None                 None      lev_sim \n", "11               qgm_3                qgm_3      jaccard \n", "12             dlm_dc0              dlm_dc0       cosine \n", "13                None                 None  monge_elkan \n", "14                None                 None     lev_dist \n", "15                None                 None      lev_sim \n", "16                None                 None  exact_match \n", "17                None                 None     abs_norm \n", "18                None                 None     lev_dist \n", "19                None                 None      lev_sim \n", "\n", "                                                     function \\\n", "0                    <function id_id_lev_dist at 0x11b874aa0> \n", "1                     <function id_id_lev_sim at 0x11b874d70> \n", "2                         <function id_id_jar at 0x11b874a28> \n", "3                         <function id_id_jwn at 0x11b874c80> \n", "4                         <function id_id_exm at 0x11b874de8> \n", "5             <function id_id_jac_qgm_3_qgm_3 at 0x11b874e60> \n", "6       <function title_title_jac_qgm_3_qgm_3 at 0x11b889050> \n", "7   <function title_title_cos_dlm_dc0_dlm_dc0 at 0x11b8890c8> \n", "8                   <function title_title_mel at 0x11b889140> \n", "9              <function title_title_lev_dist at 0x11b8891b8> \n", "10              <function title_title_lev_sim at 0x11b889230> \n", "11  <function authors_authors_jac_qgm_3_qgm_3 at 0x11b8892a8> \n", "12  <function authors_authors_cos_dlm_dc0_dlm_dc0 at 0x11b889320> \n", "13              <function authors_authors_mel at 0x11b889398> \n", "14         <function authors_authors_lev_dist at 0x11b889410> \n", "15          <function authors_authors_lev_sim at 0x11b889488> \n", "16                    <function year_year_exm at 0x11b889500> \n", "17                    <function year_year_anm at 0x11b889578> \n", "18               <function year_year_lev_dist at 0x11b8895f0> \n", "19                <function year_year_lev_sim at 0x11b889668> \n", "\n", "                                                                                         function_source \\\n", "0   from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "1   from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "2   from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "3   from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "4   from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "5   from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "6   from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "7   from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "8   from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "9   from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "10  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "11  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "12  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "13  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "14  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "15  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "16  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "17  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "18  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "19  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "\n", "   is_auto_generated \n", "0               True \n", "1               True \n", "2               True \n", "3               True \n", "4               True \n", "5               True \n", "6               True \n", "7               True \n", "8               True \n", "9               True \n", "10              True \n", "11              True \n", "12              True \n", "13              True \n", "14              True \n", "15              True \n", "16              True \n", "17              True \n", "18              True \n", "19              True " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Generate a set of features\n", "feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)\n", "feature_table" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": {
"text/plain": [ "               feature_name left_attribute right_attribute \\\n", "0            id_id_lev_dist             id              id \n", "15  authors_authors_lev_sim        authors         authors \n", "16            year_year_exm           year            year \n", "17            year_year_anm           year            year \n", "18       year_year_lev_dist           year            year \n", "19        year_year_lev_sim           year            year \n", "\n", "   left_attr_tokenizer right_attr_tokenizer  simfunction \\\n", "0                 None                 None     lev_dist \n", "15                None                 None      lev_sim \n", "16                None                 None  exact_match \n", "17                None                 None     abs_norm \n", "18                None                 None     lev_dist \n", "19                None                 None      lev_sim \n", "\n", "                                             function \\\n", "0            <function id_id_lev_dist at 0x11b874aa0> \n", "15  <function authors_authors_lev_sim at 0x11b889488> \n", "16            <function year_year_exm at 0x11b889500> \n", "17            <function year_year_anm at 0x11b889578> \n", "18       <function year_year_lev_dist at 0x11b8895f0> \n", "19        <function year_year_lev_sim at 0x11b889668> \n", "\n", "                                                                                         function_source \\\n", "0   from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "15  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "16  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "17  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "18  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "19  from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "\n", "   is_auto_generated \n", "0               True \n", "15              True \n", "16              True \n", "17              True \n", "18              True \n", "19              True " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We will remove many of the features here to purposely create a poor model. This will make it easier\n", "# to demonstrate triggers later.\n", "F = feature_table.drop([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])\n", "F" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": {
"text/plain": [ "     _id ltable_id rtable_id  id_id_lev_dist  authors_authors_lev_sim \\\n", "430  430     l1494     r1257               4                 0.083333 \n", "35    35     l1385     r1160               4                 0.271186 \n", "394  394     l1345       r85               4                 0.338462 \n", "29    29      l611      r141               3                 0.277778 \n", "181  181     l1164     r1161               2                 0.244444 \n", "\n", "     year_year_exm  year_year_anm  year_year_lev_dist  year_year_lev_sim \\\n", "430              1            1.0                 0.0                 1.0 \n", "35               1            1.0                 0.0                 1.0 \n", "394              1            1.0                 0.0                 1.0 \n", "29               1            1.0                 0.0                 1.0 \n", "181              1            1.0                 0.0                 1.0 \n", "\n", "     label \n", "430      0 \n", "35       0 \n", "394      1 \n", "29       0 \n", "181      1 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert I into a set of feature vectors using F\n", "H = em.extract_feature_vecs(I, \n", "                            feature_table=F, \n", "                            attrs_after='label',\n", "                            show_progress=False)\n", "H.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Impute feature vectors with the mean of the column values.\n", "H = em.impute_table(H, \n", "                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],\n", "                    strategy='mean')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Fit the decision tree to the feature vectors\n", "dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], target_attr='label')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": {
"text/plain": [ "     _id ltable_id rtable_id  id_id_lev_dist  authors_authors_lev_sim \\\n", "430  430     l1494     r1257             4.0                 0.083333 \n", "35    35     l1385     r1160             4.0                 0.271186 \n", "394  394     l1345       r85             4.0                 0.338462 \n", "29    29      l611      r141             3.0                 0.277778 \n", "181  181     l1164     r1161             2.0                 0.244444 \n", "\n", "     year_year_exm  year_year_anm  year_year_lev_dist  year_year_lev_sim \\\n", "430            1.0            1.0                 0.0                 1.0 \n", "35             1.0            1.0                 0.0                 1.0 \n", "394            1.0            1.0                 0.0                 1.0 \n", "29             1.0            1.0                 0.0                 1.0 \n", "181            1.0            1.0                 0.0                 1.0 \n", "\n", "     label  predicted_labels  proba \n", "430      0                 0    0.0 \n", "35       0                 0    0.0 \n", "394      1                 1    1.0 \n", "29       0                 0    0.0 \n", "181      1                 1    1.0 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use the decision tree matcher to predict if tuple pairs match\n", "dt.predict(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], target_attr='predicted_labels', \n", "           return_probs=True, probs_attr='proba', append=True, inplace=True)\n", "H.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Debug the ML Matcher\n", "\n", "Now we will use the debugger to determine what problems exist with our decision tree matcher." ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Split H into P and Q\n", "PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)\n", "P = PQ['train']\n", "Q = PQ['test']" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Debug the decision tree matcher using the GUI\n", "em.vis_debug_dt(dt, P, Q, \n", "                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],\n", "                target_attr='label')\n", "\n", "# We see with the debugger that the false negatives have completely different values in the Title attribute.\n", "# This is most likely because we removed all of the features that compare the Title attribute from each table earlier." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": {
"text/plain": [ "     _id ltable_id rtable_id  id_id_lev_dist  authors_authors_lev_sim \\\n", "371  371      l650     r1594             4.0                 0.120000 \n", "259  259      l938     r1090             5.0                 0.200000 \n", "346  346     l1681      r693             4.0                 0.238095 \n", "184  184      l891      r485             4.0                 0.137931 \n", "11    11     l1189     r1674             4.0                 0.222222 \n", "121  121      l169      r521             4.0                 0.153846 \n", "267  267      l120     r1181             4.0                 0.216667 \n", "147  147      l867     r1263             4.0                 0.142857 \n", "\n", "     year_year_exm  year_year_anm  year_year_lev_dist  year_year_lev_sim \\\n", "371            1.0            1.0                 0.0                 1.0 \n", "259            1.0            1.0                 0.0                 1.0 \n", "346            1.0            1.0                 0.0                 1.0 \n", "184            1.0            1.0                 0.0                 1.0 \n", "11             1.0            1.0                 0.0                 1.0 \n", "121            1.0            1.0                 0.0                 1.0 \n", "267            1.0            1.0                 0.0                 1.0 \n", "147            1.0            1.0                 0.0                 1.0 \n", "\n", "     label  predicted_labels     proba \n", "371      1                 0  0.500000 \n", "259      1                 0  0.333333 \n", "346      1                 0  0.500000 \n", "184      1                 0  0.500000 \n", "11       1                 0  0.250000 \n", "121      1                 0  0.500000 \n", "267      1                 0  0.500000 \n", "147      1                 0  0.333333 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We can see which tuples are not predicted correctly\n", "H[H['label'] != H['predicted_labels']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Using Triggers to Improve Results\n", "\n", "This typically involves the following steps:\n", "1. Creating the match trigger\n", "2. Adding rules\n", "3. Adding a condition status and an action\n", "4. Using the trigger to improve results\n", "\n", "## Creating the Match Trigger" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Use the constructor to create a trigger\n", "mt = em.MatchTrigger()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Adding Rules\n", "Before we can use the trigger, we need to create rules to evaluate tuple pairs. Each rule is a list of strings, and each string specifies a conjunction of predicates. Each predicate has three parts: (1) an expression, (2) a comparison operator, and (3) a value. The expression is evaluated over a tuple pair, producing a numeric value." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['_rule_0', '_rule_1']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add two rules to the trigger\n", "\n", "# Since we removed all of the features comparing the Title attribute earlier, we now add a rule that compares titles\n", "mt.add_cond_rule(['title_title_lev_sim(ltuple, rtuple) > 0.7'], feature_table)\n", "# This rule has two predicates: one compares the titles and the other looks for an exact match of the years\n", "mt.add_cond_rule(['title_title_lev_sim(ltuple, rtuple) > 0.4', 'year_year_exm(ltuple, rtuple) == 1'], feature_table)\n", "mt.get_rule_names()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rules can also be deleted from the trigger\n", "mt.delete_rule('_rule_1')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Adding a Condition Status and Action\n", "Next, we need to add a condition status and an action to the trigger. The trigger applies the added rules to each tuple pair; if the result equals the condition status, the action is carried out."
] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Since we are using the trigger to fix a problem related to false negatives, we want the condition to be \n", "# True and the action to be 1. This way, the trigger will set a prediction to 1 when the rule returns True.\n", "\n", "mt.add_cond_status(True)\n", "mt.add_action(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using the Trigger to Improve Results\n", "Now that we have added rules, a condition status, and an action, we can execute the trigger to improve the results." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": {
"text/plain": [ "     _id ltable_id rtable_id  id_id_lev_dist  authors_authors_lev_sim \\\n", "430  430     l1494     r1257             4.0                 0.083333 \n", "35    35     l1385     r1160             4.0                 0.271186 \n", "394  394     l1345       r85             4.0                 0.338462 \n", "29    29      l611      r141             3.0                 0.277778 \n", "181  181     l1164     r1161             2.0                 0.244444 \n", "\n", "     year_year_exm  year_year_anm  year_year_lev_dist  year_year_lev_sim \\\n", "430            1.0            1.0                 0.0                 1.0 \n", "35             1.0            1.0                 0.0                 1.0 \n", "394            1.0            1.0                 0.0                 1.0 \n", "29             1.0            1.0                 0.0                 1.0 \n", "181            1.0            1.0                 0.0                 1.0 \n", "\n", "     label  predicted_labels  proba \n", "430      0                 0    0.0 \n", "35       0                 0    0.0 \n", "394      1                 1    1.0 \n", "29       0                 0    0.0 \n", "181      1                 1    1.0 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preds = mt.execute(input_table=H, label_column='predicted_labels', inplace=False)\n", "preds.head()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": {
"text/plain": [ "     _id ltable_id rtable_id  id_id_lev_dist  authors_authors_lev_sim \\\n", "11    11     l1189     r1674             4.0                 0.222222 \n", "267  267      l120     r1181             4.0                 0.216667 \n", "\n", "     year_year_exm  year_year_anm  year_year_lev_dist  year_year_lev_sim \\\n", "11             1.0            1.0                 0.0                 1.0 \n", "267            1.0            1.0                 0.0                 1.0 \n", "\n", "     label  predicted_labels  proba \n", "11       1                 0   0.25 \n", "267      1                 0   0.50 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We were able to significantly reduce the number of incorrectly labeled tuple pairs\n", "preds[preds['label'] != preds['predicted_labels']]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": {
"text/plain": [ "     _id ltable_id rtable_id                                  ltable_title \\\n", "11    11     l1189     r1674    Weimin Du, Xiangning Liu, Abdelsalam Helal \n", "267  267      l120     r1181  w. Bruce kroft, James callan, erik w. Brown \n", "\n", "                                                    ltable_authors \\\n", "11         Multiview Access Protocols for Large-Scale Replication \n", "267  fast incrremental indexiing for fulltext informtion retreval \n", "\n", "     ltable_year \\\n", "11          1998 \n", "267         1994 \n", "\n", "                                                      rtable_title \\\n", "11         Multiview access protocols for large-scale replication \n", "267  Fast Incremental Indexing For Full-Text Information Retrieval \n", "\n", "                                     rtable_authors  rtable_year  label \n", "11       Xiangning Liu, Abdelsalam Helal, Weimin Du         1998      1 \n", "267  Eric W. Brown, James P. Callan, W. Bruce Croft         1994      1 " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We can see that the two pairs that are still labeled incorrectly have the title and authors swapped into the\n", "# wrong columns in one of their tuples.\n", "pd.concat([S[S['_id'] == 11], S[S['_id'] == 267]])" ] },