{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction\n",
    "\n",
    "This IPython notebook illustrates how to select the best learning based matcher. First, we need to import py_entitymatching package and other libraries as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import py_entitymatching package\n",
    "import py_entitymatching as em\n",
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "# Set the seed value \n",
    "seed = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Get the datasets directory\n",
    "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n",
    "\n",
    "path_A = datasets_dir + os.sep + 'dblp_demo.csv'\n",
    "path_B = datasets_dir + os.sep + 'acm_demo.csv'\n",
    "path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No handlers could be found for logger \"py_entitymatching.io.parsers\"\n"
     ]
    }
   ],
   "source": [
    "A = em.read_csv_metadata(path_A, key='id')\n",
    "B = em.read_csv_metadata(path_B, key='id')\n",
    "# Load the pre-labeled data\n",
    "S = em.read_csv_metadata(path_labeled_data, \n",
    "                         key='_id',\n",
    "                         ltable=A, rtable=B, \n",
    "                         fk_ltable='ltable_id', fk_rtable='rtable_id')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, split the labeled data into development set and evaluation set and convert them into feature vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split S into I an J\n",
    "IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)\n",
    "I = IJ['train']\n",
    "J = IJ['test']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Generate a set of features\n",
    "F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Convert I into feature vectors using updated F\n",
    "H = em.extract_feature_vecs(I, \n",
    "                            feature_table=F, \n",
    "                            attrs_after='label',\n",
    "                            show_progress=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compute accuracy of X (Decision Tree) on J"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It involves the following steps:\n",
    "\n",
    "1. Train X using  H\n",
    "2. Convert J into a set of feature vectors (L)\n",
    "3. Predict on L using X\n",
    "4. Evaluate the predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Instantiate the matcher to evaluate.\n",
    "dt = em.DTMatcher(name='DecisionTree', random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train using feature vectors from I \n",
    "dt.fit(table=H, \n",
    "       exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], \n",
    "       target_attr='label')\n",
    "\n",
    "# Convert J into a set of feature vectors using F\n",
    "L = em.extract_feature_vecs(J, feature_table=F,\n",
    "                            attrs_after='label', show_progress=False)\n",
    "\n",
    "# Predict on L \n",
    "predictions = dt.predict(table=L, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], \n",
    "              append=True, target_attr='predicted', inplace=False, return_probs=True,\n",
    "                        probs_attr='proba')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>_id</th>\n",
       "      <th>ltable_id</th>\n",
       "      <th>rtable_id</th>\n",
       "      <th>predicted</th>\n",
       "      <th>proba</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>124</th>\n",
       "      <td>124</td>\n",
       "      <td>l1647</td>\n",
       "      <td>r366</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>54</td>\n",
       "      <td>l332</td>\n",
       "      <td>r1463</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>268</th>\n",
       "      <td>268</td>\n",
       "      <td>l1499</td>\n",
       "      <td>r1725</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>293</th>\n",
       "      <td>293</td>\n",
       "      <td>l759</td>\n",
       "      <td>r1749</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>230</th>\n",
       "      <td>230</td>\n",
       "      <td>l1580</td>\n",
       "      <td>r1711</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     _id ltable_id rtable_id  predicted  proba\n",
       "124  124     l1647      r366        0.0    1.0\n",
       "54    54      l332     r1463        0.0    1.0\n",
       "268  268     l1499     r1725        0.0    1.0\n",
       "293  293      l759     r1749        1.0    1.0\n",
       "230  230     l1580     r1711        1.0    1.0"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predictions[['_id', 'ltable_id', 'rtable_id', 'predicted', 'proba']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Precision : 100.0% (68/68)\n",
      "Recall : 93.15% (68/73)\n",
      "F1 : 96.45%\n",
      "False positives : 0 (out of 68 positive predictions)\n",
      "False negatives : 5 (out of 157 negative predictions)\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the predictions\n",
    "eval_result = em.eval_matches(predictions, 'label', 'predicted')\n",
    "em.print_eval_summary(eval_result)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}