{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction\n", "\n", "This IPython notebook illustrates how to select the best learning based matcher. First, we need to import py_entitymatching package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd\n", "\n", "# Set the seed value \n", "seed = 0" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "path_A = datasets_dir + os.sep + 'dblp_demo.csv'\n", "path_B = datasets_dir + os.sep + 'acm_demo.csv'\n", "path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "No handlers could be found for logger \"py_entitymatching.io.parsers\"\n" ] } ], "source": [ "A = em.read_csv_metadata(path_A, key='id')\n", "B = em.read_csv_metadata(path_B, key='id')\n", "# Load the pre-labeled data\n", "S = em.read_csv_metadata(path_labeled_data, \n", " key='_id',\n", " ltable=A, rtable=B, \n", " fk_ltable='ltable_id', fk_rtable='rtable_id')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, split the labeled data into development set and evaluation set and convert them into feature vectors" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Split S into I an J\n", "IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)\n", "I = IJ['train']\n", "J = IJ['test']" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Generate a set of features\n", "F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Convert I into feature vectors using updated F\n", "H = em.extract_feature_vecs(I, \n", " feature_table=F, \n", " attrs_after='label',\n", " show_progress=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Compute accuracy of X (Decision Tree) on J" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It involves the following steps:\n", "\n", "1. Train X using H\n", "2. Convert J into a set of feature vectors (L)\n", "3. Predict on L using X\n", "4. Evaluate the predictions" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Instantiate the matcher to evaluate.\n", "dt = em.DTMatcher(name='DecisionTree', random_state=0)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Train using feature vectors from I \n", "dt.fit(table=H, \n", " exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], \n", " target_attr='label')\n", "\n", "# Convert J into a set of feature vectors using F\n", "L = em.extract_feature_vecs(J, feature_table=F,\n", " attrs_after='label', show_progress=False)\n", "\n", "# Predict on L \n", "predictions = dt.predict(table=L, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], \n", " append=True, target_attr='predicted', inplace=False, return_probs=True,\n", " probs_attr='proba')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idltable_idrtable_idpredictedproba
124124l1647r3660.01.0
5454l332r14630.01.0
268268l1499r17250.01.0
293293l759r17491.01.0
230230l1580r17111.01.0
\n", "
" ], "text/plain": [ " _id ltable_id rtable_id predicted proba\n", "124 124 l1647 r366 0.0 1.0\n", "54 54 l332 r1463 0.0 1.0\n", "268 268 l1499 r1725 0.0 1.0\n", "293 293 l759 r1749 1.0 1.0\n", "230 230 l1580 r1711 1.0 1.0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions[['_id', 'ltable_id', 'rtable_id', 'predicted', 'proba']].head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Precision : 100.0% (68/68)\n", "Recall : 93.15% (68/73)\n", "F1 : 96.45%\n", "False positives : 0 (out of 68 positive predictions)\n", "False negatives : 5 (out of 157 negative predictions)\n" ] } ], "source": [ "# Evaluate the predictions\n", "eval_result = em.eval_matches(predictions, 'label', 'predicted')\n", "em.print_eval_summary(eval_result)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 4, "nbformat_minor": 1 }