{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction\n",
    "\n",
    "This IPython notebook illustrates how to evaluate a learning-based matcher (here, a decision tree) on held-out labeled data. First, we need to import the py_entitymatching package and other libraries as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import py_entitymatching package\n",
    "import py_entitymatching as em\n",
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "# Single seed for every stochastic step below (train/test split, decision tree),\n",
    "# so that Restart & Run All reproduces the stored outputs.\n",
    "seed = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the datasets directory shipped with the py_entitymatching installation\n",
    "datasets_dir = os.path.join(em.get_install_path(), 'datasets')\n",
    "\n",
    "path_A = os.path.join(datasets_dir, 'dblp_demo.csv')\n",
    "path_B = os.path.join(datasets_dir, 'acm_demo.csv')\n",
    "path_labeled_data = os.path.join(datasets_dir, 'labeled_data_demo.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Metadata file is not present in the given path; proceeding to read the csv file.\n",
      "Metadata file is not present in the given path; proceeding to read the csv file.\n"
     ]
    }
   ],
   "source": [
    "# Load the two input tables, both keyed on 'id'\n",
    "A = em.read_csv_metadata(path_A, key='id')\n",
    "B = em.read_csv_metadata(path_B, key='id')\n",
    "# Load the pre-labeled data and link it to A and B through its foreign keys\n",
    "S = em.read_csv_metadata(path_labeled_data,\n",
    "                         key='_id',\n",
    "                         ltable=A, rtable=B,\n",
    "                         fk_ltable='ltable_id', fk_rtable='rtable_id')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, split the labeled data into a development set (I) and an evaluation set (J), and convert the development set into feature vectors. The evaluation set is converted later, just before prediction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split S into I and J (half of the labeled pairs each)\n",
    "IJ = em.split_train_test(S, train_proportion=0.5, random_state=seed)\n",
    "I = IJ['train']\n",
    "J = IJ['test']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate a set of features\n",
    "F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert I into feature vectors using F; keep 'label' as the last column\n",
    "H = em.extract_feature_vecs(I,\n",
    "                            feature_table=F,\n",
    "                            attrs_after='label',\n",
    "                            show_progress=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compute accuracy of X (Decision Tree) on J"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It involves the following steps:\n",
    "\n",
    "1. Train X using H\n",
    "2. Convert J into a set of feature vectors (L)\n",
    "3. Predict on L using X\n",
    "4. Evaluate the predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate the matcher to evaluate.\n",
    "dt = em.DTMatcher(name='DecisionTree', random_state=seed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns that are identifiers or the gold label, not features.\n",
    "# Defined once so that training and prediction exclude exactly the same columns.\n",
    "non_feature_attrs = ['_id', 'ltable_id', 'rtable_id', 'label']\n",
    "\n",
    "# Train using feature vectors from I\n",
    "dt.fit(table=H,\n",
    "       exclude_attrs=non_feature_attrs,\n",
    "       target_attr='label')\n",
    "\n",
    "# Convert J into a set of feature vectors using F\n",
    "L = em.extract_feature_vecs(J, feature_table=F,\n",
    "                            attrs_after='label', show_progress=False)\n",
    "\n",
    "# Predict on L; the predicted label goes to 'predicted' and its probability to 'proba'\n",
    "predictions = dt.predict(table=L, exclude_attrs=non_feature_attrs,\n",
    "                         append=True, target_attr='predicted', inplace=False,\n",
    "                         return_probs=True, probs_attr='proba')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idltable_idrtable_idpredictedproba
124124l1647r36600.0
5454l332r146300.0
268268l1499r172500.0
293293l759r174911.0
230230l1580r171111.0
\n", "
" ],
      "text/plain": [
       " _id ltable_id rtable_id predicted proba\n",
       "124 124 l1647 r366 0 0.0\n",
       "54 54 l332 r1463 0 0.0\n",
       "268 268 l1499 r1725 0 0.0\n",
       "293 293 l759 r1749 1 1.0\n",
       "230 230 l1580 r1711 1 1.0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the pair identifiers next to the predicted label and its probability\n",
    "preview_attrs = ['_id', 'ltable_id', 'rtable_id', 'predicted', 'proba']\n",
    "predictions[preview_attrs].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Precision : 100.0% (68/68)\n",
      "Recall : 93.15% (68/73)\n",
      "F1 : 96.45%\n",
      "False positives : 0 (out of 68 positive predictions)\n",
      "False negatives : 5 (out of 157 negative predictions)\n"
     ]
    }
   ],
   "source": [
    "# Compare the 'predicted' column with the gold 'label' column, then print the summary\n",
    "eval_result = em.eval_matches(predictions, 'label', 'predicted')\n",
    "em.print_eval_summary(eval_result)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}