{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction\n", "\n", "This IPython notebook illustrates how to refine the results of matching using triggers.\n", "\n", "First, we need to import the py_entitymatching package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for matching purposes." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "path_A = datasets_dir + os.sep + 'dblp_demo.csv'\n", "path_B = datasets_dir + os.sep + 'acm_demo.csv'\n", "path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | _id | \n", "ltable_id | \n", "rtable_id | \n", "ltable_title | \n", "ltable_authors | \n", "ltable_year | \n", "rtable_title | \n", "rtable_authors | \n", "rtable_year | \n", "label | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "l1223 | \n", "r498 | \n", "Dynamic Information Visualization | \n", "Yannis E. Ioannidis | \n", "1996 | \n", "Dynamic information visualization | \n", "Yannis E. Ioannidis | \n", "1996 | \n", "1 | \n", "
1 | \n", "1 | \n", "l1563 | \n", "r1285 | \n", "Dynamic Load Balancing in Hierarchical Parallel Database Systems | \n", "Luc Bouganim, Daniela Florescu, Patrick Valduriez | \n", "1996 | \n", "Dynamic Load Balancing in Hierarchical Parallel Database Systems | \n", "Luc Bouganim, Daniela Florescu, Patrick Valduriez | \n", "1996 | \n", "1 | \n", "
2 | \n", "2 | \n", "l1514 | \n", "r1348 | \n", "Query Processing and Optimization in Oracle Rdb | \n", "Gennady Antoshenkov, Mohamed Ziauddin | \n", "1996 | \n", "prospector: a content-based multimedia server for massively parallel architectures | \n", "S. Choo, W. O'Connell, G. Linerman, H. Chen, K. Ganapathy, A. Biliris, E. Panagos, D. Schrader | \n", "1996 | \n", "0 | \n", "
3 | \n", "3 | \n", "l206 | \n", "r1641 | \n", "An Asymptotically Optimal Multiversion B-Tree | \n", "Thomas Ohler, Peter Widmayer, Bruno Becker, Stephan Gschwind, Bernhard Seeger | \n", "1996 | \n", "A complete temporal relational algebra | \n", "Debabrata Dey, Terence M. Barron, Veda C. Storey | \n", "1996 | \n", "0 | \n", "
4 | \n", "4 | \n", "l1589 | \n", "r495 | \n", "Evaluating Probabilistic Queries over Imprecise Data | \n", "Reynold Cheng, Dmitri V. Kalashnikov, Sunil Prabhakar | \n", "2003 | \n", "Evaluating probabilistic queries over imprecise data | \n", "Reynold Cheng, Dmitri V. Kalashnikov, Sunil Prabhakar | \n", "2003 | \n", "1 | \n", "
\n", " | feature_name | \n", "left_attribute | \n", "right_attribute | \n", "left_attr_tokenizer | \n", "right_attr_tokenizer | \n", "simfunction | \n", "function | \n", "function_source | \n", "is_auto_generated | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "id_id_lev_dist | \n", "id | \n", "id | \n", "None | \n", "None | \n", "lev_dist | \n", "<function id_id_lev_dist at 0x11b874aa0> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
1 | \n", "id_id_lev_sim | \n", "id | \n", "id | \n", "None | \n", "None | \n", "lev_sim | \n", "<function id_id_lev_sim at 0x11b874d70> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
2 | \n", "id_id_jar | \n", "id | \n", "id | \n", "None | \n", "None | \n", "jaro | \n", "<function id_id_jar at 0x11b874a28> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
3 | \n", "id_id_jwn | \n", "id | \n", "id | \n", "None | \n", "None | \n", "jaro_winkler | \n", "<function id_id_jwn at 0x11b874c80> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
4 | \n", "id_id_exm | \n", "id | \n", "id | \n", "None | \n", "None | \n", "exact_match | \n", "<function id_id_exm at 0x11b874de8> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
5 | \n", "id_id_jac_qgm_3_qgm_3 | \n", "id | \n", "id | \n", "qgm_3 | \n", "qgm_3 | \n", "jaccard | \n", "<function id_id_jac_qgm_3_qgm_3 at 0x11b874e60> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
6 | \n", "title_title_jac_qgm_3_qgm_3 | \n", "title | \n", "title | \n", "qgm_3 | \n", "qgm_3 | \n", "jaccard | \n", "<function title_title_jac_qgm_3_qgm_3 at 0x11b889050> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
7 | \n", "title_title_cos_dlm_dc0_dlm_dc0 | \n", "title | \n", "title | \n", "dlm_dc0 | \n", "dlm_dc0 | \n", "cosine | \n", "<function title_title_cos_dlm_dc0_dlm_dc0 at 0x11b8890c8> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
8 | \n", "title_title_mel | \n", "title | \n", "title | \n", "None | \n", "None | \n", "monge_elkan | \n", "<function title_title_mel at 0x11b889140> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
9 | \n", "title_title_lev_dist | \n", "title | \n", "title | \n", "None | \n", "None | \n", "lev_dist | \n", "<function title_title_lev_dist at 0x11b8891b8> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
10 | \n", "title_title_lev_sim | \n", "title | \n", "title | \n", "None | \n", "None | \n", "lev_sim | \n", "<function title_title_lev_sim at 0x11b889230> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
11 | \n", "authors_authors_jac_qgm_3_qgm_3 | \n", "authors | \n", "authors | \n", "qgm_3 | \n", "qgm_3 | \n", "jaccard | \n", "<function authors_authors_jac_qgm_3_qgm_3 at 0x11b8892a8> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
12 | \n", "authors_authors_cos_dlm_dc0_dlm_dc0 | \n", "authors | \n", "authors | \n", "dlm_dc0 | \n", "dlm_dc0 | \n", "cosine | \n", "<function authors_authors_cos_dlm_dc0_dlm_dc0 at 0x11b889320> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
13 | \n", "authors_authors_mel | \n", "authors | \n", "authors | \n", "None | \n", "None | \n", "monge_elkan | \n", "<function authors_authors_mel at 0x11b889398> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
14 | \n", "authors_authors_lev_dist | \n", "authors | \n", "authors | \n", "None | \n", "None | \n", "lev_dist | \n", "<function authors_authors_lev_dist at 0x11b889410> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
15 | \n", "authors_authors_lev_sim | \n", "authors | \n", "authors | \n", "None | \n", "None | \n", "lev_sim | \n", "<function authors_authors_lev_sim at 0x11b889488> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
16 | \n", "year_year_exm | \n", "year | \n", "year | \n", "None | \n", "None | \n", "exact_match | \n", "<function year_year_exm at 0x11b889500> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
17 | \n", "year_year_anm | \n", "year | \n", "year | \n", "None | \n", "None | \n", "abs_norm | \n", "<function year_year_anm at 0x11b889578> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
18 | \n", "year_year_lev_dist | \n", "year | \n", "year | \n", "None | \n", "None | \n", "lev_dist | \n", "<function year_year_lev_dist at 0x11b8895f0> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
19 | \n", "year_year_lev_sim | \n", "year | \n", "year | \n", "None | \n", "None | \n", "lev_sim | \n", "<function year_year_lev_sim at 0x11b889668> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
\n", " | feature_name | \n", "left_attribute | \n", "right_attribute | \n", "left_attr_tokenizer | \n", "right_attr_tokenizer | \n", "simfunction | \n", "function | \n", "function_source | \n", "is_auto_generated | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "id_id_lev_dist | \n", "id | \n", "id | \n", "None | \n", "None | \n", "lev_dist | \n", "<function id_id_lev_dist at 0x11b874aa0> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
15 | \n", "authors_authors_lev_sim | \n", "authors | \n", "authors | \n", "None | \n", "None | \n", "lev_sim | \n", "<function authors_authors_lev_sim at 0x11b889488> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
16 | \n", "year_year_exm | \n", "year | \n", "year | \n", "None | \n", "None | \n", "exact_match | \n", "<function year_year_exm at 0x11b889500> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
17 | \n", "year_year_anm | \n", "year | \n", "year | \n", "None | \n", "None | \n", "abs_norm | \n", "<function year_year_anm at 0x11b889578> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
18 | \n", "year_year_lev_dist | \n", "year | \n", "year | \n", "None | \n", "None | \n", "lev_dist | \n", "<function year_year_lev_dist at 0x11b8895f0> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
19 | \n", "year_year_lev_sim | \n", "year | \n", "year | \n", "None | \n", "None | \n", "lev_sim | \n", "<function year_year_lev_sim at 0x11b889668> | \n", "from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n", "True | \n", "
\n", " | _id | \n", "ltable_id | \n", "rtable_id | \n", "id_id_lev_dist | \n", "authors_authors_lev_sim | \n", "year_year_exm | \n", "year_year_anm | \n", "year_year_lev_dist | \n", "year_year_lev_sim | \n", "label | \n", "
---|---|---|---|---|---|---|---|---|---|---|
430 | \n", "430 | \n", "l1494 | \n", "r1257 | \n", "4 | \n", "0.083333 | \n", "1 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0 | \n", "
35 | \n", "35 | \n", "l1385 | \n", "r1160 | \n", "4 | \n", "0.271186 | \n", "1 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0 | \n", "
394 | \n", "394 | \n", "l1345 | \n", "r85 | \n", "4 | \n", "0.338462 | \n", "1 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "
29 | \n", "29 | \n", "l611 | \n", "r141 | \n", "3 | \n", "0.277778 | \n", "1 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0 | \n", "
181 | \n", "181 | \n", "l1164 | \n", "r1161 | \n", "2 | \n", "0.244444 | \n", "1 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "
\n", " | _id | \n", "ltable_id | \n", "rtable_id | \n", "id_id_lev_dist | \n", "authors_authors_lev_sim | \n", "year_year_exm | \n", "year_year_anm | \n", "year_year_lev_dist | \n", "year_year_lev_sim | \n", "label | \n", "predicted_labels | \n", "proba | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
430 | \n", "430 | \n", "l1494 | \n", "r1257 | \n", "4.0 | \n", "0.083333 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "
35 | \n", "35 | \n", "l1385 | \n", "r1160 | \n", "4.0 | \n", "0.271186 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "
394 | \n", "394 | \n", "l1345 | \n", "r85 | \n", "4.0 | \n", "0.338462 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "1 | \n", "1.0 | \n", "
29 | \n", "29 | \n", "l611 | \n", "r141 | \n", "3.0 | \n", "0.277778 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "
181 | \n", "181 | \n", "l1164 | \n", "r1161 | \n", "2.0 | \n", "0.244444 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "1 | \n", "1.0 | \n", "
\n", " | _id | \n", "ltable_id | \n", "rtable_id | \n", "id_id_lev_dist | \n", "authors_authors_lev_sim | \n", "year_year_exm | \n", "year_year_anm | \n", "year_year_lev_dist | \n", "year_year_lev_sim | \n", "label | \n", "predicted_labels | \n", "proba | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
371 | \n", "371 | \n", "l650 | \n", "r1594 | \n", "4.0 | \n", "0.120000 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0.500000 | \n", "
259 | \n", "259 | \n", "l938 | \n", "r1090 | \n", "5.0 | \n", "0.200000 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0.333333 | \n", "
346 | \n", "346 | \n", "l1681 | \n", "r693 | \n", "4.0 | \n", "0.238095 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0.500000 | \n", "
184 | \n", "184 | \n", "l891 | \n", "r485 | \n", "4.0 | \n", "0.137931 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0.500000 | \n", "
11 | \n", "11 | \n", "l1189 | \n", "r1674 | \n", "4.0 | \n", "0.222222 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0.250000 | \n", "
121 | \n", "121 | \n", "l169 | \n", "r521 | \n", "4.0 | \n", "0.153846 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0.500000 | \n", "
267 | \n", "267 | \n", "l120 | \n", "r1181 | \n", "4.0 | \n", "0.216667 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0.500000 | \n", "
147 | \n", "147 | \n", "l867 | \n", "r1263 | \n", "4.0 | \n", "0.142857 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0.333333 | \n", "
\n", " | _id | \n", "ltable_id | \n", "rtable_id | \n", "id_id_lev_dist | \n", "authors_authors_lev_sim | \n", "year_year_exm | \n", "year_year_anm | \n", "year_year_lev_dist | \n", "year_year_lev_sim | \n", "label | \n", "predicted_labels | \n", "proba | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
430 | \n", "430 | \n", "l1494 | \n", "r1257 | \n", "4.0 | \n", "0.083333 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "
35 | \n", "35 | \n", "l1385 | \n", "r1160 | \n", "4.0 | \n", "0.271186 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "
394 | \n", "394 | \n", "l1345 | \n", "r85 | \n", "4.0 | \n", "0.338462 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "1 | \n", "1.0 | \n", "
29 | \n", "29 | \n", "l611 | \n", "r141 | \n", "3.0 | \n", "0.277778 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "
181 | \n", "181 | \n", "l1164 | \n", "r1161 | \n", "2.0 | \n", "0.244444 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "1 | \n", "1.0 | \n", "
\n", " | _id | \n", "ltable_id | \n", "rtable_id | \n", "id_id_lev_dist | \n", "authors_authors_lev_sim | \n", "year_year_exm | \n", "year_year_anm | \n", "year_year_lev_dist | \n", "year_year_lev_sim | \n", "label | \n", "predicted_labels | \n", "proba | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
11 | \n", "11 | \n", "l1189 | \n", "r1674 | \n", "4.0 | \n", "0.222222 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0.25 | \n", "
267 | \n", "267 | \n", "l120 | \n", "r1181 | \n", "4.0 | \n", "0.216667 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0.50 | \n", "
\n", " | _id | \n", "ltable_id | \n", "rtable_id | \n", "ltable_title | \n", "ltable_authors | \n", "ltable_year | \n", "rtable_title | \n", "rtable_authors | \n", "rtable_year | \n", "label | \n", "
---|---|---|---|---|---|---|---|---|---|---|
11 | \n", "11 | \n", "l1189 | \n", "r1674 | \n", "Weimin Du, Xiangning Liu, Abdelsalam Helal | \n", "Multiview Access Protocols for Large-Scale Replication | \n", "1998 | \n", "Multiview access protocols for large-scale replication | \n", "Xiangning Liu, Abdelsalam Helal, Weimin Du | \n", "1998 | \n", "1 | \n", "
267 | \n", "267 | \n", "l120 | \n", "r1181 | \n", "w. Bruce kroft, James callan, erik w. Brown | \n", "fast incrremental indexiing for fulltext informtion retreval | \n", "1994 | \n", "Fast Incremental Indexing For Full-Text Information Retrieval | \n", "Eric W. Brown, James P. Callan, W. Bruce Croft | \n", "1994 | \n", "1 | \n", "