{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This IPython notebook illustrates how to remove features from a feature table.\n", "First, we need to import the py_entitymatching package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import the py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n", "path_B = datasets_dir + os.sep + 'person_table_B.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get features (for blocking)\n", "feature_table = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False)\n", "# Get features (for matching)\n", "# feature_table = em.get_features_for_matching(A, B)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing Features from Feature Table" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(feature_table)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": 
{}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
0ID_ID_lev_distIDIDNoneNonelev_dist<function ID_ID_lev_dist at 0x10b5987b8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
1ID_ID_lev_simIDIDNoneNonelev_sim<function ID_ID_lev_sim at 0x10f9b0620>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
2ID_ID_jarIDIDNoneNonejaro<function ID_ID_jar at 0x10f9b0950>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
3ID_ID_jwnIDIDNoneNonejaro_winkler<function ID_ID_jwn at 0x10f9b09d8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
4ID_ID_exmIDIDNoneNoneexact_match<function ID_ID_exm at 0x10f9b08c8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute left_attr_tokenizer \\\n", "0 ID_ID_lev_dist ID ID None \n", "1 ID_ID_lev_sim ID ID None \n", "2 ID_ID_jar ID ID None \n", "3 ID_ID_jwn ID ID None \n", "4 ID_ID_exm ID ID None \n", "\n", " right_attr_tokenizer simfunction \\\n", "0 None lev_dist \n", "1 None lev_sim \n", "2 None jaro \n", "3 None jaro_winkler \n", "4 None exact_match \n", "\n", " function \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "\n", " function_source \\\n", "0 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "1 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "2 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "3 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "4 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "\n", " is_auto_generated \n", "0 True \n", "1 True \n", "2 True \n", "3 True \n", "4 True " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Drop first row\n", "feature_table = feature_table.drop(0)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
1ID_ID_lev_simIDIDNoneNonelev_sim<function ID_ID_lev_sim at 0x10f9b0620>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
2ID_ID_jarIDIDNoneNonejaro<function ID_ID_jar at 0x10f9b0950>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
3ID_ID_jwnIDIDNoneNonejaro_winkler<function ID_ID_jwn at 0x10f9b09d8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
4ID_ID_exmIDIDNoneNoneexact_match<function ID_ID_exm at 0x10f9b08c8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
5ID_ID_jac_qgm_3_qgm_3IDIDqgm_3qgm_3jaccard<function ID_ID_jac_qgm_3_qgm_3 at 0x10f9b0a60>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute left_attr_tokenizer \\\n", "1 ID_ID_lev_sim ID ID None \n", "2 ID_ID_jar ID ID None \n", "3 ID_ID_jwn ID ID None \n", "4 ID_ID_exm ID ID None \n", "5 ID_ID_jac_qgm_3_qgm_3 ID ID qgm_3 \n", "\n", " right_attr_tokenizer simfunction \\\n", "1 None lev_sim \n", "2 None jaro \n", "3 None jaro_winkler \n", "4 None exact_match \n", "5 qgm_3 jaccard \n", "\n", " function \\\n", "1 \n", "2 \n", "3 \n", "4 \n", "5 \n", "\n", " function_source \\\n", "1 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "2 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "3 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "4 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "5 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "\n", " is_auto_generated \n", "1 True \n", "2 True \n", "3 True \n", "4 True \n", "5 True " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Remove all the features except involving name (Include only the features where the left attribute is name)\n", "feature_table = feature_table[feature_table.left_attribute=='name']" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
6name_name_jac_qgm_3_qgm_3namenameqgm_3qgm_3jaccard<function name_name_jac_qgm_3_qgm_3 at 0x10f9b0ae8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
7name_name_cos_dlm_dc0_dlm_dc0namenamedlm_dc0dlm_dc0cosine<function name_name_cos_dlm_dc0_dlm_dc0 at 0x10f9b0b70>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
8name_name_jac_dlm_dc0_dlm_dc0namenamedlm_dc0dlm_dc0jaccard<function name_name_jac_dlm_dc0_dlm_dc0 at 0x10f9b0bf8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
9name_name_melnamenameNoneNonemonge_elkan<function name_name_mel at 0x10f9b0c80>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
10name_name_lev_distnamenameNoneNonelev_dist<function name_name_lev_dist at 0x10f9b0d08>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
11name_name_lev_simnamenameNoneNonelev_sim<function name_name_lev_sim at 0x10f9b0d90>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
12name_name_nmwnamenameNoneNoneneedleman_wunsch<function name_name_nmw at 0x10f9b0e18>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
13name_name_swnamenameNoneNonesmith_waterman<function name_name_sw at 0x10f9b0ea0>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute \\\n", "6 name_name_jac_qgm_3_qgm_3 name name \n", "7 name_name_cos_dlm_dc0_dlm_dc0 name name \n", "8 name_name_jac_dlm_dc0_dlm_dc0 name name \n", "9 name_name_mel name name \n", "10 name_name_lev_dist name name \n", "11 name_name_lev_sim name name \n", "12 name_name_nmw name name \n", "13 name_name_sw name name \n", "\n", " left_attr_tokenizer right_attr_tokenizer simfunction \\\n", "6 qgm_3 qgm_3 jaccard \n", "7 dlm_dc0 dlm_dc0 cosine \n", "8 dlm_dc0 dlm_dc0 jaccard \n", "9 None None monge_elkan \n", "10 None None lev_dist \n", "11 None None lev_sim \n", "12 None None needleman_wunsch \n", "13 None None smith_waterman \n", "\n", " function \\\n", "6 \n", "7 \n", "8 \n", "9 \n", "10 \n", "11 \n", "12 \n", "13 \n", "\n", " function_source \\\n", "6 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "7 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "8 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "9 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "10 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "11 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "12 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "13 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... 
\n", "\n", " is_auto_generated \n", "6 True \n", "7 True \n", "8 True \n", "9 True \n", "10 True \n", "11 True \n", "12 True \n", "13 True " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Remove all features except those involving jaccard (i.e., keep only the features whose sim function is jaccard)\n", "feature_table = feature_table[feature_table.simfunction=='jaccard']" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
6name_name_jac_qgm_3_qgm_3namenameqgm_3qgm_3jaccard<function name_name_jac_qgm_3_qgm_3 at 0x10f9b0ae8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
8name_name_jac_dlm_dc0_dlm_dc0namenamedlm_dc0dlm_dc0jaccard<function name_name_jac_dlm_dc0_dlm_dc0 at 0x10f9b0bf8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute \\\n", "6 name_name_jac_qgm_3_qgm_3 name name \n", "8 name_name_jac_dlm_dc0_dlm_dc0 name name \n", "\n", " left_attr_tokenizer right_attr_tokenizer simfunction \\\n", "6 qgm_3 qgm_3 jaccard \n", "8 dlm_dc0 dlm_dc0 jaccard \n", "\n", " function \\\n", "6 \n", "8 \n", "\n", " function_source \\\n", "6 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "8 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "\n", " is_auto_generated \n", "6 True \n", "8 True " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 1 }