{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Introduction"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This IPython notebook illustrates how to remove features from a feature table.\n",
"First, we need to import the py_entitymatching package and other libraries as follows:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Import py_entitymatching package\n",
"import py_entitymatching as em\n",
"import os\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Then, read the (sample) input tables A and B, from which the features for blocking are generated:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Datasets directory located under the py_entitymatching install path\n",
"datasets_dir = os.path.join(em.get_install_path(), 'datasets')\n",
"\n",
"# Paths of the two input tables. os.path.join inserts the platform-specific\n",
"# separator, so no manual os.sep concatenation is needed.\n",
"path_A = os.path.join(datasets_dir, 'person_table_A.csv')\n",
"path_B = os.path.join(datasets_dir, 'person_table_B.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Both tables use the same key attribute, so name it once\n",
"key_attribute = 'ID'\n",
"\n",
"# Load each CSV file as a table and register its key attribute\n",
"A = em.read_csv_metadata(path_A, key=key_attribute)\n",
"B = em.read_csv_metadata(path_B, key=key_attribute)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Generate the feature table for blocking from tables A and B.\n",
"# NOTE(review): validate_inferred_attr_types=False presumably skips the check of\n",
"# the automatically inferred attribute types - confirm in the py_entitymatching docs.\n",
"feature_table = em.get_features_for_blocking(\n",
"    A, B, validate_inferred_attr_types=False\n",
")\n",
"\n",
"# To generate features for matching instead, use:\n",
"# feature_table = em.get_features_for_matching(A, B)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Removing Features from Feature Table"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pandas.core.frame.DataFrame"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The feature table is a pandas DataFrame (one row per feature), so features\n",
"# can be removed with ordinary DataFrame operations\n",
"type(feature_table)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" feature_name | \n",
" left_attribute | \n",
" right_attribute | \n",
" left_attr_tokenizer | \n",
" right_attr_tokenizer | \n",
" simfunction | \n",
" function | \n",
" function_source | \n",
" is_auto_generated | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ID_ID_lev_dist | \n",
" ID | \n",
" ID | \n",
" None | \n",
" None | \n",
" lev_dist | \n",
" <function ID_ID_lev_dist at 0x10b5987b8> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 1 | \n",
" ID_ID_lev_sim | \n",
" ID | \n",
" ID | \n",
" None | \n",
" None | \n",
" lev_sim | \n",
" <function ID_ID_lev_sim at 0x10f9b0620> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" ID_ID_jar | \n",
" ID | \n",
" ID | \n",
" None | \n",
" None | \n",
" jaro | \n",
" <function ID_ID_jar at 0x10f9b0950> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 3 | \n",
" ID_ID_jwn | \n",
" ID | \n",
" ID | \n",
" None | \n",
" None | \n",
" jaro_winkler | \n",
" <function ID_ID_jwn at 0x10f9b09d8> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 4 | \n",
" ID_ID_exm | \n",
" ID | \n",
" ID | \n",
" None | \n",
" None | \n",
" exact_match | \n",
" <function ID_ID_exm at 0x10f9b08c8> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" feature_name left_attribute right_attribute left_attr_tokenizer \\\n",
"0 ID_ID_lev_dist ID ID None \n",
"1 ID_ID_lev_sim ID ID None \n",
"2 ID_ID_jar ID ID None \n",
"3 ID_ID_jwn ID ID None \n",
"4 ID_ID_exm ID ID None \n",
"\n",
" right_attr_tokenizer simfunction \\\n",
"0 None lev_dist \n",
"1 None lev_sim \n",
"2 None jaro \n",
"3 None jaro_winkler \n",
"4 None exact_match \n",
"\n",
" function \\\n",
"0 \n",
"1 \n",
"2 \n",
"3 \n",
"4 \n",
"\n",
" function_source \\\n",
"0 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"1 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"2 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"3 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"4 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"\n",
" is_auto_generated \n",
"0 True \n",
"1 True \n",
"2 True \n",
"3 True \n",
"4 True "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_table.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Drop the feature whose index label is 0 (the first row of the freshly generated table).\n",
"# drop() removes rows by label, not by position; errors='ignore' keeps this cell safe\n",
"# to re-run: once label 0 is gone, running it again is a no-op instead of a KeyError.\n",
"feature_table = feature_table.drop(0, errors='ignore')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" feature_name | \n",
" left_attribute | \n",
" right_attribute | \n",
" left_attr_tokenizer | \n",
" right_attr_tokenizer | \n",
" simfunction | \n",
" function | \n",
" function_source | \n",
" is_auto_generated | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" ID_ID_lev_sim | \n",
" ID | \n",
" ID | \n",
" None | \n",
" None | \n",
" lev_sim | \n",
" <function ID_ID_lev_sim at 0x10f9b0620> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" ID_ID_jar | \n",
" ID | \n",
" ID | \n",
" None | \n",
" None | \n",
" jaro | \n",
" <function ID_ID_jar at 0x10f9b0950> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 3 | \n",
" ID_ID_jwn | \n",
" ID | \n",
" ID | \n",
" None | \n",
" None | \n",
" jaro_winkler | \n",
" <function ID_ID_jwn at 0x10f9b09d8> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 4 | \n",
" ID_ID_exm | \n",
" ID | \n",
" ID | \n",
" None | \n",
" None | \n",
" exact_match | \n",
" <function ID_ID_exm at 0x10f9b08c8> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 5 | \n",
" ID_ID_jac_qgm_3_qgm_3 | \n",
" ID | \n",
" ID | \n",
" qgm_3 | \n",
" qgm_3 | \n",
" jaccard | \n",
" <function ID_ID_jac_qgm_3_qgm_3 at 0x10f9b0a60> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" feature_name left_attribute right_attribute left_attr_tokenizer \\\n",
"1 ID_ID_lev_sim ID ID None \n",
"2 ID_ID_jar ID ID None \n",
"3 ID_ID_jwn ID ID None \n",
"4 ID_ID_exm ID ID None \n",
"5 ID_ID_jac_qgm_3_qgm_3 ID ID qgm_3 \n",
"\n",
" right_attr_tokenizer simfunction \\\n",
"1 None lev_sim \n",
"2 None jaro \n",
"3 None jaro_winkler \n",
"4 None exact_match \n",
"5 qgm_3 jaccard \n",
"\n",
" function \\\n",
"1 \n",
"2 \n",
"3 \n",
"4 \n",
"5 \n",
"\n",
" function_source \\\n",
"1 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"2 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"3 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"4 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"5 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"\n",
" is_auto_generated \n",
"1 True \n",
"2 True \n",
"3 True \n",
"4 True \n",
"5 True "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_table.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Keep only the features whose left attribute is 'name'; all other features are removed\n",
"has_name_attribute = feature_table['left_attribute'] == 'name'\n",
"feature_table = feature_table.loc[has_name_attribute]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" feature_name | \n",
" left_attribute | \n",
" right_attribute | \n",
" left_attr_tokenizer | \n",
" right_attr_tokenizer | \n",
" simfunction | \n",
" function | \n",
" function_source | \n",
" is_auto_generated | \n",
"
\n",
" \n",
" \n",
" \n",
" 6 | \n",
" name_name_jac_qgm_3_qgm_3 | \n",
" name | \n",
" name | \n",
" qgm_3 | \n",
" qgm_3 | \n",
" jaccard | \n",
" <function name_name_jac_qgm_3_qgm_3 at 0x10f9b0ae8> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 7 | \n",
" name_name_cos_dlm_dc0_dlm_dc0 | \n",
" name | \n",
" name | \n",
" dlm_dc0 | \n",
" dlm_dc0 | \n",
" cosine | \n",
" <function name_name_cos_dlm_dc0_dlm_dc0 at 0x10f9b0b70> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 8 | \n",
" name_name_jac_dlm_dc0_dlm_dc0 | \n",
" name | \n",
" name | \n",
" dlm_dc0 | \n",
" dlm_dc0 | \n",
" jaccard | \n",
" <function name_name_jac_dlm_dc0_dlm_dc0 at 0x10f9b0bf8> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 9 | \n",
" name_name_mel | \n",
" name | \n",
" name | \n",
" None | \n",
" None | \n",
" monge_elkan | \n",
" <function name_name_mel at 0x10f9b0c80> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 10 | \n",
" name_name_lev_dist | \n",
" name | \n",
" name | \n",
" None | \n",
" None | \n",
" lev_dist | \n",
" <function name_name_lev_dist at 0x10f9b0d08> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 11 | \n",
" name_name_lev_sim | \n",
" name | \n",
" name | \n",
" None | \n",
" None | \n",
" lev_sim | \n",
" <function name_name_lev_sim at 0x10f9b0d90> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 12 | \n",
" name_name_nmw | \n",
" name | \n",
" name | \n",
" None | \n",
" None | \n",
" needleman_wunsch | \n",
" <function name_name_nmw at 0x10f9b0e18> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 13 | \n",
" name_name_sw | \n",
" name | \n",
" name | \n",
" None | \n",
" None | \n",
" smith_waterman | \n",
" <function name_name_sw at 0x10f9b0ea0> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" feature_name left_attribute right_attribute \\\n",
"6 name_name_jac_qgm_3_qgm_3 name name \n",
"7 name_name_cos_dlm_dc0_dlm_dc0 name name \n",
"8 name_name_jac_dlm_dc0_dlm_dc0 name name \n",
"9 name_name_mel name name \n",
"10 name_name_lev_dist name name \n",
"11 name_name_lev_sim name name \n",
"12 name_name_nmw name name \n",
"13 name_name_sw name name \n",
"\n",
" left_attr_tokenizer right_attr_tokenizer simfunction \\\n",
"6 qgm_3 qgm_3 jaccard \n",
"7 dlm_dc0 dlm_dc0 cosine \n",
"8 dlm_dc0 dlm_dc0 jaccard \n",
"9 None None monge_elkan \n",
"10 None None lev_dist \n",
"11 None None lev_sim \n",
"12 None None needleman_wunsch \n",
"13 None None smith_waterman \n",
"\n",
" function \\\n",
"6 \n",
"7 \n",
"8 \n",
"9 \n",
"10 \n",
"11 \n",
"12 \n",
"13 \n",
"\n",
" function_source \\\n",
"6 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"7 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"8 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"9 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"10 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"11 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"12 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"13 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"\n",
" is_auto_generated \n",
"6 True \n",
"7 True \n",
"8 True \n",
"9 True \n",
"10 True \n",
"11 True \n",
"12 True \n",
"13 True "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_table"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Keep only the features whose similarity function is 'jaccard'; all other features are removed\n",
"uses_jaccard = feature_table['simfunction'] == 'jaccard'\n",
"feature_table = feature_table.loc[uses_jaccard]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" feature_name | \n",
" left_attribute | \n",
" right_attribute | \n",
" left_attr_tokenizer | \n",
" right_attr_tokenizer | \n",
" simfunction | \n",
" function | \n",
" function_source | \n",
" is_auto_generated | \n",
"
\n",
" \n",
" \n",
" \n",
" 6 | \n",
" name_name_jac_qgm_3_qgm_3 | \n",
" name | \n",
" name | \n",
" qgm_3 | \n",
" qgm_3 | \n",
" jaccard | \n",
" <function name_name_jac_qgm_3_qgm_3 at 0x10f9b0ae8> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
" 8 | \n",
" name_name_jac_dlm_dc0_dlm_dc0 | \n",
" name | \n",
" name | \n",
" dlm_dc0 | \n",
" dlm_dc0 | \n",
" jaccard | \n",
" <function name_name_jac_dlm_dc0_dlm_dc0 at 0x10f9b0bf8> | \n",
" from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" feature_name left_attribute right_attribute \\\n",
"6 name_name_jac_qgm_3_qgm_3 name name \n",
"8 name_name_jac_dlm_dc0_dlm_dc0 name name \n",
"\n",
" left_attr_tokenizer right_attr_tokenizer simfunction \\\n",
"6 qgm_3 qgm_3 jaccard \n",
"8 dlm_dc0 dlm_dc0 jaccard \n",
"\n",
" function \\\n",
"6 \n",
"8 \n",
"\n",
" function_source \\\n",
"6 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"8 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n",
"\n",
" is_auto_generated \n",
"6 True \n",
"8 True "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_table"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 1
}