{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Contents\n", "===\n", " - Introduction\n", " - Removing Features from Feature Table" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This IPython notebook illustrates how to remove features from a feature table.\n", "First, we need to import the py_entitymatching package and other libraries as follows:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/pradap/miniconda3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", " \"This module will be removed in 0.20.\", DeprecationWarning)\n" ] } ], "source": [ "# Import py_entitymatching package\n", "import py_entitymatching as em\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, read the (sample) input tables for blocking purposes:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Get the datasets directory\n", "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n", "\n", "# Get the paths of the input tables\n", "path_A = datasets_dir + os.sep + 'person_table_A.csv'\n", "path_B = datasets_dir + os.sep + 'person_table_B.csv'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read the CSV files and set 'ID' as the key attribute\n", "A = em.read_csv_metadata(path_A, key='ID')\n", "B = em.read_csv_metadata(path_B, key='ID')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {
"collapsed": true }, "outputs": [], "source": [ "# Get features\n", "feature_table = em.get_features_for_blocking(A, B)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing Features from Feature Table" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(feature_table)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
0ID_ID_lev_distIDIDNoneNonelev_dist<function ID_ID_lev_dist at 0x109a7c048>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
1ID_ID_lev_simIDIDNoneNonelev_sim<function ID_ID_lev_sim at 0x11436a158>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
2ID_ID_jarIDIDNoneNonejaro<function ID_ID_jar at 0x11436a1e0>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
3ID_ID_jwnIDIDNoneNonejaro_winkler<function ID_ID_jwn at 0x11436a268>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
4ID_ID_exmIDIDNoneNoneexact_match<function ID_ID_exm at 0x11436a510>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute left_attr_tokenizer \\\n", "0 ID_ID_lev_dist ID ID None \n", "1 ID_ID_lev_sim ID ID None \n", "2 ID_ID_jar ID ID None \n", "3 ID_ID_jwn ID ID None \n", "4 ID_ID_exm ID ID None \n", "\n", " right_attr_tokenizer simfunction \\\n", "0 None lev_dist \n", "1 None lev_sim \n", "2 None jaro \n", "3 None jaro_winkler \n", "4 None exact_match \n", "\n", " function \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "\n", " function_source \\\n", "0 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "1 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "2 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "3 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "4 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "\n", " is_auto_generated \n", "0 True \n", "1 True \n", "2 True \n", "3 True \n", "4 True " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Drop first row\n", "feature_table = feature_table.drop(0)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
1ID_ID_lev_simIDIDNoneNonelev_sim<function ID_ID_lev_sim at 0x11436a158>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
2ID_ID_jarIDIDNoneNonejaro<function ID_ID_jar at 0x11436a1e0>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
3ID_ID_jwnIDIDNoneNonejaro_winkler<function ID_ID_jwn at 0x11436a268>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
4ID_ID_exmIDIDNoneNoneexact_match<function ID_ID_exm at 0x11436a510>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
5ID_ID_jac_qgm_3_qgm_3IDIDqgm_3qgm_3jaccard<function ID_ID_jac_qgm_3_qgm_3 at 0x11436a6a8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute left_attr_tokenizer \\\n", "1 ID_ID_lev_sim ID ID None \n", "2 ID_ID_jar ID ID None \n", "3 ID_ID_jwn ID ID None \n", "4 ID_ID_exm ID ID None \n", "5 ID_ID_jac_qgm_3_qgm_3 ID ID qgm_3 \n", "\n", " right_attr_tokenizer simfunction \\\n", "1 None lev_sim \n", "2 None jaro \n", "3 None jaro_winkler \n", "4 None exact_match \n", "5 qgm_3 jaccard \n", "\n", " function \\\n", "1 \n", "2 \n", "3 \n", "4 \n", "5 \n", "\n", " function_source \\\n", "1 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "2 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "3 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "4 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "5 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "\n", " is_auto_generated \n", "1 True \n", "2 True \n", "3 True \n", "4 True \n", "5 True " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Remove all the features except involving name (Include only the features where the left attribute is name)\n", "feature_table = feature_table[feature_table.left_attribute=='name']" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
6name_name_jac_qgm_3_qgm_3namenameqgm_3qgm_3jaccard<function name_name_jac_qgm_3_qgm_3 at 0x11436a730>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
7name_name_cos_dlm_dc0_dlm_dc0namenamedlm_dc0dlm_dc0cosine<function name_name_cos_dlm_dc0_dlm_dc0 at 0x11436a7b8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
8name_name_jac_dlm_dc0_dlm_dc0namenamedlm_dc0dlm_dc0jaccard<function name_name_jac_dlm_dc0_dlm_dc0 at 0x11436a840>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
9name_name_melnamenameNoneNonemonge_elkan<function name_name_mel at 0x11436a8c8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
10name_name_lev_distnamenameNoneNonelev_dist<function name_name_lev_dist at 0x11436a950>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
11name_name_lev_simnamenameNoneNonelev_sim<function name_name_lev_sim at 0x11436a9d8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
12name_name_nmwnamenameNoneNoneneedleman_wunsch<function name_name_nmw at 0x11436aa60>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
13name_name_swnamenameNoneNonesmith_waterman<function name_name_sw at 0x11436aae8>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute \\\n", "6 name_name_jac_qgm_3_qgm_3 name name \n", "7 name_name_cos_dlm_dc0_dlm_dc0 name name \n", "8 name_name_jac_dlm_dc0_dlm_dc0 name name \n", "9 name_name_mel name name \n", "10 name_name_lev_dist name name \n", "11 name_name_lev_sim name name \n", "12 name_name_nmw name name \n", "13 name_name_sw name name \n", "\n", " left_attr_tokenizer right_attr_tokenizer simfunction \\\n", "6 qgm_3 qgm_3 jaccard \n", "7 dlm_dc0 dlm_dc0 cosine \n", "8 dlm_dc0 dlm_dc0 jaccard \n", "9 None None monge_elkan \n", "10 None None lev_dist \n", "11 None None lev_sim \n", "12 None None needleman_wunsch \n", "13 None None smith_waterman \n", "\n", " function \\\n", "6 \n", "7 \n", "8 \n", "9 \n", "10 \n", "11 \n", "12 \n", "13 \n", "\n", " function_source \\\n", "6 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "7 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "8 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "9 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "10 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "11 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "12 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "13 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... 
\n", "\n", " is_auto_generated \n", "6 True \n", "7 True \n", "8 True \n", "9 True \n", "10 True \n", "11 True \n", "12 True \n", "13 True " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Remove all the features except those involving jaccard (keep only the features where the sim function is jaccard)\n", "feature_table = feature_table[feature_table.simfunction=='jaccard']" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_nameleft_attributeright_attributeleft_attr_tokenizerright_attr_tokenizersimfunctionfunctionfunction_sourceis_auto_generated
6name_name_jac_qgm_3_qgm_3namenameqgm_3qgm_3jaccard<function name_name_jac_qgm_3_qgm_3 at 0x11436a730>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
8name_name_jac_dlm_dc0_dlm_dc0namenamedlm_dc0dlm_dc0jaccard<function name_name_jac_dlm_dc0_dlm_dc0 at 0x11436a840>from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ...True
\n", "
" ], "text/plain": [ " feature_name left_attribute right_attribute \\\n", "6 name_name_jac_qgm_3_qgm_3 name name \n", "8 name_name_jac_dlm_dc0_dlm_dc0 name name \n", "\n", " left_attr_tokenizer right_attr_tokenizer simfunction \\\n", "6 qgm_3 qgm_3 jaccard \n", "8 dlm_dc0 dlm_dc0 jaccard \n", "\n", " function \\\n", "6 \n", "8 \n", "\n", " function_source \\\n", "6 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "8 from py_entitymatching.feature.simfunctions import *\\nfrom py_entitymatching.feature.tokenizers ... \n", "\n", " is_auto_generated \n", "6 True \n", "8 True " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_table" ] } ], "metadata": { "kernelspec": { "display_name": "Python [Root]", "language": "python", "name": "Python [Root]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }