{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Real time linkage" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this notebook, we demonstrate splink's incremental and real time linkage capabilities - specifically:\n", "- the `linker.compare_two_records` function, that allows you to interactively explore the results of a linkage model; and\n", "- the `linker.find_matches_to_new_records` that allows you to incrementally find matches to a small number of new records" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 1: Load a pre-trained linkage model" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import json\n", "from splink.duckdb.duckdb_linker import DuckDBLinker\n", "import altair as alt\n", "alt.renderers.enable('mimetype')\n", "\n", "with open(\"demo_settings/real_time_settings.json\") as f:\n", " trained_settings = json.load(f)\n", "\n", "df = pd.read_csv(\"./data/fake_1000.csv\")\n", "\n", "linker = DuckDBLinker(df, trained_settings)\n", "linker._initialise_df_concat_with_tf()\n", "linker.compute_tf_table(\"first_name\")\n", "linker.compute_tf_table(\"surname\")\n", "linker.compute_tf_table(\"dob\")\n", "linker.compute_tf_table(\"city\")\n", "t = linker.compute_tf_table(\"email\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.vegalite.v4+json": { "$schema": "https://vega.github.io/schema/vega-lite/v5.2.0.json", "config": { "view": { "continuousHeight": 300, "continuousWidth": 400 } }, "data": { "values": [ { "bar_sort_order": 0, "bayes_factor": 0.023816582302252427, "bayes_factor_description": null, "column_name": "Prior", "comparison_vector_value": null, "label_for_charts": "Starting match weight (prior)", "log2_bayes_factor": -5.391889789559854, "m_probability": null, "record_number": 0, "sql_condition": null, "term_frequency_adjustment": null, "u_probability": 
null, "value_l": "", "value_r": "" }, { "bar_sort_order": 1, "bayes_factor": 0.21876683164040506, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 4.57 times less likely to be a match", "column_name": "first_name", "comparison_vector_value": 0, "label_for_charts": "All other comparisons", "log2_bayes_factor": -2.1925340745596764, "m_probability": 0.21528549148688833, "record_number": 0, "sql_condition": "ELSE", "term_frequency_adjustment": false, "u_probability": 0.9840865266118626, "value_l": "Rowe", "value_r": "Scott" }, { "bar_sort_order": 2, "bayes_factor": 1, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 4.57 times less likely to be a match", "column_name": "tf_first_name", "comparison_vector_value": 0, "label_for_charts": "All other comparisons", "log2_bayes_factor": 0, "m_probability": 0.21528549148688833, "record_number": 0, "sql_condition": "ELSE", "term_frequency_adjustment": true, "u_probability": 0.9840865266118626, "value_l": "", "value_r": "" }, { "bar_sort_order": 3, "bayes_factor": 92.38584465067325, "bayes_factor_description": "If comparison level is `exact_match` then comparison is 92.39 times more likely to be a match", "column_name": "surname", "comparison_vector_value": 2, "label_for_charts": "exact_match", "log2_bayes_factor": 6.529599913880287, "m_probability": 0.4517645215191846, "record_number": 0, "sql_condition": "surname_l = surname_r", "term_frequency_adjustment": false, "u_probability": 0.004889975550122249, "value_l": "Caleb", "value_r": "Caleb" }, { "bar_sort_order": 4, "bayes_factor": 1.3349633251833741, "bayes_factor_description": "Term frequency adjustment on surname makes comparison 1.33 times more likely to be a match", "column_name": "tf_surname", "comparison_vector_value": 2, "label_for_charts": "Term freq adjustment on surname with weight {cl.tf_adjustment_weight}", "log2_bayes_factor": 0.4168001079781037, "m_probability": null, 
"record_number": 0, "sql_condition": "surname_l = surname_r", "term_frequency_adjustment": true, "u_probability": null, "value_l": "Caleb", "value_r": "Caleb" }, { "bar_sort_order": 5, "bayes_factor": 0.23088978993311773, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 4.33 times less likely to be a match", "column_name": "dob", "comparison_vector_value": 0, "label_for_charts": "All other comparisons", "log2_bayes_factor": -2.114723717091652, "m_probability": 0.22653362300553073, "record_number": 0, "sql_condition": "ELSE", "term_frequency_adjustment": false, "u_probability": 0.9811331331331331, "value_l": "1992-12-20", "value_r": "1990-12-11" }, { "bar_sort_order": 6, "bayes_factor": 1, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 4.33 times less likely to be a match", "column_name": "tf_dob", "comparison_vector_value": 0, "label_for_charts": "All other comparisons", "log2_bayes_factor": 0, "m_probability": 0.22653362300553073, "record_number": 0, "sql_condition": "ELSE", "term_frequency_adjustment": true, "u_probability": 0.9811331331331331, "value_l": "", "value_r": "" }, { "bar_sort_order": 7, "bayes_factor": 1, "bayes_factor_description": "If comparison level is `null` then comparison is 1.00 times more likely to be a match", "column_name": "city", "comparison_vector_value": -1, "label_for_charts": "Null", "log2_bayes_factor": 0, "m_probability": null, "record_number": 0, "sql_condition": "city_l IS NULL OR city_r IS NULL", "term_frequency_adjustment": false, "u_probability": null, "value_l": "Lvpreool", "value_r": "nan" }, { "bar_sort_order": 8, "bayes_factor": 1, "bayes_factor_description": "If comparison level is `null` then comparison is 1.00 times more likely to be a match", "column_name": "tf_city", "comparison_vector_value": -1, "label_for_charts": "Null", "log2_bayes_factor": 0, "m_probability": null, "record_number": 0, "sql_condition": "city_l IS NULL OR 
city_r IS NULL", "term_frequency_adjustment": true, "u_probability": null, "value_l": "", "value_r": "" }, { "bar_sort_order": 9, "bayes_factor": 0.4234380485302649, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 2.36 times less likely to be a match", "column_name": "email", "comparison_vector_value": 0, "label_for_charts": "All other comparisons", "log2_bayes_factor": -1.239777184635766, "m_probability": 0.42250907994219916, "record_number": 0, "sql_condition": "ELSE", "term_frequency_adjustment": false, "u_probability": 0.9978061286856716, "value_l": "calebr@thompson.org", "value_r": "c.scott@brooks.com" }, { "bar_sort_order": 10, "bayes_factor": 1, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 2.36 times less likely to be a match", "column_name": "tf_email", "comparison_vector_value": 0, "label_for_charts": "All other comparisons", "log2_bayes_factor": 0, "m_probability": 0.42250907994219916, "record_number": 0, "sql_condition": "ELSE", "term_frequency_adjustment": true, "u_probability": 0.9978061286856716, "value_l": "", "value_r": "" }, { "bar_sort_order": 11, "bayes_factor": 0.06282468122305176, "bayes_factor_description": null, "column_name": "Final score", "comparison_vector_value": null, "label_for_charts": "Final score", "log2_bayes_factor": -3.9925247439885574, "m_probability": null, "record_number": 0, "sql_condition": null, "term_frequency_adjustment": null, "u_probability": null, "value_l": "", "value_r": "" }, { "bar_sort_order": 0, "bayes_factor": 0.023816582302252427, "bayes_factor_description": null, "column_name": "Prior", "comparison_vector_value": null, "label_for_charts": "Starting match weight (prior)", "log2_bayes_factor": -5.391889789559854, "m_probability": null, "record_number": 1, "sql_condition": null, "term_frequency_adjustment": null, "u_probability": null, "value_l": "", "value_r": "" }, { "bar_sort_order": 1, "bayes_factor": 1, 
"bayes_factor_description": "If comparison level is `null` then comparison is 1.00 times more likely to be a match", "column_name": "first_name", "comparison_vector_value": -1, "label_for_charts": "Null", "log2_bayes_factor": 0, "m_probability": null, "record_number": 1, "sql_condition": "first_name_l IS NULL OR first_name_r IS NULL", "term_frequency_adjustment": false, "u_probability": null, "value_l": "Gabriel", "value_r": "nan" }, { "bar_sort_order": 2, "bayes_factor": 1, "bayes_factor_description": "If comparison level is `null` then comparison is 1.00 times more likely to be a match", "column_name": "tf_first_name", "comparison_vector_value": -1, "label_for_charts": "Null", "log2_bayes_factor": 0, "m_probability": null, "record_number": 1, "sql_condition": "first_name_l IS NULL OR first_name_r IS NULL", "term_frequency_adjustment": true, "u_probability": null, "value_l": "", "value_r": "" }, { "bar_sort_order": 3, "bayes_factor": 92.38584465067325, "bayes_factor_description": "If comparison level is `exact_match` then comparison is 92.39 times more likely to be a match", "column_name": "surname", "comparison_vector_value": 2, "label_for_charts": "exact_match", "log2_bayes_factor": 6.529599913880287, "m_probability": 0.4517645215191846, "record_number": 1, "sql_condition": "surname_l = surname_r", "term_frequency_adjustment": false, "u_probability": 0.004889975550122249, "value_l": "Thomas", "value_r": "Thomas" }, { "bar_sort_order": 4, "bayes_factor": 1.0012224938875305, "bayes_factor_description": "Term frequency adjustment on surname makes comparison 1.00 times more likely to be a match", "column_name": "tf_surname", "comparison_vector_value": 2, "label_for_charts": "Term freq adjustment on surname with weight {cl.tf_adjustment_weight}", "log2_bayes_factor": 0.0017626086992596967, "m_probability": null, "record_number": 1, "sql_condition": "surname_l = surname_r", "term_frequency_adjustment": true, "u_probability": null, "value_l": "Thomas", "value_r": 
"Thomas" }, { "bar_sort_order": 5, "bayes_factor": 0.23088978993311773, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 4.33 times less likely to be a match", "column_name": "dob", "comparison_vector_value": 0, "label_for_charts": "All other comparisons", "log2_bayes_factor": -2.114723717091652, "m_probability": 0.22653362300553073, "record_number": 1, "sql_condition": "ELSE", "term_frequency_adjustment": false, "u_probability": 0.9811331331331331, "value_l": "1977-09-13", "value_r": "1977-10-17" }, { "bar_sort_order": 6, "bayes_factor": 1, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 4.33 times less likely to be a match", "column_name": "tf_dob", "comparison_vector_value": 0, "label_for_charts": "All other comparisons", "log2_bayes_factor": 0, "m_probability": 0.22653362300553073, "record_number": 1, "sql_condition": "ELSE", "term_frequency_adjustment": true, "u_probability": 0.9811331331331331, "value_l": "", "value_r": "" }, { "bar_sort_order": 7, "bayes_factor": 10.484858675056154, "bayes_factor_description": "If comparison level is `exact_match` then comparison is 10.48 times more likely to be a match", "column_name": "city", "comparison_vector_value": 1, "label_for_charts": "exact_match", "log2_bayes_factor": 3.3902355104306197, "m_probability": 0.5782144900964232, "record_number": 1, "sql_condition": "city_l = city_r", "term_frequency_adjustment": false, "u_probability": 0.0551475711801453, "value_l": "London", "value_r": "London" }, { "bar_sort_order": 8, "bayes_factor": 0.2591617073379083, "bayes_factor_description": "Term frequency adjustment on city makes comparison 3.86 times less likely to be a match", "column_name": "tf_city", "comparison_vector_value": 1, "label_for_charts": "Term freq adjustment on city with weight {cl.tf_adjustment_weight}", "log2_bayes_factor": -1.948075527570922, "m_probability": null, "record_number": 1, "sql_condition": "city_l = 
city_r", "term_frequency_adjustment": true, "u_probability": null, "value_l": "London", "value_r": "London" }, { "bar_sort_order": 9, "bayes_factor": 1, "bayes_factor_description": "If comparison level is `null` then comparison is 1.00 times more likely to be a match", "column_name": "email", "comparison_vector_value": -1, "label_for_charts": "Null", "log2_bayes_factor": 0, "m_probability": null, "record_number": 1, "sql_condition": "email_l IS NULL OR email_r IS NULL", "term_frequency_adjustment": false, "u_probability": null, "value_l": "gabriel.t54@nichols.info", "value_r": "nan" }, { "bar_sort_order": 10, "bayes_factor": 1, "bayes_factor_description": "If comparison level is `null` then comparison is 1.00 times more likely to be a match", "column_name": "tf_email", "comparison_vector_value": -1, "label_for_charts": "Null", "log2_bayes_factor": 0, "m_probability": null, "record_number": 1, "sql_condition": "email_l IS NULL OR email_r IS NULL", "term_frequency_adjustment": true, "u_probability": null, "value_l": "", "value_r": "" }, { "bar_sort_order": 11, "bayes_factor": 1.3821450218943667, "bayes_factor_description": null, "column_name": "Final score", "comparison_vector_value": null, "label_for_charts": "Final score", "log2_bayes_factor": 0.4669089987877388, "m_probability": null, "record_number": 1, "sql_condition": null, "term_frequency_adjustment": null, "u_probability": null, "value_l": "", "value_r": "" } ] }, "height": 450, "layer": [ { "layer": [ { "encoding": { "color": { "value": "black" }, "size": { "value": 0.5 }, "y": { "field": "zero", "type": "quantitative" } }, "mark": "rule" }, { "encoding": { "color": { "condition": { "test": "(datum.log2_bayes_factor < 0)", "value": "red" }, "value": "green" }, "opacity": { "condition": { "test": "datum.column_name == 'Prior match weight' || datum.column_name == 'Final score'", "value": 1 }, "value": 0.5 }, "tooltip": [ { "field": "column_name", "title": "Comparison column", "type": "nominal" }, { "field": 
"value_l", "title": "Value (L)", "type": "nominal" }, { "field": "value_r", "title": "Value (R)", "type": "nominal" }, { "field": "label_for_charts", "title": "Label", "type": "ordinal" }, { "field": "sql_condition", "title": "SQL condition", "type": "nominal" }, { "field": "comparison_vector_value", "title": "Comparison vector value", "type": "nominal" }, { "field": "bayes_factor", "format": ",.4f", "title": "Bayes factor = m/u", "type": "quantitative" }, { "field": "log2_bayes_factor", "format": ",.4f", "title": "Match weight = log2(m/u)", "type": "quantitative" }, { "field": "prob", "format": ".4f", "title": "Adjusted match score", "type": "quantitative" }, { "field": "bayes_factor_description", "title": "Match weight description", "type": "nominal" } ], "x": { "axis": { "grid": true, "labelAlign": "center", "labelAngle": -20, "labelExpr": "datum.value == 'Prior' || datum.value == 'Final score' ? '' : datum.value", "labelPadding": 10, "tickBand": "extent", "title": "Column" }, "field": "column_name", "sort": { "field": "bar_sort_order", "order": "ascending" }, "type": "nominal" }, "y": { "axis": { "grid": false, "orient": "left", "title": "log2(Bayes factor)" }, "field": "previous_sum", "type": "quantitative" }, "y2": { "field": "sum" } }, "mark": { "type": "bar", "width": 60 } }, { "encoding": { "color": { "value": "white" }, "text": { "condition": { "field": "log2_bayes_factor", "format": ".2f", "test": "abs(datum.log2_bayes_factor) > 1", "type": "nominal" }, "value": "" }, "x": { "axis": { "labelAngle": 0, "title": "Column" }, "field": "column_name", "sort": { "field": "bar_sort_order", "order": "ascending" }, "type": "nominal" }, "y": { "axis": { "orient": "left" }, "field": "center", "type": "quantitative" } }, "mark": { "fontWeight": "bold", "type": "text" } }, { "encoding": { "color": { "value": "black" }, "text": { "field": "column_name", "type": "nominal" }, "x": { "axis": { "labelAngle": 0, "title": "Column" }, "field": "column_name", "sort": { 
"field": "bar_sort_order", "order": "ascending" }, "type": "nominal" }, "y": { "field": "sum_top", "type": "quantitative" } }, "mark": { "baseline": "bottom", "dy": -25, "fontWeight": "bold", "type": "text" } }, { "encoding": { "color": { "value": "grey" }, "text": { "field": "value_l", "type": "nominal" }, "x": { "axis": { "labelAngle": 0, "title": "Column" }, "field": "column_name", "sort": { "field": "bar_sort_order", "order": "ascending" }, "type": "nominal" }, "y": { "field": "sum_top", "type": "quantitative" } }, "mark": { "baseline": "bottom", "dy": -13, "fontSize": 8, "type": "text" } }, { "encoding": { "color": { "value": "grey" }, "text": { "field": "value_r", "type": "nominal" }, "x": { "axis": { "labelAngle": 0, "title": "Column" }, "field": "column_name", "sort": { "field": "bar_sort_order", "order": "ascending" }, "type": "nominal" }, "y": { "field": "sum_top", "type": "quantitative" } }, "mark": { "baseline": "bottom", "dy": -5, "fontSize": 8, "type": "text" } } ] }, { "encoding": { "x": { "axis": { "labelAngle": 0, "title": "Column" }, "field": "column_name", "sort": { "field": "bar_sort_order", "order": "ascending" }, "type": "nominal" }, "x2": { "field": "lead" }, "y": { "axis": { "labelExpr": "format(1 / (1 + pow(2, -1*datum.value)), '.2r')", "orient": "right", "title": "Probability" }, "field": "sum", "scale": { "zero": false }, "type": "quantitative" } }, "mark": { "color": "black", "strokeWidth": 2, "type": "rule", "x2Offset": 30, "xOffset": -30 } } ], "params": [ { "bind": { "input": "range", "max": 1, "min": 0, "step": 1 }, "description": "Filter by the interation number", "name": "record_number", "value": 0 } ], "resolve": { "axis": { "y": "independent" } }, "title": { "subtitle": "How each comparison contributes to the final match score", "text": "Match weights waterfall chart" }, "transform": [ { "filter": "(datum.record_number == record_number)" }, { "filter": "(datum.bayes_factor !== 1.0)" }, { "frame": [ null, 0 ], "window": [ { "as": 
"sum", "field": "log2_bayes_factor", "op": "sum" }, { "as": "lead", "field": "column_name", "op": "lead" } ] }, { "as": "sum", "calculate": "datum.column_name === \"Final score\" ? datum.sum - datum.log2_bayes_factor : datum.sum" }, { "as": "lead", "calculate": "datum.lead === null ? datum.column_name : datum.lead" }, { "as": "previous_sum", "calculate": "datum.column_name === \"Final score\" || datum.column_name === \"Prior match weight\" ? 0 : datum.sum - datum.log2_bayes_factor" }, { "as": "top_label", "calculate": "datum.sum > datum.previous_sum ? datum.column_name : \"\"" }, { "as": "bottom_label", "calculate": "datum.sum < datum.previous_sum ? datum.column_name : \"\"" }, { "as": "sum_top", "calculate": "datum.sum > datum.previous_sum ? datum.sum : datum.previous_sum" }, { "as": "sum_bottom", "calculate": "datum.sum < datum.previous_sum ? datum.sum : datum.previous_sum" }, { "as": "center", "calculate": "(datum.sum + datum.previous_sum) / 2" }, { "as": "text_log2_bayes_factor", "calculate": "(datum.log2_bayes_factor > 0 ? \"+\" : \"\") + datum.log2_bayes_factor" }, { "as": "dy", "calculate": "datum.sum < datum.previous_sum ? 4 : -4" }, { "as": "baseline", "calculate": "datum.sum < datum.previous_sum ? \"top\" : \"bottom\"" }, { "as": "prob", "calculate": "1. / (1 + pow(2, -1.*datum.sum))" }, { "as": "zero", "calculate": "0*datum.sum" } ], "width": { "step": 75 } }, "image/png": "", "text/plain": [ "\n", "\n", "If you see this message, it means the renderer has not been properly enabled\n", "for the frontend that you are using. 
For more information, see\n", "https://altair-viz.github.io/user_guide/troubleshooting.html\n" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "linker.waterfall_chart(linker.predict().as_record_dict(limit=2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 2: Comparing two records\n", "\n", "It's now possible to compute a match weight for any two records using `linker.compare_two_records()`" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
match_weightmatch_probabilityunique_id_lunique_id_rfirst_name_lfirst_name_rgamma_first_nametf_first_name_ltf_first_name_rbf_first_name...tf_city_rbf_citybf_tf_adj_cityemail_lemail_rgamma_emailtf_email_ltf_email_rbf_emailbf_tf_adj_email
013.1616720.99989112LucasLucas20.0012030.00120387.571229...NaN0.4464041.0lucas.smith@hotmail.comlucas.smith@hotmail.com1NaNNaN263.2291681.0
\n", "

1 rows × 39 columns

\n", "
" ], "text/plain": [ " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n", "0 13.161672 0.999891 1 2 Lucas \n", "\n", " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n", "0 Lucas 2 0.001203 0.001203 \n", "\n", " bf_first_name ... tf_city_r bf_city bf_tf_adj_city \\\n", "0 87.571229 ... NaN 0.446404 1.0 \n", "\n", " email_l email_r gamma_email tf_email_l \\\n", "0 lucas.smith@hotmail.com lucas.smith@hotmail.com 1 NaN \n", "\n", " tf_email_r bf_email bf_tf_adj_email \n", "0 NaN 263.229168 1.0 \n", "\n", "[1 rows x 39 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "record_1 = {\n", " 'unique_id':1,\n", " 'first_name': \"Lucas\",\n", " 'surname': \"Smith\",\n", " 'dob': \"1984-01-02\",\n", " 'city': \"London\",\n", " 'email': \"lucas.smith@hotmail.com\"\n", "}\n", "\n", "record_2 = {\n", " 'unique_id':2,\n", " 'first_name': \"Lucas\",\n", " 'surname': \"Smith\",\n", " 'dob': \"1983-02-12\",\n", " 'city': \"Machester\",\n", " 'email': \"lucas.smith@hotmail.com\"\n", "}\n", "linker._settings_obj_._retain_intermediate_calculation_columns = True\n", "linker._settings_obj_._retain_matching_columns = True\n", "\n", "df_two = linker.compare_two_records(record_1, record_2)\n", "df_two.as_pandas_dataframe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 3: Interactive comparisons\n", "\n", "One interesting application of `compare_two_records` is to create a simple interface that allows the user to input two records interactively, and get real time feedback.\n", "\n", "In the following cell we use `ipywidgets` for this purpose. ✨✨ Change the values in the text boxes to see the waterfall chart update in real time.
✨✨" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c0e1d6c19a0c4a4a9f1204f3448b3b91", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(VBox(children=(Text(value='1', description='unique_id'), Text(value='Lucas', description='first…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f4afb60cf43141b58803c58e9d193342", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import ipywidgets as widgets\n", "fields = [\"unique_id\", \"first_name\",\"surname\",\"dob\",\"email\",\"city\"]\n", "\n", "left_text_boxes = []\n", "right_text_boxes = []\n", "\n", "inputs_to_interactive_output = {}\n", "\n", "for f in fields:\n", " wl = widgets.Text(description=f, value =str(record_1[f]))\n", " left_text_boxes.append(wl)\n", " inputs_to_interactive_output[f\"{f}_l\"] = wl\n", " wr = widgets.Text( description=f, value =str(record_2[f]))\n", " right_text_boxes.append(wr)\n", " inputs_to_interactive_output[f\"{f}_r\"] = wr\n", "\n", "\n", "b1 = widgets.VBox(left_text_boxes)\n", "b2 = widgets.VBox(right_text_boxes)\n", "ui = widgets.HBox([b1,b2])\n", "\n", "def myfn(**kwargs):\n", " my_args = dict(kwargs)\n", " \n", " record_left = {}\n", " record_right = {}\n", " \n", " for key, value in my_args.items():\n", " if value == '':\n", " value = None\n", " if key.endswith(\"_l\"):\n", " record_left[key[:-2]] = value\n", " if key.endswith(\"_r\"):\n", " record_right[key[:-2]] = value\n", " \n", "\n", " linker._settings_obj_._retain_intermediate_calculation_columns = True\n", " linker._settings_obj_._retain_matching_columns = True\n", "\n", " df_two = linker.compare_two_records(record_left, record_right)\n", "\n", " recs = df_two.as_pandas_dataframe().to_dict(orient=\"records\")\n", " from splink.charts import 
waterfall_chart\n", " display(linker.waterfall_chart(recs, filter_nulls=False))\n", "\n", "\n", "out = widgets.interactive_output(myfn, inputs_to_interactive_output)\n", "\n", "display(ui,out)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Finding matching records interactively\n", "\n", "It is also possible to search the records in the input dataset rapidly using the `linker.find_matches_to_new_records()` function" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
match_weightmatch_probabilityunique_id_lunique_id_rfirst_name_lfirst_name_rgamma_first_nametf_first_name_ltf_first_name_rbf_first_name...tf_city_rbf_citybf_tf_adj_cityemail_lemail_rgamma_emailtf_email_ltf_email_rbf_emailbf_tf_adj_email
223.5317931.0000000123987RobertRobert20.0036100.0036187.571229...0.2127921.0000001.000000robert255@smith.netrobert255@smith.net10.0012670.001267263.2291681.730964
314.5503200.9999581123987RobertRobert20.0036100.0036187.571229...0.2127921.0000001.000000roberta25@smith.netrobert255@smith.net00.0025350.0012670.4234381.000000
410.3886230.9992553123987RobertRobert20.0036100.0036187.571229...0.2127920.4464041.000000NaNrobert255@smith.net-1NaN0.0012671.0000001.000000
02.4272560.8432282123987RobRobert00.0012030.003610.218767...0.21279210.4848590.259162roberta25@smith.netrobert255@smith.net00.0025350.0012670.4234381.000000
6-2.1230900.1866978123987NaNRobert-1NaN0.003611.000000...0.2127921.0000001.000000NaNrobert255@smith.net-1NaN0.0012671.0000001.000000
5-2.2058940.178139754123987NaNRobert-1NaN0.003611.000000...0.2127921.0000001.000000j.c@whige.wortrobert255@smith.net00.0012670.0012670.4234381.000000
1-2.8023090.125383750123987NaNRobert-1NaN0.003611.000000...0.21279210.4848590.259162j.c@white.orgrobert255@smith.net00.0025350.0012670.4234381.000000
\n", "

7 rows × 39 columns

\n", "
" ], "text/plain": [ " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n", "2 23.531793 1.000000 0 123987 Robert \n", "3 14.550320 0.999958 1 123987 Robert \n", "4 10.388623 0.999255 3 123987 Robert \n", "0 2.427256 0.843228 2 123987 Rob \n", "6 -2.123090 0.186697 8 123987 NaN \n", "5 -2.205894 0.178139 754 123987 NaN \n", "1 -2.802309 0.125383 750 123987 NaN \n", "\n", " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n", "2 Robert 2 0.003610 0.00361 \n", "3 Robert 2 0.003610 0.00361 \n", "4 Robert 2 0.003610 0.00361 \n", "0 Robert 0 0.001203 0.00361 \n", "6 Robert -1 NaN 0.00361 \n", "5 Robert -1 NaN 0.00361 \n", "1 Robert -1 NaN 0.00361 \n", "\n", " bf_first_name ... tf_city_r bf_city bf_tf_adj_city \\\n", "2 87.571229 ... 0.212792 1.000000 1.000000 \n", "3 87.571229 ... 0.212792 1.000000 1.000000 \n", "4 87.571229 ... 0.212792 0.446404 1.000000 \n", "0 0.218767 ... 0.212792 10.484859 0.259162 \n", "6 1.000000 ... 0.212792 1.000000 1.000000 \n", "5 1.000000 ... 0.212792 1.000000 1.000000 \n", "1 1.000000 ... 
0.212792 10.484859 0.259162 \n", "\n", " email_l email_r gamma_email tf_email_l \\\n", "2 robert255@smith.net robert255@smith.net 1 0.001267 \n", "3 roberta25@smith.net robert255@smith.net 0 0.002535 \n", "4 NaN robert255@smith.net -1 NaN \n", "0 roberta25@smith.net robert255@smith.net 0 0.002535 \n", "6 NaN robert255@smith.net -1 NaN \n", "5 j.c@whige.wort robert255@smith.net 0 0.001267 \n", "1 j.c@white.org robert255@smith.net 0 0.002535 \n", "\n", " tf_email_r bf_email bf_tf_adj_email \n", "2 0.001267 263.229168 1.730964 \n", "3 0.001267 0.423438 1.000000 \n", "4 0.001267 1.000000 1.000000 \n", "0 0.001267 0.423438 1.000000 \n", "6 0.001267 1.000000 1.000000 \n", "5 0.001267 0.423438 1.000000 \n", "1 0.001267 0.423438 1.000000 \n", "\n", "[7 rows x 39 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "record = {'unique_id': 123987,\n", " 'first_name': \"Robert\",\n", " 'surname': \"Alan\",\n", " 'dob': \"1971-05-24\",\n", " 'city': \"London\",\n", " 'email': \"robert255@smith.net\"\n", "}\n", "\n", "\n", "\n", "df_inc = linker.find_matches_to_new_records([record], blocking_rules=[]).as_pandas_dataframe()\n", "df_inc.sort_values(\"match_weight\", ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Interactive interface for finding records\n", "\n", "Again, we can use `ipywidgets` to build an interactive interface for the `linker.find_matches_to_new_records` function" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6887322a0d2d459ca04a2cddeee41da2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(Text(value='Robert', description='first_name'), Text(value='Alan', description='surname'…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from splink.charts import waterfall_chart\n", "\n", "@widgets.interact(first_name='Robert', 
surname=\"Alan\", dob=\"1971-05-24\", city=\"London\", email=\"robert255@smith.net\")\n", "def interactive_link(first_name, surname, dob, city, email): \n", "\n", " record = {'unique_id': 123987,\n", " 'first_name': first_name,\n", " 'surname': surname,\n", " 'dob': dob,\n", " 'city': city,\n", " 'email': email,\n", " 'group': 0}\n", "\n", " for key in record.keys():\n", " if type(record[key]) == str:\n", " if record[key].strip() == \"\":\n", " record[key] = None\n", "\n", " \n", " df_inc = linker.find_matches_to_new_records([record], blocking_rules=[f\"(true)\"]).as_pandas_dataframe()\n", " df_inc = df_inc.sort_values(\"match_weight\", ascending=False)\n", " recs = df_inc.to_dict(orient=\"records\")\n", " \n", "\n", "\n", " display(linker.waterfall_chart(recs, filter_nulls=False))\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.vegalite.v4+json": { "$schema": "https://vega.github.io/schema/vega-lite/v5.2.json", "config": { "header": { "title": null }, "mark": { "tooltip": null }, "title": { "anchor": "middle" }, "view": { "height": 60, "width": 400 } }, "data": { "values": [ { "bayes_factor": 0.023816582302252427, "bayes_factor_description": "The probability that two random records drawn at random match is 0.023 or one in 43.0 records.This is equivalent to a starting match weight of -5.392.", "comparison_name": "probability_two_random_records_match", "comparison_sort_order": -1, "comparison_vector_value": 0, "has_tf_adjustments": false, "is_null_level": false, "label_for_charts": "", "log2_bayes_factor": -5.391889789559854, "m_probability": null, "m_probability_description": null, "max_comparison_vector_value": 0, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": null, "tf_adjustment_column": null, "tf_adjustment_weight": null, "u_probability": null, "u_probability_description": null }, { "bayes_factor": 87.57122888658395, "bayes_factor_description": "If comparison level is 
`exact_match` then comparison is 87.57 times more likely to be a match", "comparison_name": "first_name", "comparison_sort_order": 0, "comparison_vector_value": 2, "has_tf_adjustments": true, "is_null_level": false, "label_for_charts": "exact_match", "log2_bayes_factor": 6.452385051922501, "m_probability": 0.5073501669215337, "m_probability_description": "Amongst matching record comparisons, 50.74% of records are in the exact_match comparison level", "max_comparison_vector_value": 2, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "first_name_l = first_name_r", "tf_adjustment_column": "first_name", "tf_adjustment_weight": 1, "u_probability": 0.0057935713975033705, "u_probability_description": "Amongst non-matching record comparisons, 0.58% of records are in the exact_match comparison level" }, { "bayes_factor": 27.407809072486973, "bayes_factor_description": "If comparison level is `levenstein <= 2` then comparison is 27.41 times more likely to be a match", "comparison_name": "first_name", "comparison_sort_order": 0, "comparison_vector_value": 1, "has_tf_adjustments": false, "is_null_level": false, "label_for_charts": "Levenstein <= 2", "log2_bayes_factor": 4.776515101395072, "m_probability": 0.27736434159157797, "m_probability_description": "Amongst matching record comparisons, 27.74% of records are in the levenstein <= 2 comparison level", "max_comparison_vector_value": 2, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "levenshtein(first_name_l, first_name_r) <= 2", "tf_adjustment_column": null, "tf_adjustment_weight": 1, "u_probability": 0.010119901990634016, "u_probability_description": "Amongst non-matching record comparisons, 1.01% of records are in the levenstein <= 2 comparison level" }, { "bayes_factor": 0.21876683164040506, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 4.57 times less likely to be a match", "comparison_name": "first_name", 
"comparison_sort_order": 0, "comparison_vector_value": 0, "has_tf_adjustments": false, "is_null_level": false, "label_for_charts": "All other comparisons", "log2_bayes_factor": -2.1925340745596764, "m_probability": 0.21528549148688833, "m_probability_description": "Amongst matching record comparisons, 21.53% of records are in the all other comparisons comparison level", "max_comparison_vector_value": 2, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "ELSE", "tf_adjustment_column": null, "tf_adjustment_weight": 1, "u_probability": 0.9840865266118626, "u_probability_description": "Amongst non-matching record comparisons, 98.41% of records are in the all other comparisons comparison level" }, { "bayes_factor": 92.38584465067325, "bayes_factor_description": "If comparison level is `exact_match` then comparison is 92.39 times more likely to be a match", "comparison_name": "surname", "comparison_sort_order": 1, "comparison_vector_value": 2, "has_tf_adjustments": true, "is_null_level": false, "label_for_charts": "exact_match", "log2_bayes_factor": 6.529599913880287, "m_probability": 0.4517645215191846, "m_probability_description": "Amongst matching record comparisons, 45.18% of records are in the exact_match comparison level", "max_comparison_vector_value": 2, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "surname_l = surname_r", "tf_adjustment_column": "surname", "tf_adjustment_weight": 1, "u_probability": 0.004889975550122249, "u_probability_description": "Amongst non-matching record comparisons, 0.49% of records are in the exact_match comparison level" }, { "bayes_factor": 41.74477904659683, "bayes_factor_description": "If comparison level is `levenstein <= 2` then comparison is 41.74 times more likely to be a match", "comparison_name": "surname", "comparison_sort_order": 1, "comparison_vector_value": 1, "has_tf_adjustments": false, "is_null_level": false, "label_for_charts": "Levenstein <= 2", 
"log2_bayes_factor": 5.383523868172574, "m_probability": 0.3078165102205689, "m_probability_description": "Amongst matching record comparisons, 30.78% of records are in the levenstein <= 2 comparison level", "max_comparison_vector_value": 2, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "levenshtein(surname_l, surname_r) <= 2", "tf_adjustment_column": null, "tf_adjustment_weight": 1, "u_probability": 0.007373772654946249, "u_probability_description": "Amongst non-matching record comparisons, 0.74% of records are in the levenstein <= 2 comparison level" }, { "bayes_factor": 0.24340401379756268, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 4.11 times less likely to be a match", "comparison_name": "surname", "comparison_sort_order": 1, "comparison_vector_value": 0, "has_tf_adjustments": false, "is_null_level": false, "label_for_charts": "All other comparisons", "log2_bayes_factor": -2.03857513621085, "m_probability": 0.24041896826024636, "m_probability_description": "Amongst matching record comparisons, 24.04% of records are in the all other comparisons comparison level", "max_comparison_vector_value": 2, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "ELSE", "tf_adjustment_column": null, "tf_adjustment_weight": 1, "u_probability": 0.9877362517949315, "u_probability_description": "Amongst non-matching record comparisons, 98.77% of records are in the all other comparisons comparison level" }, { "bayes_factor": 232.03049287476935, "bayes_factor_description": "If comparison level is `exact_match` then comparison is 232.03 times more likely to be a match", "comparison_name": "dob", "comparison_sort_order": 2, "comparison_vector_value": 2, "has_tf_adjustments": true, "is_null_level": false, "label_for_charts": "exact_match", "log2_bayes_factor": 7.858170603008739, "m_probability": 0.405530771330678, "m_probability_description": "Amongst matching record 
comparisons, 40.55% of records are in the exact_match comparison level", "max_comparison_vector_value": 2, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "dob_l = dob_r", "tf_adjustment_column": "dob", "tf_adjustment_weight": 1, "u_probability": 0.0017477477477477479, "u_probability_description": "Amongst non-matching record comparisons, 0.17% of records are in the exact_match comparison level" }, { "bayes_factor": 21.492671620753597, "bayes_factor_description": "If comparison level is `levenstein <= 2` then comparison is 21.49 times more likely to be a match", "comparison_name": "dob", "comparison_sort_order": 2, "comparison_vector_value": 1, "has_tf_adjustments": false, "is_null_level": false, "label_for_charts": "Levenstein <= 2", "log2_bayes_factor": 4.425772921275592, "m_probability": 0.3679356056637918, "m_probability_description": "Amongst matching record comparisons, 36.79% of records are in the levenstein <= 2 comparison level", "max_comparison_vector_value": 2, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "levenshtein(dob_l, dob_r) <= 2", "tf_adjustment_column": null, "tf_adjustment_weight": 1, "u_probability": 0.01711911911911912, "u_probability_description": "Amongst non-matching record comparisons, 1.71% of records are in the levenstein <= 2 comparison level" }, { "bayes_factor": 0.23088978993311773, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 4.33 times less likely to be a match", "comparison_name": "dob", "comparison_sort_order": 2, "comparison_vector_value": 0, "has_tf_adjustments": false, "is_null_level": false, "label_for_charts": "All other comparisons", "log2_bayes_factor": -2.114723717091652, "m_probability": 0.22653362300553073, "m_probability_description": "Amongst matching record comparisons, 22.65% of records are in the all other comparisons comparison level", "max_comparison_vector_value": 2, 
"probability_two_random_records_match": 0.02326254791526835, "sql_condition": "ELSE", "tf_adjustment_column": null, "tf_adjustment_weight": 1, "u_probability": 0.9811331331331331, "u_probability_description": "Amongst non-matching record comparisons, 98.11% of records are in the all other comparisons comparison level" }, { "bayes_factor": 10.484858675056154, "bayes_factor_description": "If comparison level is `exact_match` then comparison is 10.48 times more likely to be a match", "comparison_name": "city", "comparison_sort_order": 3, "comparison_vector_value": 1, "has_tf_adjustments": true, "is_null_level": false, "label_for_charts": "exact_match", "log2_bayes_factor": 3.3902355104306197, "m_probability": 0.5782144900964232, "m_probability_description": "Amongst matching record comparisons, 57.82% of records are in the exact_match comparison level", "max_comparison_vector_value": 1, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "city_l = city_r", "tf_adjustment_column": "city", "tf_adjustment_weight": 1, "u_probability": 0.0551475711801453, "u_probability_description": "Amongst non-matching record comparisons, 5.51% of records are in the exact_match comparison level" }, { "bayes_factor": 0.4464035832880252, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 2.24 times less likely to be a match", "comparison_name": "city", "comparison_sort_order": 3, "comparison_vector_value": 0, "has_tf_adjustments": false, "is_null_level": false, "label_for_charts": "All other comparisons", "log2_bayes_factor": -1.1635794871398053, "m_probability": 0.4217855099035769, "m_probability_description": "Amongst matching record comparisons, 42.18% of records are in the all other comparisons comparison level", "max_comparison_vector_value": 1, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "ELSE", "tf_adjustment_column": null, "tf_adjustment_weight": 1, "u_probability": 
0.9448524288198547, "u_probability_description": "Amongst non-matching record comparisons, 94.49% of records are in the all other comparisons comparison level" }, { "bayes_factor": 263.2291676754963, "bayes_factor_description": "If comparison level is `exact_match` then comparison is 263.23 times more likely to be a match", "comparison_name": "email", "comparison_sort_order": 4, "comparison_vector_value": 1, "has_tf_adjustments": true, "is_null_level": false, "label_for_charts": "exact_match", "log2_bayes_factor": 8.04017554864013, "m_probability": 0.5774909200578013, "m_probability_description": "Amongst matching record comparisons, 57.75% of records are in the exact_match comparison level", "max_comparison_vector_value": 1, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "email_l = email_r", "tf_adjustment_column": "email", "tf_adjustment_weight": 1, "u_probability": 0.0021938713143283602, "u_probability_description": "Amongst non-matching record comparisons, 0.22% of records are in the exact_match comparison level" }, { "bayes_factor": 0.4234380485302649, "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is 2.36 times less likely to be a match", "comparison_name": "email", "comparison_sort_order": 4, "comparison_vector_value": 0, "has_tf_adjustments": false, "is_null_level": false, "label_for_charts": "All other comparisons", "log2_bayes_factor": -1.239777184635766, "m_probability": 0.42250907994219916, "m_probability_description": "Amongst matching record comparisons, 42.25% of records are in the all other comparisons comparison level", "max_comparison_vector_value": 1, "probability_two_random_records_match": 0.02326254791526835, "sql_condition": "ELSE", "tf_adjustment_column": null, "tf_adjustment_weight": 1, "u_probability": 0.9978061286856716, "u_probability_description": "Amongst non-matching record comparisons, 99.78% of records are in the all other comparisons comparison level" } ] }, 
"resolve": { "axis": { "y": "independent" }, "scale": { "y": "independent" } }, "selection": { "zoom_selector": { "bind": "scales", "encodings": [ "x" ], "type": "interval" } }, "title": { "subtitle": "Use mousewheel to zoom", "text": "Model parameters (components of final match weight)" }, "vconcat": [ { "encoding": { "color": { "field": "log2_bayes_factor", "scale": { "domain": [ -10, 0, 10 ], "range": [ "red", "orange", "green" ] }, "title": "Match weight", "type": "quantitative" }, "tooltip": [ { "field": "comparison_name", "title": "Comparison name", "type": "nominal" }, { "field": "probability_two_random_records_match", "format": ".4f", "title": "Probability two random records match", "type": "nominal" }, { "field": "log2_bayes_factor", "format": ",.4f", "title": "Equivalent match weight", "type": "quantitative" }, { "field": "bayes_factor_description", "title": "Match weight description", "type": "nominal" } ], "x": { "axis": { "domain": false, "labels": false, "ticks": false, "title": "" }, "field": "log2_bayes_factor", "scale": { "domain": [ -10, 10 ] }, "type": "quantitative" }, "y": { "axis": { "title": "Prior (starting) match weight", "titleAlign": "right", "titleAngle": 0, "titleFontWeight": "normal" }, "field": "label_for_charts", "sort": { "field": "comparison_vector_value", "order": "descending" }, "type": "nominal" } }, "height": 30, "mark": { "clip": true, "height": 20, "type": "bar" }, "selection": { "zoom_selector": { "bind": "scales", "encodings": [ "x" ], "type": "interval" } }, "transform": [ { "filter": "(datum.comparison_name == 'probability_two_random_records_match')" } ] }, { "encoding": { "color": { "field": "log2_bayes_factor", "scale": { "domain": [ -10, 0, 10 ], "range": [ "red", "orange", "green" ] }, "title": "Match weight", "type": "quantitative" }, "row": { "field": "comparison_name", "header": { "labelAlign": "left", "labelAnchor": "middle", "labelAngle": 0 }, "sort": { "field": "comparison_sort_order" }, "type": "nominal" }, 
"tooltip": [ { "field": "comparison_name", "title": "Comparison name", "type": "nominal" }, { "field": "label_for_charts", "title": "Label", "type": "ordinal" }, { "field": "sql_condition", "title": "SQL condition", "type": "nominal" }, { "field": "m_probability", "format": ".4f", "title": "M probability", "type": "quantitative" }, { "field": "u_probability", "format": ".4f", "title": "U probability", "type": "quantitative" }, { "field": "bayes_factor", "format": ",.4f", "title": "Bayes factor = m/u", "type": "quantitative" }, { "field": "log2_bayes_factor", "format": ",.4f", "title": "Match weight = log2(m/u)", "type": "quantitative" }, { "field": "bayes_factor_description", "title": "Match weight description", "type": "nominal" } ], "x": { "axis": { "title": "Comparison level match weight = log2(m/u)" }, "field": "log2_bayes_factor", "scale": { "domain": [ -10, 10 ] }, "type": "quantitative" }, "y": { "axis": { "title": null }, "field": "label_for_charts", "sort": { "field": "comparison_vector_value", "order": "descending" }, "type": "nominal" } }, "mark": { "clip": true, "type": "bar" }, "resolve": { "axis": { "y": "independent" }, "scale": { "y": "independent" } }, "selection": { "zoom_selector": { "bind": "scales", "encodings": [ "x" ], "type": "interval" } }, "transform": [ { "filter": "(datum.comparison_name != 'probability_two_random_records_match')" } ] } ] }, "image/png": "", "text/plain": [ "\n", "\n", "If you see this message, it means the renderer has not been properly enabled\n", "for the frontend that you are using. 
For more information, see\n", "https://altair-viz.github.io/user_guide/troubleshooting.html\n" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "linker.match_weights_chart()" ] } ], "metadata": { "kernelspec": { "display_name": "splink_demos", "language": "python", "name": "splink_demos" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }